unicodeobject.c revision 7f11ad4594f63dec8cd18a16243fb58cf0e9589b
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Limit for the Unicode object free list */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined: the big endian one when WORDS_BIGENDIAN is defined, the
   little endian one otherwise. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
80/* --- Globals ------------------------------------------------------------
81
82   The globals are initialized by the _PyUnicode_Init() API and should
83   not be used before calling that API.
84
85*/
86
87
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used inside the assert()s of the macros in this file.
   Debug builds (Py_DEBUG) run _PyUnicode_CheckConsistency() on the
   object; all other builds only perform PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
97
/* Raw field accessors.  The macros whose names start with an underscore
   and contain no assert() simply cast `op` to the relevant struct and
   name a field; they expand to lvalues and perform no checking. */

/* The utf8 field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII string this
   is the memory located directly after the PyASCIIObject header, for
   every other string it is the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* The utf8_length field of a PyCompactUnicodeObject (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the length field for a compact
   ASCII string, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr, wstr_length, length, state and hash
   fields.  Note that wstr_length lives in PyCompactUnicodeObject while
   the others live in PyASCIIObject. */
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* The length field, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* The data.any pointer of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
126
/* File-local definition of PyUnicode_READY; any earlier definition is
   discarded by the #undef.  After asserting _PyUnicode_CHECK(op), the
   expression evaluates to 0 when the string is already ready and to the
   result of _PyUnicode_Ready() otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 : _PyUnicode_Ready((PyObject *)(op))))
132
/* Variant of the ready check that takes a pointer to an object reference
   (it is cast to PyObject **).  After asserting _PyUnicode_CHECK on the
   referenced object, the expression evaluates to 0 when that string is
   already ready and to the result of _PyUnicode_ReadyReplace(p_obj)
   otherwise.

   The argument is parenthesized before being dereferenced so that any
   pointer expression (e.g. `base + 1`) can be passed safely. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
/* True if the utf8 pointer of `op` is the very same pointer as its
   character data, i.e. no separate UTF-8 buffer exists.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same pointer as its
   character data.  The macro refers only to its own parameter, so it works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its
   utf8 pointer is non-NULL and differs from the character data pointer */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): wstr is non-NULL and either the string
   is not ready yet, or wstr differs from the character data pointer */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
161
/* Element-wise copy between character buffers of different widths.

   from_type / to_type: valid type names of the source and destination
                        elements.
   begin, end:          "from_type *" pointers delimiting the source
                        range [begin, end).
   to:                  destination buffer; it is cast to "to_type *".

   Each source element is cast to to_type and stored; the destination must
   have room for (end - begin) elements. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_ = (to_type)*iter_;                     \
            ++iter_;                                    \
            ++to_;                                      \
        }                                               \
    } while (0)
176
/* The Unicode string has been modified: reset the hash.
   Stores -1 in the hash field of `op`; no type or reference count
   checks are performed here. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
179
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well (one slot per code point 0..255). */
static PyObject *unicode_latin1[256];
196
/* Fast detection of the most frequent whitespace characters.
   128-entry lookup table indexed by code point (0x00..0x7F); an entry of
   1 marks a whitespace character, 0 anything else.  Each row below covers
   eight consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
227
228/* forward */
229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
230static PyObject* get_latin1_char(unsigned char ch);
231
232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
234       PyObject **errorHandler,const char *encoding, const char *reason,
235       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
238static void
239raise_encode_exception(PyObject **exceptionObject,
240                       const char *encoding,
241                       const Py_UNICODE *unicode, Py_ssize_t size,
242                       Py_ssize_t startpos, Py_ssize_t endpos,
243                       const char *reason);
244
/* Same for linebreaks: 128-entry lookup table indexed by code point
   (0x00..0x7F), 1 for line break characters and 0 otherwise.  It is the
   table consulted by BLOOM_LINEBREAK() for characters below 128. */
static unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
272
273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274   This function is kept for backward compatibility with the old API. */
275Py_UNICODE
276PyUnicode_GetMax(void)
277{
278#ifdef Py_UNICODE_WIDE
279    return 0x10FFFF;
280#else
281    /* This is actually an illegal character, so it should
282       not be passed to unichr. */
283    return 0xFFFF;
284#endif
285}
286
#ifdef Py_DEBUG
/* Debug-build validation of the internal state of a unicode object.

   Every invariant is expressed as an assert(); the function itself always
   returns 1 so that it can be used as the operand of another assert().
   Three layouts are distinguished: compact ASCII, other compact strings,
   and non-compact ("legacy") strings, the latter being either not ready
   (wstr only) or ready. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    PyUnicodeObject *unicode;
    unsigned int kind;
    unsigned int wchar_kind;
    void *data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    compact = (PyCompactUnicodeObject *)op;
    unicode = (PyUnicodeObject *)op;
    kind = ascii->state.kind;

    /* The kind whose character width equals sizeof(wchar_t): only strings
       of that kind are expected to share their data with wstr. */
#if SIZEOF_WCHAR_T == 2
    wchar_kind = PyUnicode_2BYTE_KIND;
#else
    wchar_kind = PyUnicode_4BYTE_KIND;
#endif

    /* Compact ASCII string */
    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    /* Compact, non-ASCII string: data follows the compact header */
    if (ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        data = compact + 1;
        assert (compact->utf8 != data);
        if (kind == wchar_kind) {
            assert(ascii->wstr == data);
        }
        else {
            assert(ascii->wstr != data);
        }
        return 1;
    }

    /* Legacy string that has not been made ready yet */
    if (kind == PyUnicode_WCHAR_KIND) {
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(unicode->data.any == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        return 1;
    }

    /* Legacy string that is ready */
    assert(kind == PyUnicode_1BYTE_KIND
           || kind == PyUnicode_2BYTE_KIND
           || kind == PyUnicode_4BYTE_KIND);
    assert(ascii->state.compact == 0);
    assert(ascii->state.ready == 1);
    assert(unicode->data.any != NULL);
    if (ascii->state.ascii) {
        assert (compact->utf8 == unicode->data.any);
    }
    else {
        assert (compact->utf8 != unicode->data.any);
    }
    if (kind == wchar_kind) {
        assert(ascii->wstr == unicode->data.any);
    }
    else {
        assert(ascii->wstr != unicode->data.any);
    }
    return 1;
}
#endif
362
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each unicode character, (ch & (BLOOM_WIDTH - 1)), as the bit
   index. */

/* the linebreak mask is set up by Unicode_Init below */

/* BLOOM_WIDTH is the number of bits used in the mask: the largest of
   128, 64 and 32 that does not exceed LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

#define BLOOM_MASK unsigned long

static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD sets the bit for `ch` in `mask` (mask must be an lvalue);
   BLOOM is non-zero if the bit for `ch` is set in `mask`.  A set bit only
   means the character *may* be a member. */
#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Line break test: characters below 128 are looked up in ascii_linebreak;
   all others must pass the bloom_linebreak pre-filter and then
   Py_UNICODE_ISLINEBREAK(). */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
391
392Py_LOCAL_INLINE(BLOOM_MASK)
393make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
394{
395    /* calculate simple bloom-style bitmask for a given unicode string */
396
397    BLOOM_MASK mask;
398    Py_ssize_t i;
399
400    mask = 0;
401    for (i = 0; i < len; i++)
402        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
403
404    return mask;
405}
406
/* Membership test of character `chr` in string object `str`: the bloom
   `mask` is checked first, and only when its bit is set is the string
   actually searched with PyUnicode_FindChar() over its whole length. */
#define BLOOM_MEMBER(mask, chr, str) \
    (BLOOM(mask, chr) \
     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
410
411/* --- Unicode Object ----------------------------------------------------- */
412
413static PyObject *
414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
415
416Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
417                                 Py_ssize_t size, Py_UCS4 ch,
418                                 int direction)
419{
420    /* like wcschr, but doesn't stop at NULL characters */
421    Py_ssize_t i;
422    if (direction == 1) {
423        for(i = 0; i < size; i++)
424            if (PyUnicode_READ(kind, s, i) == ch)
425                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
426    }
427    else {
428        for(i = size-1; i >= 0; i--)
429            if (PyUnicode_READ(kind, s, i) == ch)
430                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
431    }
432    return NULL;
433}
434
435static PyObject*
436resize_compact(PyObject *unicode, Py_ssize_t length)
437{
438    Py_ssize_t char_size;
439    Py_ssize_t struct_size;
440    Py_ssize_t new_size;
441    int share_wstr;
442
443    assert(PyUnicode_IS_READY(unicode));
444    char_size = PyUnicode_CHARACTER_SIZE(unicode);
445    if (PyUnicode_IS_COMPACT_ASCII(unicode))
446        struct_size = sizeof(PyASCIIObject);
447    else
448        struct_size = sizeof(PyCompactUnicodeObject);
449    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
450
451    _Py_DEC_REFTOTAL;
452    _Py_ForgetReference(unicode);
453
454    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
455        PyErr_NoMemory();
456        return NULL;
457    }
458    new_size = (struct_size + (length + 1) * char_size);
459
460    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
461    if (unicode == NULL) {
462        PyObject_Del(unicode);
463        PyErr_NoMemory();
464        return NULL;
465    }
466    _Py_NewReference(unicode);
467    _PyUnicode_LENGTH(unicode) = length;
468    if (share_wstr) {
469        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
470        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
471            _PyUnicode_WSTR_LENGTH(unicode) = length;
472    }
473    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
474                    length, 0);
475    return unicode;
476}
477
478static int
479resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
480{
481    void *oldstr;
482
483    assert(!PyUnicode_IS_COMPACT(unicode));
484
485    assert(Py_REFCNT(unicode) == 1);
486    _PyUnicode_DIRTY(unicode);
487
488    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
489    {
490        PyObject_DEL(_PyUnicode_UTF8(unicode));
491        _PyUnicode_UTF8(unicode) = NULL;
492    }
493
494    if (PyUnicode_IS_READY(unicode)) {
495        Py_ssize_t char_size;
496        Py_ssize_t new_size;
497        int share_wstr, share_utf8;
498        void *data;
499
500        data = _PyUnicode_DATA_ANY(unicode);
501        assert(data != NULL);
502        char_size = PyUnicode_CHARACTER_SIZE(unicode);
503        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
504        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
505
506        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
507            PyErr_NoMemory();
508            return -1;
509        }
510        new_size = (length + 1) * char_size;
511
512        data = (PyObject *)PyObject_REALLOC(data, new_size);
513        if (data == NULL) {
514            PyErr_NoMemory();
515            return -1;
516        }
517        _PyUnicode_DATA_ANY(unicode) = data;
518        if (share_wstr) {
519            _PyUnicode_WSTR(unicode) = data;
520            _PyUnicode_WSTR_LENGTH(unicode) = length;
521        }
522        if (share_utf8) {
523            _PyUnicode_UTF8(unicode) = data;
524            _PyUnicode_UTF8_LENGTH(unicode) = length;
525        }
526        _PyUnicode_LENGTH(unicode) = length;
527        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
528        if (share_wstr)
529            return 0;
530    }
531    if (_PyUnicode_WSTR(unicode) != NULL) {
532        assert(_PyUnicode_WSTR(unicode) != NULL);
533
534        oldstr = _PyUnicode_WSTR(unicode);
535        _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
536                                         sizeof(Py_UNICODE) * (length + 1));
537        if (!_PyUnicode_WSTR(unicode)) {
538            _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
539            PyErr_NoMemory();
540            return -1;
541        }
542        _PyUnicode_WSTR(unicode)[length] = 0;
543        _PyUnicode_WSTR_LENGTH(unicode) = length;
544    }
545    return 0;
546}
547
548static PyObject*
549resize_copy(PyObject *unicode, Py_ssize_t length)
550{
551    Py_ssize_t copy_length;
552    if (PyUnicode_IS_COMPACT(unicode)) {
553        PyObject *copy;
554        assert(PyUnicode_IS_READY(unicode));
555
556        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
557        if (copy == NULL)
558            return NULL;
559
560        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
561        if (PyUnicode_CopyCharacters(copy, 0,
562                                     unicode, 0,
563                                     copy_length) < 0)
564        {
565            Py_DECREF(copy);
566            return NULL;
567        }
568        return copy;
569    }
570    else {
571        PyUnicodeObject *w;
572        assert(_PyUnicode_WSTR(unicode) != NULL);
573        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
574        w = _PyUnicode_New(length);
575        if (w == NULL)
576            return NULL;
577        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
578        copy_length = Py_MIN(copy_length, length);
579        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
580                        copy_length);
581        return (PyObject*)w;
582    }
583}
584
585/* We allocate one more byte to make sure the string is
586   Ux0000 terminated; some code (e.g. new_identifier)
587   relies on that.
588
589   XXX This allocator could further be enhanced by assuring that the
590   free list never reduces its size below 1.
591
592*/
593
594#ifdef Py_DEBUG
595int unicode_old_new_calls = 0;
596#endif
597
598static PyUnicodeObject *
599_PyUnicode_New(Py_ssize_t length)
600{
601    register PyUnicodeObject *unicode;
602    size_t new_size;
603
604    /* Optimization for empty strings */
605    if (length == 0 && unicode_empty != NULL) {
606        Py_INCREF(unicode_empty);
607        return (PyUnicodeObject*)unicode_empty;
608    }
609
610    /* Ensure we won't overflow the size. */
611    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
612        return (PyUnicodeObject *)PyErr_NoMemory();
613    }
614    if (length < 0) {
615        PyErr_SetString(PyExc_SystemError,
616                        "Negative size passed to _PyUnicode_New");
617        return NULL;
618    }
619
620#ifdef Py_DEBUG
621    ++unicode_old_new_calls;
622#endif
623
624    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
625    if (unicode == NULL)
626        return NULL;
627    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
628    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
629    if (!_PyUnicode_WSTR(unicode)) {
630        PyErr_NoMemory();
631        goto onError;
632    }
633
634    /* Initialize the first element to guard against cases where
635     * the caller fails before initializing str -- unicode_resize()
636     * reads str[0], and the Keep-Alive optimization can keep memory
637     * allocated for str alive across a call to unicode_dealloc(unicode).
638     * We don't want unicode_resize to read uninitialized memory in
639     * that case.
640     */
641    _PyUnicode_WSTR(unicode)[0] = 0;
642    _PyUnicode_WSTR(unicode)[length] = 0;
643    _PyUnicode_WSTR_LENGTH(unicode) = length;
644    _PyUnicode_HASH(unicode) = -1;
645    _PyUnicode_STATE(unicode).interned = 0;
646    _PyUnicode_STATE(unicode).kind = 0;
647    _PyUnicode_STATE(unicode).compact = 0;
648    _PyUnicode_STATE(unicode).ready = 0;
649    _PyUnicode_STATE(unicode).ascii = 0;
650    _PyUnicode_DATA_ANY(unicode) = NULL;
651    _PyUnicode_LENGTH(unicode) = 0;
652    _PyUnicode_UTF8(unicode) = NULL;
653    _PyUnicode_UTF8_LENGTH(unicode) = 0;
654    return unicode;
655
656  onError:
657    /* XXX UNREF/NEWREF interface should be more symmetrical */
658    _Py_DEC_REFTOTAL;
659    _Py_ForgetReference((PyObject *)unicode);
660    PyObject_Del(unicode);
661    return NULL;
662}
663
664static const char*
665unicode_kind_name(PyObject *unicode)
666{
667    /* don't check consistency: unicode_kind_name() is called from
668       _PyUnicode_Dump() */
669    if (!PyUnicode_IS_COMPACT(unicode))
670    {
671        if (!PyUnicode_IS_READY(unicode))
672            return "wstr";
673        switch(PyUnicode_KIND(unicode))
674        {
675        case PyUnicode_1BYTE_KIND:
676            if (PyUnicode_IS_ASCII(unicode))
677                return "legacy ascii";
678            else
679                return "legacy latin1";
680        case PyUnicode_2BYTE_KIND:
681            return "legacy UCS2";
682        case PyUnicode_4BYTE_KIND:
683            return "legacy UCS4";
684        default:
685            return "<legacy invalid kind>";
686        }
687    }
688    assert(PyUnicode_IS_READY(unicode));
689    switch(PyUnicode_KIND(unicode))
690    {
691    case PyUnicode_1BYTE_KIND:
692        if (PyUnicode_IS_ASCII(unicode))
693            return "ascii";
694        else
695            return "latin1";
696    case PyUnicode_2BYTE_KIND:
697        return "UCS2";
698    case PyUnicode_4BYTE_KIND:
699        return "UCS4";
700    default:
701        return "<invalid compact kind>";
702    }
703}
704
705#ifdef Py_DEBUG
/* Debug-build counter, incremented by PyUnicode_New() for every object it
   allocates. */
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}
/* Function form of the PyUnicode_DATA() macro; before returning it prints
   the object address, the compact / compact-ascii flags and the candidate
   data addresses to stdout. */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}
725
726void
727_PyUnicode_Dump(PyObject *op)
728{
729    PyASCIIObject *ascii = (PyASCIIObject *)op;
730    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
731    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
732    void *data;
733    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
734    if (ascii->state.compact)
735        data = (compact + 1);
736    else
737        data = unicode->data.any;
738    if (ascii->wstr == data)
739        printf("shared ");
740    printf("wstr=%p", ascii->wstr);
741    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
742        printf(" (%zu), ", compact->wstr_length);
743        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
744            printf("shared ");
745        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
746    }
747    printf(", data=%p\n", data);
748}
749#endif
750
751PyObject *
752PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
753{
754    PyObject *obj;
755    PyCompactUnicodeObject *unicode;
756    void *data;
757    int kind_state;
758    int is_sharing = 0, is_ascii = 0;
759    Py_ssize_t char_size;
760    Py_ssize_t struct_size;
761
762    /* Optimization for empty strings */
763    if (size == 0 && unicode_empty != NULL) {
764        Py_INCREF(unicode_empty);
765        return unicode_empty;
766    }
767
768#ifdef Py_DEBUG
769    ++unicode_new_new_calls;
770#endif
771
772    struct_size = sizeof(PyCompactUnicodeObject);
773    if (maxchar < 128) {
774        kind_state = PyUnicode_1BYTE_KIND;
775        char_size = 1;
776        is_ascii = 1;
777        struct_size = sizeof(PyASCIIObject);
778    }
779    else if (maxchar < 256) {
780        kind_state = PyUnicode_1BYTE_KIND;
781        char_size = 1;
782    }
783    else if (maxchar < 65536) {
784        kind_state = PyUnicode_2BYTE_KIND;
785        char_size = 2;
786        if (sizeof(wchar_t) == 2)
787            is_sharing = 1;
788    }
789    else {
790        kind_state = PyUnicode_4BYTE_KIND;
791        char_size = 4;
792        if (sizeof(wchar_t) == 4)
793            is_sharing = 1;
794    }
795
796    /* Ensure we won't overflow the size. */
797    if (size < 0) {
798        PyErr_SetString(PyExc_SystemError,
799                        "Negative size passed to PyUnicode_New");
800        return NULL;
801    }
802    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
803        return PyErr_NoMemory();
804
805    /* Duplicated allocation code from _PyObject_New() instead of a call to
806     * PyObject_New() so we are able to allocate space for the object and
807     * it's data buffer.
808     */
809    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
810    if (obj == NULL)
811        return PyErr_NoMemory();
812    obj = PyObject_INIT(obj, &PyUnicode_Type);
813    if (obj == NULL)
814        return NULL;
815
816    unicode = (PyCompactUnicodeObject *)obj;
817    if (is_ascii)
818        data = ((PyASCIIObject*)obj) + 1;
819    else
820        data = unicode + 1;
821    _PyUnicode_LENGTH(unicode) = size;
822    _PyUnicode_HASH(unicode) = -1;
823    _PyUnicode_STATE(unicode).interned = 0;
824    _PyUnicode_STATE(unicode).kind = kind_state;
825    _PyUnicode_STATE(unicode).compact = 1;
826    _PyUnicode_STATE(unicode).ready = 1;
827    _PyUnicode_STATE(unicode).ascii = is_ascii;
828    if (is_ascii) {
829        ((char*)data)[size] = 0;
830        _PyUnicode_WSTR(unicode) = NULL;
831    }
832    else if (kind_state == PyUnicode_1BYTE_KIND) {
833        ((char*)data)[size] = 0;
834        _PyUnicode_WSTR(unicode) = NULL;
835        _PyUnicode_WSTR_LENGTH(unicode) = 0;
836        unicode->utf8_length = 0;
837        unicode->utf8 = NULL;
838        }
839    else {
840        unicode->utf8 = NULL;
841        if (kind_state == PyUnicode_2BYTE_KIND)
842            ((Py_UCS2*)data)[size] = 0;
843        else /* kind_state == PyUnicode_4BYTE_KIND */
844            ((Py_UCS4*)data)[size] = 0;
845        if (is_sharing) {
846            _PyUnicode_WSTR_LENGTH(unicode) = size;
847            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
848        }
849        else {
850            _PyUnicode_WSTR_LENGTH(unicode) = 0;
851            _PyUnicode_WSTR(unicode) = NULL;
852        }
853    }
854    return obj;
855}
856
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4; it
   decodes surrogate pairs.  The other conversions are implemented as
   macros for efficiency.

   begin, end: the wchar_t range [begin, end) to convert.
   unicode:    destination object; must be of 4-byte kind (asserted).

   A high surrogate immediately followed by a low surrogate is combined
   into one code point; every other unit, including a lone surrogate, is
   copied unchanged.  The number of code points produced must equal the
   length of `unicode` (asserted).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src;
            src++;
            continue;
        }
        /* 10 payload bits from each half, offset by 0x10000 */
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
895
896static int
897_PyUnicode_Dirty(PyObject *unicode)
898{
899    assert(_PyUnicode_CHECK(unicode));
900    if (Py_REFCNT(unicode) != 1) {
901        PyErr_SetString(PyExc_ValueError,
902                        "Cannot modify a string having more than 1 reference");
903        return -1;
904    }
905    _PyUnicode_DIRTY(unicode);
906    return 0;
907}
908
909Py_ssize_t
910PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
911                         PyObject *from, Py_ssize_t from_start,
912                         Py_ssize_t how_many)
913{
914    unsigned int from_kind, to_kind;
915    void *from_data, *to_data;
916
917    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
918        PyErr_BadInternalCall();
919        return -1;
920    }
921
922    if (PyUnicode_READY(from))
923        return -1;
924    if (PyUnicode_READY(to))
925        return -1;
926
927    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
928    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
929        PyErr_Format(PyExc_ValueError,
930                     "Cannot write %zi characters at %zi "
931                     "in a string of %zi characters",
932                     how_many, to_start, PyUnicode_GET_LENGTH(to));
933        return -1;
934    }
935    if (how_many == 0)
936        return 0;
937
938    if (_PyUnicode_Dirty(to))
939        return -1;
940
941    from_kind = PyUnicode_KIND(from);
942    from_data = PyUnicode_DATA(from);
943    to_kind = PyUnicode_KIND(to);
944    to_data = PyUnicode_DATA(to);
945
946    if (from_kind == to_kind
947        /* deny latin1 => ascii */
948        && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
949    {
950        Py_MEMCPY((char*)to_data
951                      + PyUnicode_KIND_SIZE(to_kind, to_start),
952                  (char*)from_data
953                      + PyUnicode_KIND_SIZE(from_kind, from_start),
954                  PyUnicode_KIND_SIZE(to_kind, how_many));
955    }
956    else if (from_kind == PyUnicode_1BYTE_KIND
957             && to_kind == PyUnicode_2BYTE_KIND)
958    {
959        _PyUnicode_CONVERT_BYTES(
960            Py_UCS1, Py_UCS2,
961            PyUnicode_1BYTE_DATA(from) + from_start,
962            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
963            PyUnicode_2BYTE_DATA(to) + to_start
964            );
965    }
966    else if (from_kind == PyUnicode_1BYTE_KIND
967             && to_kind == PyUnicode_4BYTE_KIND)
968    {
969        _PyUnicode_CONVERT_BYTES(
970            Py_UCS1, Py_UCS4,
971            PyUnicode_1BYTE_DATA(from) + from_start,
972            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
973            PyUnicode_4BYTE_DATA(to) + to_start
974            );
975    }
976    else if (from_kind == PyUnicode_2BYTE_KIND
977             && to_kind == PyUnicode_4BYTE_KIND)
978    {
979        _PyUnicode_CONVERT_BYTES(
980            Py_UCS2, Py_UCS4,
981            PyUnicode_2BYTE_DATA(from) + from_start,
982            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
983            PyUnicode_4BYTE_DATA(to) + to_start
984            );
985    }
986    else {
987        int invalid_kinds;
988
989        /* check if max_char(from substring) <= max_char(to) */
990        if (from_kind > to_kind
991                /* latin1 => ascii */
992            || (PyUnicode_IS_ASCII(to)
993                && to_kind == PyUnicode_1BYTE_KIND
994                && !PyUnicode_IS_ASCII(from)))
995        {
996            /* slow path to check for character overflow */
997            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
998            Py_UCS4 ch, maxchar;
999            Py_ssize_t i;
1000
1001            maxchar = 0;
1002            invalid_kinds = 0;
1003            for (i=0; i < how_many; i++) {
1004                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1005                if (ch > maxchar) {
1006                    maxchar = ch;
1007                    if (maxchar > to_maxchar) {
1008                        invalid_kinds = 1;
1009                        break;
1010                    }
1011                }
1012                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1013            }
1014        }
1015        else
1016            invalid_kinds = 1;
1017        if (invalid_kinds) {
1018            PyErr_Format(PyExc_ValueError,
1019                         "Cannot copy %s characters "
1020                         "into a string of %s characters",
1021                         unicode_kind_name(from),
1022                         unicode_kind_name(to));
1023            return -1;
1024        }
1025    }
1026    return how_many;
1027}
1028
1029/* Find the maximum code point and count the number of surrogate pairs so a
1030   correct string length can be computed before converting a string to UCS4.
1031   This function counts single surrogates as a character and not as a pair.
1032
1033   Return 0 on success, or -1 on error. */
1034static int
1035find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1036                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1037{
1038    const wchar_t *iter;
1039
1040    assert(num_surrogates != NULL && maxchar != NULL);
1041    if (num_surrogates == NULL || maxchar == NULL) {
1042        PyErr_SetString(PyExc_SystemError,
1043                        "unexpected NULL arguments to "
1044                        "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1045        return -1;
1046    }
1047
1048    *num_surrogates = 0;
1049    *maxchar = 0;
1050
1051    for (iter = begin; iter < end; ) {
1052        if (*iter > *maxchar)
1053            *maxchar = *iter;
1054#if SIZEOF_WCHAR_T == 2
1055        if (*iter >= 0xD800 && *iter <= 0xDBFF
1056            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1057        {
1058            Py_UCS4 surrogate_val;
1059            surrogate_val = (((iter[0] & 0x3FF)<<10)
1060                             | (iter[1] & 0x3FF)) + 0x10000;
1061            ++(*num_surrogates);
1062            if (surrogate_val > *maxchar)
1063                *maxchar = surrogate_val;
1064            iter += 2;
1065        }
1066        else
1067            iter++;
1068#else
1069        iter++;
1070#endif
1071    }
1072    return 0;
1073}
1074
#ifdef Py_DEBUG
/* Debug builds only: incremented once per call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1078
/* Convert a legacy string that only has a wchar_t representation
   (PyUnicode_WCHAR_KIND) into its canonical form: the narrowest of the
   1-, 2- or 4-byte kinds able to hold the largest code point found.

   p_obj   -- address of the string to make ready.
   replace -- if nonzero (and the string has exactly one reference), an
              empty string or a single Latin-1 character is replaced in
              *p_obj by the corresponding shared object.

   Return 0 on success, -1 with an exception set on failure. */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only possible when the caller holds the
       sole reference: debug builds assert this, release builds silently
       fall back to readying the string in place. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single Latin-1 character: hand out the shared object instead.
           NOTE(review): where wchar_t is signed, a negative wstr[0] would
           also satisfy "< 256" -- confirm whether that can occur. */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    /* Determine the widest code point (and, for 16-bit wchar_t, the number
       of surrogate pairs) to pick the storage kind below. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: narrow into a fresh buffer (+1 for the null) and
           release the wchar_t buffer afterwards. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* pure ASCII: the data buffer doubles as the UTF-8 form */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair collapses into a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
1236
1237int
1238_PyUnicode_ReadyReplace(PyObject **op)
1239{
1240    return unicode_ready(op, 1);
1241}
1242
1243int
1244_PyUnicode_Ready(PyObject *op)
1245{
1246    return unicode_ready(&op, 0);
1247}
1248
1249static void
1250unicode_dealloc(register PyUnicodeObject *unicode)
1251{
1252    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1253    case SSTATE_NOT_INTERNED:
1254        break;
1255
1256    case SSTATE_INTERNED_MORTAL:
1257        /* revive dead object temporarily for DelItem */
1258        Py_REFCNT(unicode) = 3;
1259        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1260            Py_FatalError(
1261                "deletion of interned string failed");
1262        break;
1263
1264    case SSTATE_INTERNED_IMMORTAL:
1265        Py_FatalError("Immortal interned string died.");
1266
1267    default:
1268        Py_FatalError("Inconsistent interned string state.");
1269    }
1270
1271    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1272        PyObject_DEL(_PyUnicode_WSTR(unicode));
1273    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1274        PyObject_DEL(_PyUnicode_UTF8(unicode));
1275
1276    if (PyUnicode_IS_COMPACT(unicode)) {
1277        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1278    }
1279    else {
1280        if (_PyUnicode_DATA_ANY(unicode))
1281            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1282        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1283    }
1284}
1285
1286static int
1287unicode_resizable(PyObject *unicode)
1288{
1289    if (Py_REFCNT(unicode) != 1)
1290        return 0;
1291    if (PyUnicode_CHECK_INTERNED(unicode))
1292        return 0;
1293    assert (unicode != unicode_empty);
1294#ifdef Py_DEBUG
1295    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1296        && PyUnicode_GET_LENGTH(unicode) == 1)
1297    {
1298        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1299        if (ch < 256 && unicode_latin1[ch] == unicode)
1300            return 0;
1301    }
1302#endif
1303    return 1;
1304}
1305
1306static int
1307unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1308{
1309    PyObject *unicode;
1310    Py_ssize_t old_length;
1311
1312    assert(p_unicode != NULL);
1313    unicode = *p_unicode;
1314
1315    assert(unicode != NULL);
1316    assert(PyUnicode_Check(unicode));
1317    assert(0 <= length);
1318
1319    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1320        old_length = PyUnicode_WSTR_LENGTH(unicode);
1321    else
1322        old_length = PyUnicode_GET_LENGTH(unicode);
1323    if (old_length == length)
1324        return 0;
1325
1326    if (!unicode_resizable(unicode)) {
1327        PyObject *copy = resize_copy(unicode, length);
1328        if (copy == NULL)
1329            return -1;
1330        Py_DECREF(*p_unicode);
1331        *p_unicode = copy;
1332        return 0;
1333    }
1334
1335    if (PyUnicode_IS_COMPACT(unicode)) {
1336        *p_unicode = resize_compact(unicode, length);
1337        if (*p_unicode == NULL)
1338            return -1;
1339        return 0;
1340    } else
1341        return resize_inplace((PyUnicodeObject*)unicode, length);
1342}
1343
1344int
1345PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1346{
1347    PyObject *unicode;
1348    if (p_unicode == NULL) {
1349        PyErr_BadInternalCall();
1350        return -1;
1351    }
1352    unicode = *p_unicode;
1353    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1354        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1355    {
1356        PyErr_BadInternalCall();
1357        return -1;
1358    }
1359    return unicode_resize(p_unicode, length);
1360}
1361
1362static PyObject*
1363get_latin1_char(unsigned char ch)
1364{
1365    PyObject *unicode = unicode_latin1[ch];
1366    if (!unicode) {
1367        unicode = PyUnicode_New(1, ch);
1368        if (!unicode)
1369            return NULL;
1370        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1371        unicode_latin1[ch] = unicode;
1372    }
1373    Py_INCREF(unicode);
1374    return unicode;
1375}
1376
1377PyObject *
1378PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1379{
1380    PyUnicodeObject *unicode;
1381    Py_UCS4 maxchar = 0;
1382    Py_ssize_t num_surrogates;
1383
1384    if (u == NULL)
1385        return (PyObject*)_PyUnicode_New(size);
1386
1387    /* If the Unicode data is known at construction time, we can apply
1388       some optimizations which share commonly used objects. */
1389
1390    /* Optimization for empty strings */
1391    if (size == 0 && unicode_empty != NULL) {
1392        Py_INCREF(unicode_empty);
1393        return unicode_empty;
1394    }
1395
1396    /* Single character Unicode objects in the Latin-1 range are
1397       shared when using this constructor */
1398    if (size == 1 && *u < 256)
1399        return get_latin1_char((unsigned char)*u);
1400
1401    /* If not empty and not single character, copy the Unicode data
1402       into the new object */
1403    if (find_maxchar_surrogates(u, u + size,
1404                                &maxchar, &num_surrogates) == -1)
1405        return NULL;
1406
1407    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1408                                                maxchar);
1409    if (!unicode)
1410        return NULL;
1411
1412    switch (PyUnicode_KIND(unicode)) {
1413    case PyUnicode_1BYTE_KIND:
1414        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1415                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1416        break;
1417    case PyUnicode_2BYTE_KIND:
1418#if Py_UNICODE_SIZE == 2
1419        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1420#else
1421        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1422                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1423#endif
1424        break;
1425    case PyUnicode_4BYTE_KIND:
1426#if SIZEOF_WCHAR_T == 2
1427        /* This is the only case which has to process surrogates, thus
1428           a simple copy loop is not enough and we need a function. */
1429        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1430#else
1431        assert(num_surrogates == 0);
1432        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1433#endif
1434        break;
1435    default:
1436        assert(0 && "Impossible state");
1437    }
1438
1439    return (PyObject *)unicode;
1440}
1441
1442PyObject *
1443PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1444{
1445    PyUnicodeObject *unicode;
1446
1447    if (size < 0) {
1448        PyErr_SetString(PyExc_SystemError,
1449                        "Negative size passed to PyUnicode_FromStringAndSize");
1450        return NULL;
1451    }
1452
1453    /* If the Unicode data is known at construction time, we can apply
1454       some optimizations which share commonly used objects.
1455       Also, this means the input must be UTF-8, so fall back to the
1456       UTF-8 decoder at the end. */
1457    if (u != NULL) {
1458
1459        /* Optimization for empty strings */
1460        if (size == 0 && unicode_empty != NULL) {
1461            Py_INCREF(unicode_empty);
1462            return unicode_empty;
1463        }
1464
1465        /* Single characters are shared when using this constructor.
1466           Restrict to ASCII, since the input must be UTF-8. */
1467        if (size == 1 && Py_CHARMASK(*u) < 128)
1468            return get_latin1_char(Py_CHARMASK(*u));
1469
1470        return PyUnicode_DecodeUTF8(u, size, NULL);
1471    }
1472
1473    unicode = _PyUnicode_New(size);
1474    if (!unicode)
1475        return NULL;
1476
1477    return (PyObject *)unicode;
1478}
1479
1480PyObject *
1481PyUnicode_FromString(const char *u)
1482{
1483    size_t size = strlen(u);
1484    if (size > PY_SSIZE_T_MAX) {
1485        PyErr_SetString(PyExc_OverflowError, "input too long");
1486        return NULL;
1487    }
1488
1489    return PyUnicode_FromStringAndSize(u, size);
1490}
1491
1492static PyObject*
1493_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1494{
1495    PyObject *res;
1496    unsigned char max = 127;
1497    Py_ssize_t i;
1498    for (i = 0; i < size; i++) {
1499        if (u[i] & 0x80) {
1500            max = 255;
1501            break;
1502        }
1503    }
1504    res = PyUnicode_New(size, max);
1505    if (!res)
1506        return NULL;
1507    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1508    return res;
1509}
1510
1511static PyObject*
1512_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1513{
1514    PyObject *res;
1515    Py_UCS2 max = 0;
1516    Py_ssize_t i;
1517    for (i = 0; i < size; i++)
1518        if (u[i] > max)
1519            max = u[i];
1520    res = PyUnicode_New(size, max);
1521    if (!res)
1522        return NULL;
1523    if (max >= 256)
1524        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1525    else
1526        for (i = 0; i < size; i++)
1527            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1528    return res;
1529}
1530
1531static PyObject*
1532_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1533{
1534    PyObject *res;
1535    Py_UCS4 max = 0;
1536    Py_ssize_t i;
1537    for (i = 0; i < size; i++)
1538        if (u[i] > max)
1539            max = u[i];
1540    res = PyUnicode_New(size, max);
1541    if (!res)
1542        return NULL;
1543    if (max >= 0x10000)
1544        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1545    else {
1546        int kind = PyUnicode_KIND(res);
1547        void *data = PyUnicode_DATA(res);
1548        for (i = 0; i < size; i++)
1549            PyUnicode_WRITE(kind, data, i, u[i]);
1550    }
1551    return res;
1552}
1553
1554PyObject*
1555PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1556{
1557    switch(kind) {
1558    case PyUnicode_1BYTE_KIND:
1559        return _PyUnicode_FromUCS1(buffer, size);
1560    case PyUnicode_2BYTE_KIND:
1561        return _PyUnicode_FromUCS2(buffer, size);
1562    case PyUnicode_4BYTE_KIND:
1563        return _PyUnicode_FromUCS4(buffer, size);
1564    }
1565    PyErr_SetString(PyExc_ValueError, "invalid kind");
1566    return NULL;
1567}
1568
1569PyObject*
1570PyUnicode_Copy(PyObject *unicode)
1571{
1572    Py_ssize_t size;
1573    PyObject *copy;
1574    void *data;
1575
1576    if (!PyUnicode_Check(unicode)) {
1577        PyErr_BadInternalCall();
1578        return NULL;
1579    }
1580    if (PyUnicode_READY(unicode))
1581        return NULL;
1582
1583    size = PyUnicode_GET_LENGTH(unicode);
1584    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1585    if (!copy)
1586        return NULL;
1587    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1588
1589    data = PyUnicode_DATA(unicode);
1590    switch (PyUnicode_KIND(unicode))
1591    {
1592    case PyUnicode_1BYTE_KIND:
1593        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1594        break;
1595    case PyUnicode_2BYTE_KIND:
1596        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1597        break;
1598    case PyUnicode_4BYTE_KIND:
1599        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1600        break;
1601    default:
1602        assert(0);
1603        break;
1604    }
1605    return copy;
1606}
1607
1608
1609/* Widen Unicode objects to larger buffers. Don't write terminating null
1610   character. Return NULL on error. */
1611
1612void*
1613_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1614{
1615    Py_ssize_t len;
1616    void *result;
1617    unsigned int skind;
1618
1619    if (PyUnicode_READY(s))
1620        return NULL;
1621
1622    len = PyUnicode_GET_LENGTH(s);
1623    skind = PyUnicode_KIND(s);
1624    if (skind >= kind) {
1625        PyErr_SetString(PyExc_RuntimeError, "invalid widening attempt");
1626        return NULL;
1627    }
1628    switch(kind) {
1629    case PyUnicode_2BYTE_KIND:
1630        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1631        if (!result)
1632            return PyErr_NoMemory();
1633        assert(skind == PyUnicode_1BYTE_KIND);
1634        _PyUnicode_CONVERT_BYTES(
1635            Py_UCS1, Py_UCS2,
1636            PyUnicode_1BYTE_DATA(s),
1637            PyUnicode_1BYTE_DATA(s) + len,
1638            result);
1639        return result;
1640    case PyUnicode_4BYTE_KIND:
1641        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1642        if (!result)
1643            return PyErr_NoMemory();
1644        if (skind == PyUnicode_2BYTE_KIND) {
1645            _PyUnicode_CONVERT_BYTES(
1646                Py_UCS2, Py_UCS4,
1647                PyUnicode_2BYTE_DATA(s),
1648                PyUnicode_2BYTE_DATA(s) + len,
1649                result);
1650        }
1651        else {
1652            assert(skind == PyUnicode_1BYTE_KIND);
1653            _PyUnicode_CONVERT_BYTES(
1654                Py_UCS1, Py_UCS4,
1655                PyUnicode_1BYTE_DATA(s),
1656                PyUnicode_1BYTE_DATA(s) + len,
1657                result);
1658        }
1659        return result;
1660    default:
1661        break;
1662    }
1663    PyErr_SetString(PyExc_ValueError, "invalid kind");
1664    return NULL;
1665}
1666
1667static Py_UCS4*
1668as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1669        int copy_null)
1670{
1671    int kind;
1672    void *data;
1673    Py_ssize_t len, targetlen;
1674    if (PyUnicode_READY(string) == -1)
1675        return NULL;
1676    kind = PyUnicode_KIND(string);
1677    data = PyUnicode_DATA(string);
1678    len = PyUnicode_GET_LENGTH(string);
1679    targetlen = len;
1680    if (copy_null)
1681        targetlen++;
1682    if (!target) {
1683        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1684            PyErr_NoMemory();
1685            return NULL;
1686        }
1687        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1688        if (!target) {
1689            PyErr_NoMemory();
1690            return NULL;
1691        }
1692    }
1693    else {
1694        if (targetsize < targetlen) {
1695            PyErr_Format(PyExc_SystemError,
1696                         "string is longer than the buffer");
1697            if (copy_null && 0 < targetsize)
1698                target[0] = 0;
1699            return NULL;
1700        }
1701    }
1702    if (kind != PyUnicode_4BYTE_KIND) {
1703        Py_ssize_t i;
1704        for (i = 0; i < len; i++)
1705            target[i] = PyUnicode_READ(kind, data, i);
1706    }
1707    else
1708        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1709    if (copy_null)
1710        target[len] = 0;
1711    return target;
1712}
1713
1714Py_UCS4*
1715PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1716                 int copy_null)
1717{
1718    if (target == NULL || targetsize < 1) {
1719        PyErr_BadInternalCall();
1720        return NULL;
1721    }
1722    return as_ucs4(string, target, targetsize, copy_null);
1723}
1724
1725Py_UCS4*
1726PyUnicode_AsUCS4Copy(PyObject *string)
1727{
1728    return as_ucs4(string, NULL, 0, 1);
1729}
1730
1731#ifdef HAVE_WCHAR_H
1732
1733PyObject *
1734PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1735{
1736    if (w == NULL) {
1737        if (size == 0)
1738            return PyUnicode_New(0, 0);
1739        PyErr_BadInternalCall();
1740        return NULL;
1741    }
1742
1743    if (size == -1) {
1744        size = wcslen(w);
1745    }
1746
1747    return PyUnicode_FromUnicode(w, size);
1748}
1749
1750#endif /* HAVE_WCHAR_H */
1751
1752static void
1753makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1754        int zeropad, int width, int precision, char c)
1755{
1756    *fmt++ = '%';
1757    if (width) {
1758        if (zeropad)
1759            *fmt++ = '0';
1760        fmt += sprintf(fmt, "%d", width);
1761    }
1762    if (precision)
1763        fmt += sprintf(fmt, ".%d", precision);
1764    if (longflag)
1765        *fmt++ = 'l';
1766    else if (longlongflag) {
1767        /* longlongflag should only ever be nonzero on machines with
1768           HAVE_LONG_LONG defined */
1769#ifdef HAVE_LONG_LONG
1770        char *f = PY_FORMAT_LONG_LONG;
1771        while (*f)
1772            *fmt++ = *f++;
1773#else
1774        /* we shouldn't ever get here */
1775        assert(0);
1776        *fmt++ = 'l';
1777#endif
1778    }
1779    else if (size_tflag) {
1780        char *f = PY_FORMAT_SIZE_T;
1781        while (*f)
1782            *fmt++ = *f++;
1783    }
1784    *fmt++ = c;
1785    *fmt = '\0';
1786}
1787
1788/* helper for PyUnicode_FromFormatV() */
1789
1790static const char*
1791parse_format_flags(const char *f,
1792                   int *p_width, int *p_precision,
1793                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
1794{
1795    int width, precision, longflag, longlongflag, size_tflag;
1796
1797    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
1798    f++;
1799    width = 0;
1800    while (Py_ISDIGIT((unsigned)*f))
1801        width = (width*10) + *f++ - '0';
1802    precision = 0;
1803    if (*f == '.') {
1804        f++;
1805        while (Py_ISDIGIT((unsigned)*f))
1806            precision = (precision*10) + *f++ - '0';
1807        if (*f == '%') {
1808            /* "%.3%s" => f points to "3" */
1809            f--;
1810        }
1811    }
1812    if (*f == '\0') {
1813        /* bogus format "%.1" => go backward, f points to "1" */
1814        f--;
1815    }
1816    if (p_width != NULL)
1817        *p_width = width;
1818    if (p_precision != NULL)
1819        *p_precision = precision;
1820
1821    /* Handle %ld, %lu, %lld and %llu. */
1822    longflag = 0;
1823    longlongflag = 0;
1824    size_tflag = 0;
1825
1826    if (*f == 'l') {
1827        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
1828            longflag = 1;
1829            ++f;
1830        }
1831#ifdef HAVE_LONG_LONG
1832        else if (f[1] == 'l' &&
1833                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
1834            longlongflag = 1;
1835            f += 2;
1836        }
1837#endif
1838    }
1839    /* handle the size_t flag. */
1840    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
1841        size_tflag = 1;
1842        ++f;
1843    }
1844    if (p_longflag != NULL)
1845        *p_longflag = longflag;
1846    if (p_longlongflag != NULL)
1847        *p_longlongflag = longlongflag;
1848    if (p_size_tflag != NULL)
1849        *p_size_tflag = size_tflag;
1850    return f;
1851}
1852
1853/* maximum number of characters required for output of %ld.  21 characters
1854   allows for 64-bit integers (in decimal) and an optional sign. */
1855#define MAX_LONG_CHARS 21
1856/* maximum number of characters required for output of %lld.
1857   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1858   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
1859#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1860
1861PyObject *
1862PyUnicode_FromFormatV(const char *format, va_list vargs)
1863{
1864    va_list count;
1865    Py_ssize_t callcount = 0;
1866    PyObject **callresults = NULL;
1867    PyObject **callresult = NULL;
1868    Py_ssize_t n = 0;
1869    int width = 0;
1870    int precision = 0;
1871    int zeropad;
1872    const char* f;
1873    PyUnicodeObject *string;
1874    /* used by sprintf */
1875    char fmt[61]; /* should be enough for %0width.precisionlld */
1876    Py_UCS4 maxchar = 127; /* result is ASCII by default */
1877    Py_UCS4 argmaxchar;
1878    Py_ssize_t numbersize = 0;
1879    char *numberresults = NULL;
1880    char *numberresult = NULL;
1881    Py_ssize_t i;
1882    int kind;
1883    void *data;
1884
1885    Py_VA_COPY(count, vargs);
1886    /* step 1: count the number of %S/%R/%A/%s format specifications
1887     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1888     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
1889     * result in an array)
1890     * also esimate a upper bound for all the number formats in the string,
1891     * numbers will be formated in step 3 and be keept in a '\0'-separated
1892     * buffer before putting everything together. */
1893    for (f = format; *f; f++) {
1894        if (*f == '%') {
1895            int longlongflag;
1896            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1897            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1898            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1899                ++callcount;
1900
1901            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
1902#ifdef HAVE_LONG_LONG
1903                if (longlongflag) {
1904                    if (width < MAX_LONG_LONG_CHARS)
1905                        width = MAX_LONG_LONG_CHARS;
1906                }
1907                else
1908#endif
1909                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1910                       including sign.  Decimal takes the most space.  This
1911                       isn't enough for octal.  If a width is specified we
1912                       need more (which we allocate later). */
1913                    if (width < MAX_LONG_CHARS)
1914                        width = MAX_LONG_CHARS;
1915
1916                /* account for the size + '\0' to separate numbers
1917                   inside of the numberresults buffer */
1918                numbersize += (width + 1);
1919            }
1920        }
1921        else if ((unsigned char)*f > 127) {
1922            PyErr_Format(PyExc_ValueError,
1923                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1924                "string, got a non-ASCII byte: 0x%02x",
1925                (unsigned char)*f);
1926            return NULL;
1927        }
1928    }
1929    /* step 2: allocate memory for the results of
1930     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1931    if (callcount) {
1932        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1933        if (!callresults) {
1934            PyErr_NoMemory();
1935            return NULL;
1936        }
1937        callresult = callresults;
1938    }
1939    /* step 2.5: allocate memory for the results of formating numbers */
1940    if (numbersize) {
1941        numberresults = PyObject_Malloc(numbersize);
1942        if (!numberresults) {
1943            PyErr_NoMemory();
1944            goto fail;
1945        }
1946        numberresult = numberresults;
1947    }
1948
1949    /* step 3: format numbers and figure out how large a buffer we need */
1950    for (f = format; *f; f++) {
1951        if (*f == '%') {
1952            const char* p;
1953            int longflag;
1954            int longlongflag;
1955            int size_tflag;
1956            int numprinted;
1957
1958            p = f;
1959            zeropad = (f[1] == '0');
1960            f = parse_format_flags(f, &width, &precision,
1961                                   &longflag, &longlongflag, &size_tflag);
1962            switch (*f) {
1963            case 'c':
1964            {
1965                Py_UCS4 ordinal = va_arg(count, int);
1966                maxchar = Py_MAX(maxchar, ordinal);
1967                n++;
1968                break;
1969            }
1970            case '%':
1971                n++;
1972                break;
1973            case 'i':
1974            case 'd':
1975                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1976                        width, precision, *f);
1977                if (longflag)
1978                    numprinted = sprintf(numberresult, fmt,
1979                                         va_arg(count, long));
1980#ifdef HAVE_LONG_LONG
1981                else if (longlongflag)
1982                    numprinted = sprintf(numberresult, fmt,
1983                                         va_arg(count, PY_LONG_LONG));
1984#endif
1985                else if (size_tflag)
1986                    numprinted = sprintf(numberresult, fmt,
1987                                         va_arg(count, Py_ssize_t));
1988                else
1989                    numprinted = sprintf(numberresult, fmt,
1990                                         va_arg(count, int));
1991                n += numprinted;
1992                /* advance by +1 to skip over the '\0' */
1993                numberresult += (numprinted + 1);
1994                assert(*(numberresult - 1) == '\0');
1995                assert(*(numberresult - 2) != '\0');
1996                assert(numprinted >= 0);
1997                assert(numberresult <= numberresults + numbersize);
1998                break;
1999            case 'u':
2000                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2001                        width, precision, 'u');
2002                if (longflag)
2003                    numprinted = sprintf(numberresult, fmt,
2004                                         va_arg(count, unsigned long));
2005#ifdef HAVE_LONG_LONG
2006                else if (longlongflag)
2007                    numprinted = sprintf(numberresult, fmt,
2008                                         va_arg(count, unsigned PY_LONG_LONG));
2009#endif
2010                else if (size_tflag)
2011                    numprinted = sprintf(numberresult, fmt,
2012                                         va_arg(count, size_t));
2013                else
2014                    numprinted = sprintf(numberresult, fmt,
2015                                         va_arg(count, unsigned int));
2016                n += numprinted;
2017                numberresult += (numprinted + 1);
2018                assert(*(numberresult - 1) == '\0');
2019                assert(*(numberresult - 2) != '\0');
2020                assert(numprinted >= 0);
2021                assert(numberresult <= numberresults + numbersize);
2022                break;
2023            case 'x':
2024                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2025                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2026                n += numprinted;
2027                numberresult += (numprinted + 1);
2028                assert(*(numberresult - 1) == '\0');
2029                assert(*(numberresult - 2) != '\0');
2030                assert(numprinted >= 0);
2031                assert(numberresult <= numberresults + numbersize);
2032                break;
2033            case 'p':
2034                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2035                /* %p is ill-defined:  ensure leading 0x. */
2036                if (numberresult[1] == 'X')
2037                    numberresult[1] = 'x';
2038                else if (numberresult[1] != 'x') {
2039                    memmove(numberresult + 2, numberresult,
2040                            strlen(numberresult) + 1);
2041                    numberresult[0] = '0';
2042                    numberresult[1] = 'x';
2043                    numprinted += 2;
2044                }
2045                n += numprinted;
2046                numberresult += (numprinted + 1);
2047                assert(*(numberresult - 1) == '\0');
2048                assert(*(numberresult - 2) != '\0');
2049                assert(numprinted >= 0);
2050                assert(numberresult <= numberresults + numbersize);
2051                break;
2052            case 's':
2053            {
2054                /* UTF-8 */
2055                const char *s = va_arg(count, const char*);
2056                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2057                if (!str)
2058                    goto fail;
2059                /* since PyUnicode_DecodeUTF8 returns already flexible
2060                   unicode objects, there is no need to call ready on them */
2061                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2062                maxchar = Py_MAX(maxchar, argmaxchar);
2063                n += PyUnicode_GET_LENGTH(str);
2064                /* Remember the str and switch to the next slot */
2065                *callresult++ = str;
2066                break;
2067            }
2068            case 'U':
2069            {
2070                PyObject *obj = va_arg(count, PyObject *);
2071                assert(obj && _PyUnicode_CHECK(obj));
2072                if (PyUnicode_READY(obj) == -1)
2073                    goto fail;
2074                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2075                maxchar = Py_MAX(maxchar, argmaxchar);
2076                n += PyUnicode_GET_LENGTH(obj);
2077                break;
2078            }
2079            case 'V':
2080            {
2081                PyObject *obj = va_arg(count, PyObject *);
2082                const char *str = va_arg(count, const char *);
2083                PyObject *str_obj;
2084                assert(obj || str);
2085                assert(!obj || _PyUnicode_CHECK(obj));
2086                if (obj) {
2087                    if (PyUnicode_READY(obj) == -1)
2088                        goto fail;
2089                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2090                    maxchar = Py_MAX(maxchar, argmaxchar);
2091                    n += PyUnicode_GET_LENGTH(obj);
2092                    *callresult++ = NULL;
2093                }
2094                else {
2095                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2096                    if (!str_obj)
2097                        goto fail;
2098                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2099                    maxchar = Py_MAX(maxchar, argmaxchar);
2100                    n += PyUnicode_GET_LENGTH(str_obj);
2101                    *callresult++ = str_obj;
2102                }
2103                break;
2104            }
2105            case 'S':
2106            {
2107                PyObject *obj = va_arg(count, PyObject *);
2108                PyObject *str;
2109                assert(obj);
2110                str = PyObject_Str(obj);
2111                if (!str || PyUnicode_READY(str) == -1)
2112                    goto fail;
2113                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2114                maxchar = Py_MAX(maxchar, argmaxchar);
2115                n += PyUnicode_GET_LENGTH(str);
2116                /* Remember the str and switch to the next slot */
2117                *callresult++ = str;
2118                break;
2119            }
2120            case 'R':
2121            {
2122                PyObject *obj = va_arg(count, PyObject *);
2123                PyObject *repr;
2124                assert(obj);
2125                repr = PyObject_Repr(obj);
2126                if (!repr || PyUnicode_READY(repr) == -1)
2127                    goto fail;
2128                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2129                maxchar = Py_MAX(maxchar, argmaxchar);
2130                n += PyUnicode_GET_LENGTH(repr);
2131                /* Remember the repr and switch to the next slot */
2132                *callresult++ = repr;
2133                break;
2134            }
2135            case 'A':
2136            {
2137                PyObject *obj = va_arg(count, PyObject *);
2138                PyObject *ascii;
2139                assert(obj);
2140                ascii = PyObject_ASCII(obj);
2141                if (!ascii || PyUnicode_READY(ascii) == -1)
2142                    goto fail;
2143                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2144                maxchar = Py_MAX(maxchar, argmaxchar);
2145                n += PyUnicode_GET_LENGTH(ascii);
2146                /* Remember the repr and switch to the next slot */
2147                *callresult++ = ascii;
2148                break;
2149            }
2150            default:
2151                /* if we stumble upon an unknown
2152                   formatting code, copy the rest of
2153                   the format string to the output
2154                   string. (we cannot just skip the
2155                   code, since there's no way to know
2156                   what's in the argument list) */
2157                n += strlen(p);
2158                goto expand;
2159            }
2160        } else
2161            n++;
2162    }
2163  expand:
2164    /* step 4: fill the buffer */
2165    /* Since we've analyzed how much space we need,
2166       we don't have to resize the string.
2167       There can be no errors beyond this point. */
2168    string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
2169    if (!string)
2170        goto fail;
2171    kind = PyUnicode_KIND(string);
2172    data = PyUnicode_DATA(string);
2173    callresult = callresults;
2174    numberresult = numberresults;
2175
2176    for (i = 0, f = format; *f; f++) {
2177        if (*f == '%') {
2178            const char* p;
2179
2180            p = f;
2181            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2182            /* checking for == because the last argument could be a empty
2183               string, which causes i to point to end, the assert at the end of
2184               the loop */
2185            assert(i <= PyUnicode_GET_LENGTH(string));
2186
2187            switch (*f) {
2188            case 'c':
2189            {
2190                const int ordinal = va_arg(vargs, int);
2191                PyUnicode_WRITE(kind, data, i++, ordinal);
2192                break;
2193            }
2194            case 'i':
2195            case 'd':
2196            case 'u':
2197            case 'x':
2198            case 'p':
2199                /* unused, since we already have the result */
2200                if (*f == 'p')
2201                    (void) va_arg(vargs, void *);
2202                else
2203                    (void) va_arg(vargs, int);
2204                /* extract the result from numberresults and append. */
2205                for (; *numberresult; ++i, ++numberresult)
2206                    PyUnicode_WRITE(kind, data, i, *numberresult);
2207                /* skip over the separating '\0' */
2208                assert(*numberresult == '\0');
2209                numberresult++;
2210                assert(numberresult <= numberresults + numbersize);
2211                break;
2212            case 's':
2213            {
2214                /* unused, since we already have the result */
2215                Py_ssize_t size;
2216                (void) va_arg(vargs, char *);
2217                size = PyUnicode_GET_LENGTH(*callresult);
2218                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2219                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2220                                             *callresult, 0,
2221                                             size) < 0)
2222                    goto fail;
2223                i += size;
2224                /* We're done with the unicode()/repr() => forget it */
2225                Py_DECREF(*callresult);
2226                /* switch to next unicode()/repr() result */
2227                ++callresult;
2228                break;
2229            }
2230            case 'U':
2231            {
2232                PyObject *obj = va_arg(vargs, PyObject *);
2233                Py_ssize_t size;
2234                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2235                size = PyUnicode_GET_LENGTH(obj);
2236                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2237                                             obj, 0,
2238                                             size) < 0)
2239                    goto fail;
2240                i += size;
2241                break;
2242            }
2243            case 'V':
2244            {
2245                Py_ssize_t size;
2246                PyObject *obj = va_arg(vargs, PyObject *);
2247                va_arg(vargs, const char *);
2248                if (obj) {
2249                    size = PyUnicode_GET_LENGTH(obj);
2250                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2251                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2252                                                 obj, 0,
2253                                                 size) < 0)
2254                        goto fail;
2255                    i += size;
2256                } else {
2257                    size = PyUnicode_GET_LENGTH(*callresult);
2258                    assert(PyUnicode_KIND(*callresult) <=
2259                           PyUnicode_KIND(string));
2260                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2261                                                 *callresult,
2262                                                 0, size) < 0)
2263                        goto fail;
2264                    i += size;
2265                    Py_DECREF(*callresult);
2266                }
2267                ++callresult;
2268                break;
2269            }
2270            case 'S':
2271            case 'R':
2272            case 'A':
2273            {
2274                /* unused, since we already have the result */
2275                (void) va_arg(vargs, PyObject *);
2276                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2277                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2278                                             *callresult, 0,
2279                                             PyUnicode_GET_LENGTH(*callresult)) < 0)
2280                    goto fail;
2281                i += PyUnicode_GET_LENGTH(*callresult);
2282                /* We're done with the unicode()/repr() => forget it */
2283                Py_DECREF(*callresult);
2284                /* switch to next unicode()/repr() result */
2285                ++callresult;
2286                break;
2287            }
2288            case '%':
2289                PyUnicode_WRITE(kind, data, i++, '%');
2290                break;
2291            default:
2292                for (; *p; ++p, ++i)
2293                    PyUnicode_WRITE(kind, data, i, *p);
2294                assert(i == PyUnicode_GET_LENGTH(string));
2295                goto end;
2296            }
2297        }
2298        else {
2299            assert(i < PyUnicode_GET_LENGTH(string));
2300            PyUnicode_WRITE(kind, data, i++, *f);
2301        }
2302    }
2303    assert(i == PyUnicode_GET_LENGTH(string));
2304
2305  end:
2306    if (callresults)
2307        PyObject_Free(callresults);
2308    if (numberresults)
2309        PyObject_Free(numberresults);
2310    return (PyObject *)string;
2311  fail:
2312    if (callresults) {
2313        PyObject **callresult2 = callresults;
2314        while (callresult2 < callresult) {
2315            Py_XDECREF(*callresult2);
2316            ++callresult2;
2317        }
2318        PyObject_Free(callresults);
2319    }
2320    if (numberresults)
2321        PyObject_Free(numberresults);
2322    return NULL;
2323}
2324
2325PyObject *
2326PyUnicode_FromFormat(const char *format, ...)
2327{
2328    PyObject* ret;
2329    va_list vargs;
2330
2331#ifdef HAVE_STDARG_PROTOTYPES
2332    va_start(vargs, format);
2333#else
2334    va_start(vargs);
2335#endif
2336    ret = PyUnicode_FromFormatV(format, vargs);
2337    va_end(vargs);
2338    return ret;
2339}
2340
2341#ifdef HAVE_WCHAR_H
2342
2343/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2344   convert a Unicode object to a wide character string.
2345
2346   - If w is NULL: return the number of wide characters (including the null
2347     character) required to convert the unicode object. Ignore size argument.
2348
2349   - Otherwise: return the number of wide characters (excluding the null
2350     character) written into w. Write at most size wide characters (including
2351     the null character). */
2352static Py_ssize_t
2353unicode_aswidechar(PyUnicodeObject *unicode,
2354                   wchar_t *w,
2355                   Py_ssize_t size)
2356{
2357    Py_ssize_t res;
2358    const wchar_t *wstr;
2359
2360    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2361    if (wstr == NULL)
2362        return -1;
2363
2364    if (w != NULL) {
2365        if (size > res)
2366            size = res + 1;
2367        else
2368            res = size;
2369        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2370        return res;
2371    }
2372    else
2373        return res + 1;
2374}
2375
2376Py_ssize_t
2377PyUnicode_AsWideChar(PyObject *unicode,
2378                     wchar_t *w,
2379                     Py_ssize_t size)
2380{
2381    if (unicode == NULL) {
2382        PyErr_BadInternalCall();
2383        return -1;
2384    }
2385    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2386}
2387
2388wchar_t*
2389PyUnicode_AsWideCharString(PyObject *unicode,
2390                           Py_ssize_t *size)
2391{
2392    wchar_t* buffer;
2393    Py_ssize_t buflen;
2394
2395    if (unicode == NULL) {
2396        PyErr_BadInternalCall();
2397        return NULL;
2398    }
2399
2400    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2401    if (buflen == -1)
2402        return NULL;
2403    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2404        PyErr_NoMemory();
2405        return NULL;
2406    }
2407
2408    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2409    if (buffer == NULL) {
2410        PyErr_NoMemory();
2411        return NULL;
2412    }
2413    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2414    if (buflen == -1)
2415        return NULL;
2416    if (size != NULL)
2417        *size = buflen;
2418    return buffer;
2419}
2420
2421#endif /* HAVE_WCHAR_H */
2422
2423PyObject *
2424PyUnicode_FromOrdinal(int ordinal)
2425{
2426    PyObject *v;
2427    if (ordinal < 0 || ordinal > 0x10ffff) {
2428        PyErr_SetString(PyExc_ValueError,
2429                        "chr() arg not in range(0x110000)");
2430        return NULL;
2431    }
2432
2433    if (ordinal < 256)
2434        return get_latin1_char(ordinal);
2435
2436    v = PyUnicode_New(1, ordinal);
2437    if (v == NULL)
2438        return NULL;
2439    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2440    return v;
2441}
2442
2443PyObject *
2444PyUnicode_FromObject(register PyObject *obj)
2445{
2446    /* XXX Perhaps we should make this API an alias of
2447       PyObject_Str() instead ?! */
2448    if (PyUnicode_CheckExact(obj)) {
2449        if (PyUnicode_READY(obj))
2450            return NULL;
2451        Py_INCREF(obj);
2452        return obj;
2453    }
2454    if (PyUnicode_Check(obj)) {
2455        /* For a Unicode subtype that's not a Unicode object,
2456           return a true Unicode object with the same data. */
2457        return PyUnicode_Copy(obj);
2458    }
2459    PyErr_Format(PyExc_TypeError,
2460                 "Can't convert '%.100s' object to str implicitly",
2461                 Py_TYPE(obj)->tp_name);
2462    return NULL;
2463}
2464
2465PyObject *
2466PyUnicode_FromEncodedObject(register PyObject *obj,
2467                            const char *encoding,
2468                            const char *errors)
2469{
2470    Py_buffer buffer;
2471    PyObject *v;
2472
2473    if (obj == NULL) {
2474        PyErr_BadInternalCall();
2475        return NULL;
2476    }
2477
2478    /* Decoding bytes objects is the most common case and should be fast */
2479    if (PyBytes_Check(obj)) {
2480        if (PyBytes_GET_SIZE(obj) == 0) {
2481            Py_INCREF(unicode_empty);
2482            v = unicode_empty;
2483        }
2484        else {
2485            v = PyUnicode_Decode(
2486                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2487                    encoding, errors);
2488        }
2489        return v;
2490    }
2491
2492    if (PyUnicode_Check(obj)) {
2493        PyErr_SetString(PyExc_TypeError,
2494                        "decoding str is not supported");
2495        return NULL;
2496    }
2497
2498    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2499    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2500        PyErr_Format(PyExc_TypeError,
2501                     "coercing to str: need bytes, bytearray "
2502                     "or buffer-like object, %.80s found",
2503                     Py_TYPE(obj)->tp_name);
2504        return NULL;
2505    }
2506
2507    if (buffer.len == 0) {
2508        Py_INCREF(unicode_empty);
2509        v = unicode_empty;
2510    }
2511    else
2512        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2513
2514    PyBuffer_Release(&buffer);
2515    return v;
2516}
2517
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding:  null-terminated encoding name to normalize.
   lower:     output buffer receiving the normalized, null-terminated name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   0 so that not even the terminator fits), 1 on success.  When 0 is
   returned the content of `lower` is not null-terminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* Without this guard lower_len - 1 wraps around and the end pointer
       below would lie outside of the buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last position of the buffer, reserved for the terminator */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2550
2551PyObject *
2552PyUnicode_Decode(const char *s,
2553                 Py_ssize_t size,
2554                 const char *encoding,
2555                 const char *errors)
2556{
2557    PyObject *buffer = NULL, *unicode;
2558    Py_buffer info;
2559    char lower[11];  /* Enough for any encoding shortcut */
2560
2561    if (encoding == NULL)
2562        return PyUnicode_DecodeUTF8(s, size, errors);
2563
2564    /* Shortcuts for common default encodings */
2565    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2566        if ((strcmp(lower, "utf-8") == 0) ||
2567            (strcmp(lower, "utf8") == 0))
2568            return PyUnicode_DecodeUTF8(s, size, errors);
2569        else if ((strcmp(lower, "latin-1") == 0) ||
2570                 (strcmp(lower, "latin1") == 0) ||
2571                 (strcmp(lower, "iso-8859-1") == 0))
2572            return PyUnicode_DecodeLatin1(s, size, errors);
2573#ifdef HAVE_MBCS
2574        else if (strcmp(lower, "mbcs") == 0)
2575            return PyUnicode_DecodeMBCS(s, size, errors);
2576#endif
2577        else if (strcmp(lower, "ascii") == 0)
2578            return PyUnicode_DecodeASCII(s, size, errors);
2579        else if (strcmp(lower, "utf-16") == 0)
2580            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2581        else if (strcmp(lower, "utf-32") == 0)
2582            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2583    }
2584
2585    /* Decode via the codec registry */
2586    buffer = NULL;
2587    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2588        goto onError;
2589    buffer = PyMemoryView_FromBuffer(&info);
2590    if (buffer == NULL)
2591        goto onError;
2592    unicode = PyCodec_Decode(buffer, encoding, errors);
2593    if (unicode == NULL)
2594        goto onError;
2595    if (!PyUnicode_Check(unicode)) {
2596        PyErr_Format(PyExc_TypeError,
2597                     "decoder did not return a str object (type=%.400s)",
2598                     Py_TYPE(unicode)->tp_name);
2599        Py_DECREF(unicode);
2600        goto onError;
2601    }
2602    Py_DECREF(buffer);
2603    if (_PyUnicode_READY_REPLACE(&unicode)) {
2604        Py_DECREF(unicode);
2605        return NULL;
2606    }
2607    return unicode;
2608
2609  onError:
2610    Py_XDECREF(buffer);
2611    return NULL;
2612}
2613
2614PyObject *
2615PyUnicode_AsDecodedObject(PyObject *unicode,
2616                          const char *encoding,
2617                          const char *errors)
2618{
2619    PyObject *v;
2620
2621    if (!PyUnicode_Check(unicode)) {
2622        PyErr_BadArgument();
2623        goto onError;
2624    }
2625
2626    if (encoding == NULL)
2627        encoding = PyUnicode_GetDefaultEncoding();
2628
2629    /* Decode via the codec registry */
2630    v = PyCodec_Decode(unicode, encoding, errors);
2631    if (v == NULL)
2632        goto onError;
2633    return v;
2634
2635  onError:
2636    return NULL;
2637}
2638
2639PyObject *
2640PyUnicode_AsDecodedUnicode(PyObject *unicode,
2641                           const char *encoding,
2642                           const char *errors)
2643{
2644    PyObject *v;
2645
2646    if (!PyUnicode_Check(unicode)) {
2647        PyErr_BadArgument();
2648        goto onError;
2649    }
2650
2651    if (encoding == NULL)
2652        encoding = PyUnicode_GetDefaultEncoding();
2653
2654    /* Decode via the codec registry */
2655    v = PyCodec_Decode(unicode, encoding, errors);
2656    if (v == NULL)
2657        goto onError;
2658    if (!PyUnicode_Check(v)) {
2659        PyErr_Format(PyExc_TypeError,
2660                     "decoder did not return a str object (type=%.400s)",
2661                     Py_TYPE(v)->tp_name);
2662        Py_DECREF(v);
2663        goto onError;
2664    }
2665    return v;
2666
2667  onError:
2668    return NULL;
2669}
2670
2671PyObject *
2672PyUnicode_Encode(const Py_UNICODE *s,
2673                 Py_ssize_t size,
2674                 const char *encoding,
2675                 const char *errors)
2676{
2677    PyObject *v, *unicode;
2678
2679    unicode = PyUnicode_FromUnicode(s, size);
2680    if (unicode == NULL)
2681        return NULL;
2682    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2683    Py_DECREF(unicode);
2684    return v;
2685}
2686
2687PyObject *
2688PyUnicode_AsEncodedObject(PyObject *unicode,
2689                          const char *encoding,
2690                          const char *errors)
2691{
2692    PyObject *v;
2693
2694    if (!PyUnicode_Check(unicode)) {
2695        PyErr_BadArgument();
2696        goto onError;
2697    }
2698
2699    if (encoding == NULL)
2700        encoding = PyUnicode_GetDefaultEncoding();
2701
2702    /* Encode via the codec registry */
2703    v = PyCodec_Encode(unicode, encoding, errors);
2704    if (v == NULL)
2705        goto onError;
2706    return v;
2707
2708  onError:
2709    return NULL;
2710}
2711
2712PyObject *
2713PyUnicode_EncodeFSDefault(PyObject *unicode)
2714{
2715#ifdef HAVE_MBCS
2716    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2717                                PyUnicode_GET_SIZE(unicode),
2718                                NULL);
2719#elif defined(__APPLE__)
2720    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2721#else
2722    PyInterpreterState *interp = PyThreadState_GET()->interp;
2723    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2724       cannot use it to encode and decode filenames before it is loaded. Load
2725       the Python codec requires to encode at least its own filename. Use the C
2726       version of the locale codec until the codec registry is initialized and
2727       the Python codec is loaded.
2728
2729       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2730       cannot only rely on it: check also interp->fscodec_initialized for
2731       subinterpreters. */
2732    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2733        return PyUnicode_AsEncodedString(unicode,
2734                                         Py_FileSystemDefaultEncoding,
2735                                         "surrogateescape");
2736    }
2737    else {
2738        /* locale encoding with surrogateescape */
2739        wchar_t *wchar;
2740        char *bytes;
2741        PyObject *bytes_obj;
2742        size_t error_pos;
2743
2744        wchar = PyUnicode_AsWideCharString(unicode, NULL);
2745        if (wchar == NULL)
2746            return NULL;
2747        bytes = _Py_wchar2char(wchar, &error_pos);
2748        if (bytes == NULL) {
2749            if (error_pos != (size_t)-1) {
2750                char *errmsg = strerror(errno);
2751                PyObject *exc = NULL;
2752                if (errmsg == NULL)
2753                    errmsg = "Py_wchar2char() failed";
2754                raise_encode_exception(&exc,
2755                    "filesystemencoding",
2756                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2757                    error_pos, error_pos+1,
2758                    errmsg);
2759                Py_XDECREF(exc);
2760            }
2761            else
2762                PyErr_NoMemory();
2763            PyMem_Free(wchar);
2764            return NULL;
2765        }
2766        PyMem_Free(wchar);
2767
2768        bytes_obj = PyBytes_FromString(bytes);
2769        PyMem_Free(bytes);
2770        return bytes_obj;
2771    }
2772#endif
2773}
2774
2775PyObject *
2776PyUnicode_AsEncodedString(PyObject *unicode,
2777                          const char *encoding,
2778                          const char *errors)
2779{
2780    PyObject *v;
2781    char lower[11];  /* Enough for any encoding shortcut */
2782
2783    if (!PyUnicode_Check(unicode)) {
2784        PyErr_BadArgument();
2785        return NULL;
2786    }
2787
2788    if (encoding == NULL) {
2789        if (errors == NULL || strcmp(errors, "strict") == 0)
2790            return _PyUnicode_AsUTF8String(unicode, NULL);
2791        else
2792            return _PyUnicode_AsUTF8String(unicode, errors);
2793    }
2794
2795    /* Shortcuts for common default encodings */
2796    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2797        if ((strcmp(lower, "utf-8") == 0) ||
2798            (strcmp(lower, "utf8") == 0))
2799        {
2800            if (errors == NULL || strcmp(errors, "strict") == 0)
2801                return _PyUnicode_AsUTF8String(unicode, NULL);
2802            else
2803                return _PyUnicode_AsUTF8String(unicode, errors);
2804        }
2805        else if ((strcmp(lower, "latin-1") == 0) ||
2806                 (strcmp(lower, "latin1") == 0) ||
2807                 (strcmp(lower, "iso-8859-1") == 0))
2808            return _PyUnicode_AsLatin1String(unicode, errors);
2809#ifdef HAVE_MBCS
2810        else if (strcmp(lower, "mbcs") == 0)
2811            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2812                                        PyUnicode_GET_SIZE(unicode),
2813                                        errors);
2814#endif
2815        else if (strcmp(lower, "ascii") == 0)
2816            return _PyUnicode_AsASCIIString(unicode, errors);
2817    }
2818
2819    /* Encode via the codec registry */
2820    v = PyCodec_Encode(unicode, encoding, errors);
2821    if (v == NULL)
2822        return NULL;
2823
2824    /* The normal path */
2825    if (PyBytes_Check(v))
2826        return v;
2827
2828    /* If the codec returns a buffer, raise a warning and convert to bytes */
2829    if (PyByteArray_Check(v)) {
2830        int error;
2831        PyObject *b;
2832
2833        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2834            "encoder %s returned bytearray instead of bytes",
2835            encoding);
2836        if (error) {
2837            Py_DECREF(v);
2838            return NULL;
2839        }
2840
2841        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2842        Py_DECREF(v);
2843        return b;
2844    }
2845
2846    PyErr_Format(PyExc_TypeError,
2847                 "encoder did not return a bytes object (type=%.400s)",
2848                 Py_TYPE(v)->tp_name);
2849    Py_DECREF(v);
2850    return NULL;
2851}
2852
2853PyObject *
2854PyUnicode_AsEncodedUnicode(PyObject *unicode,
2855                           const char *encoding,
2856                           const char *errors)
2857{
2858    PyObject *v;
2859
2860    if (!PyUnicode_Check(unicode)) {
2861        PyErr_BadArgument();
2862        goto onError;
2863    }
2864
2865    if (encoding == NULL)
2866        encoding = PyUnicode_GetDefaultEncoding();
2867
2868    /* Encode via the codec registry */
2869    v = PyCodec_Encode(unicode, encoding, errors);
2870    if (v == NULL)
2871        goto onError;
2872    if (!PyUnicode_Check(v)) {
2873        PyErr_Format(PyExc_TypeError,
2874                     "encoder did not return an str object (type=%.400s)",
2875                     Py_TYPE(v)->tp_name);
2876        Py_DECREF(v);
2877        goto onError;
2878    }
2879    return v;
2880
2881  onError:
2882    return NULL;
2883}
2884
2885PyObject*
2886PyUnicode_DecodeFSDefault(const char *s) {
2887    Py_ssize_t size = (Py_ssize_t)strlen(s);
2888    return PyUnicode_DecodeFSDefaultAndSize(s, size);
2889}
2890
2891PyObject*
2892PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2893{
2894#ifdef HAVE_MBCS
2895    return PyUnicode_DecodeMBCS(s, size, NULL);
2896#elif defined(__APPLE__)
2897    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2898#else
2899    PyInterpreterState *interp = PyThreadState_GET()->interp;
2900    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2901       cannot use it to encode and decode filenames before it is loaded. Load
2902       the Python codec requires to encode at least its own filename. Use the C
2903       version of the locale codec until the codec registry is initialized and
2904       the Python codec is loaded.
2905
2906       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2907       cannot only rely on it: check also interp->fscodec_initialized for
2908       subinterpreters. */
2909    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2910        return PyUnicode_Decode(s, size,
2911                                Py_FileSystemDefaultEncoding,
2912                                "surrogateescape");
2913    }
2914    else {
2915        /* locale encoding with surrogateescape */
2916        wchar_t *wchar;
2917        PyObject *unicode;
2918        size_t len;
2919
2920        if (s[size] != '\0' || size != strlen(s)) {
2921            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2922            return NULL;
2923        }
2924
2925        wchar = _Py_char2wchar(s, &len);
2926        if (wchar == NULL)
2927            return PyErr_NoMemory();
2928
2929        unicode = PyUnicode_FromWideChar(wchar, len);
2930        PyMem_Free(wchar);
2931        return unicode;
2932    }
2933#endif
2934}
2935
2936
2937int
2938PyUnicode_FSConverter(PyObject* arg, void* addr)
2939{
2940    PyObject *output = NULL;
2941    Py_ssize_t size;
2942    void *data;
2943    if (arg == NULL) {
2944        Py_DECREF(*(PyObject**)addr);
2945        return 1;
2946    }
2947    if (PyBytes_Check(arg)) {
2948        output = arg;
2949        Py_INCREF(output);
2950    }
2951    else {
2952        arg = PyUnicode_FromObject(arg);
2953        if (!arg)
2954            return 0;
2955        output = PyUnicode_EncodeFSDefault(arg);
2956        Py_DECREF(arg);
2957        if (!output)
2958            return 0;
2959        if (!PyBytes_Check(output)) {
2960            Py_DECREF(output);
2961            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2962            return 0;
2963        }
2964    }
2965    size = PyBytes_GET_SIZE(output);
2966    data = PyBytes_AS_STRING(output);
2967    if (size != strlen(data)) {
2968        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2969        Py_DECREF(output);
2970        return 0;
2971    }
2972    *(PyObject**)addr = output;
2973    return Py_CLEANUP_SUPPORTED;
2974}
2975
2976
2977int
2978PyUnicode_FSDecoder(PyObject* arg, void* addr)
2979{
2980    PyObject *output = NULL;
2981    if (arg == NULL) {
2982        Py_DECREF(*(PyObject**)addr);
2983        return 1;
2984    }
2985    if (PyUnicode_Check(arg)) {
2986        if (PyUnicode_READY(arg))
2987            return 0;
2988        output = arg;
2989        Py_INCREF(output);
2990    }
2991    else {
2992        arg = PyBytes_FromObject(arg);
2993        if (!arg)
2994            return 0;
2995        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2996                                                  PyBytes_GET_SIZE(arg));
2997        Py_DECREF(arg);
2998        if (!output)
2999            return 0;
3000        if (!PyUnicode_Check(output)) {
3001            Py_DECREF(output);
3002            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3003            return 0;
3004        }
3005    }
3006    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3007                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3008        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3009        Py_DECREF(output);
3010        return 0;
3011    }
3012    *(PyObject**)addr = output;
3013    return Py_CLEANUP_SUPPORTED;
3014}
3015
3016
3017char*
3018PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3019{
3020    PyObject *bytes;
3021    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3022
3023    if (!PyUnicode_Check(unicode)) {
3024        PyErr_BadArgument();
3025        return NULL;
3026    }
3027    if (PyUnicode_READY(u) == -1)
3028        return NULL;
3029
3030    if (PyUnicode_UTF8(unicode) == NULL) {
3031        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3032        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3033        if (bytes == NULL)
3034            return NULL;
3035        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3036        if (_PyUnicode_UTF8(u) == NULL) {
3037            Py_DECREF(bytes);
3038            return NULL;
3039        }
3040        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3041        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3042        Py_DECREF(bytes);
3043    }
3044
3045    if (psize)
3046        *psize = PyUnicode_UTF8_LENGTH(unicode);
3047    return PyUnicode_UTF8(unicode);
3048}
3049
3050char*
3051PyUnicode_AsUTF8(PyObject *unicode)
3052{
3053    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3054}
3055
/* Debug builds only: incremented by PyUnicode_AsUnicodeAndSize() each time
   it finds no existing wstr buffer and has to build one. */
#ifdef Py_DEBUG
int unicode_as_unicode_calls = 0;
#endif
3059
3060
/* Return the wchar_t (Py_UNICODE) representation of `unicode`, building and
   storing it in the object's wstr slot if that slot is still NULL.

   If `size` is not NULL, PyUnicode_WSTR_LENGTH() of the object is stored in
   *size.  The returned buffer is the one stored in the object.

   Returns NULL after PyErr_BadArgument() if `unicode` is not a str object,
   and NULL after PyErr_NoMemory() if the buffer cannot be allocated.

   NOTE(review): readiness is only asserted here, not established with
   PyUnicode_READY(); presumably callers pass ready objects -- confirm. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    PyUnicodeObject *u;
    const unsigned char *one_byte;
    /* Only the locals needed for the widening/narrowing direction that is
       possible with this wchar_t size are declared. */
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    u = (PyUnicodeObject*)unicode;
    if (_PyUnicode_WSTR(u) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(u) != 0);
        assert(PyUnicode_IS_READY(u));

#ifdef Py_DEBUG
        ++unicode_as_unicode_calls;
#endif

        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* First pass: count the code points above 0xFFFF, each of which
               needs two wchar_t units (a surrogate pair). */
            four_bytes = PyUnicode_4BYTE_DATA(u);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* One unit per code point, one extra per surrogate pair, plus
               one for the terminating 0. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;

            /* Second pass: copy, splitting large code points into pairs. */
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
            four_bytes = PyUnicode_4BYTE_DATA(u);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    /* encode surrogate pair in this case */
                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
                }
                else
                    *w = *four_bytes;

                /* Sanity check (active only when assertions are enabled):
                   the write position must not pass the computed end. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* terminate the wstr with a 0 unit */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per code point plus one
               for the terminating 0. */
            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(u) + 1));
            if (!_PyUnicode_WSTR(u)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): the wstr length is not written for compact
               ASCII objects; presumably they have no separate wstr_length
               field -- confirm against the struct definitions. */
            if (!PyUnicode_IS_COMPACT_ASCII(u))
                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
            w = _PyUnicode_WSTR(u);
            wchar_end = w + _PyUnicode_LENGTH(u);

            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
                /* widen each byte to a wchar_t */
                one_byte = PyUnicode_1BYTE_DATA(u);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* widen each 2-byte unit to a wchar_t */
                two_bytes = PyUnicode_2BYTE_DATA(u);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Undo the allocation made above before reporting the
                   fatal inconsistency. */
                PyObject_FREE(_PyUnicode_WSTR(u));
                _PyUnicode_WSTR(u) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(u);
    return _PyUnicode_WSTR(u);
}
3177
3178Py_UNICODE *
3179PyUnicode_AsUnicode(PyObject *unicode)
3180{
3181    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3182}
3183
3184
3185Py_ssize_t
3186PyUnicode_GetSize(PyObject *unicode)
3187{
3188    if (!PyUnicode_Check(unicode)) {
3189        PyErr_BadArgument();
3190        goto onError;
3191    }
3192    return PyUnicode_GET_SIZE(unicode);
3193
3194  onError:
3195    return -1;
3196}
3197
3198Py_ssize_t
3199PyUnicode_GetLength(PyObject *unicode)
3200{
3201    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3202        PyErr_BadArgument();
3203        return -1;
3204    }
3205
3206    return PyUnicode_GET_LENGTH(unicode);
3207}
3208
3209Py_UCS4
3210PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3211{
3212    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3213        PyErr_BadArgument();
3214        return (Py_UCS4)-1;
3215    }
3216    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3217        PyErr_SetString(PyExc_IndexError, "string index out of range");
3218        return (Py_UCS4)-1;
3219    }
3220    return PyUnicode_READ_CHAR(unicode, index);
3221}
3222
3223int
3224PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3225{
3226    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3227        PyErr_BadArgument();
3228        return -1;
3229    }
3230    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3231        PyErr_SetString(PyExc_IndexError, "string index out of range");
3232        return -1;
3233    }
3234    if (_PyUnicode_Dirty(unicode))
3235        return -1;
3236    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3237                    index, ch);
3238    return 0;
3239}
3240
/* Return the name of the default encoding.  The name is fixed: the same
   NUL-terminated string "utf-8" is returned on every call. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3246
3247/* create or adjust a UnicodeDecodeError */
3248static void
3249make_decode_exception(PyObject **exceptionObject,
3250                      const char *encoding,
3251                      const char *input, Py_ssize_t length,
3252                      Py_ssize_t startpos, Py_ssize_t endpos,
3253                      const char *reason)
3254{
3255    if (*exceptionObject == NULL) {
3256        *exceptionObject = PyUnicodeDecodeError_Create(
3257            encoding, input, length, startpos, endpos, reason);
3258    }
3259    else {
3260        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3261            goto onError;
3262        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3263            goto onError;
3264        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3265            goto onError;
3266    }
3267    return;
3268
3269onError:
3270    Py_DECREF(*exceptionObject);
3271    *exceptionObject = NULL;
3272}
3273
3274/* error handling callback helper:
3275   build arguments, call the callback and check the arguments,
3276   if no exception occurred, copy the replacement to the output
3277   and adjust various state variables.
3278   return 0 on success, -1 on error
3279*/
3280
3281static int
3282unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3283                                 const char *encoding, const char *reason,
3284                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3285                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3286                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3287{
3288    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3289
3290    PyObject *restuple = NULL;
3291    PyObject *repunicode = NULL;
3292    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3293    Py_ssize_t insize;
3294    Py_ssize_t requiredsize;
3295    Py_ssize_t newpos;
3296    const Py_UNICODE *repptr;
3297    PyObject *inputobj = NULL;
3298    Py_ssize_t repsize;
3299    int res = -1;
3300
3301    if (*errorHandler == NULL) {
3302        *errorHandler = PyCodec_LookupError(errors);
3303        if (*errorHandler == NULL)
3304            goto onError;
3305    }
3306
3307    make_decode_exception(exceptionObject,
3308        encoding,
3309        *input, *inend - *input,
3310        *startinpos, *endinpos,
3311        reason);
3312    if (*exceptionObject == NULL)
3313        goto onError;
3314
3315    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3316    if (restuple == NULL)
3317        goto onError;
3318    if (!PyTuple_Check(restuple)) {
3319        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3320        goto onError;
3321    }
3322    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3323        goto onError;
3324
3325    /* Copy back the bytes variables, which might have been modified by the
3326       callback */
3327    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3328    if (!inputobj)
3329        goto onError;
3330    if (!PyBytes_Check(inputobj)) {
3331        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3332    }
3333    *input = PyBytes_AS_STRING(inputobj);
3334    insize = PyBytes_GET_SIZE(inputobj);
3335    *inend = *input + insize;
3336    /* we can DECREF safely, as the exception has another reference,
3337       so the object won't go away. */
3338    Py_DECREF(inputobj);
3339
3340    if (newpos<0)
3341        newpos = insize+newpos;
3342    if (newpos<0 || newpos>insize) {
3343        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3344        goto onError;
3345    }
3346
3347    /* need more space? (at least enough for what we
3348       have+the replacement+the rest of the string (starting
3349       at the new input position), so we won't have to check space
3350       when there are no errors in the rest of the string) */
3351    repptr = PyUnicode_AS_UNICODE(repunicode);
3352    repsize = PyUnicode_GET_SIZE(repunicode);
3353    requiredsize = *outpos + repsize + insize-newpos;
3354    if (requiredsize > outsize) {
3355        if (requiredsize<2*outsize)
3356            requiredsize = 2*outsize;
3357        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3358            goto onError;
3359        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3360    }
3361    *endinpos = newpos;
3362    *inptr = *input + newpos;
3363    Py_UNICODE_COPY(*outptr, repptr, repsize);
3364    *outptr += repsize;
3365    *outpos += repsize;
3366
3367    /* we made it! */
3368    res = 0;
3369
3370  onError:
3371    Py_XDECREF(restuple);
3372    return res;
3373}
3374
3375/* --- UTF-7 Codec -------------------------------------------------------- */
3376
3377/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3378
3379/* Three simple macros defining base-64. */
3380
/* True if c belongs to the base-64 alphabet: digits, ASCII letters,
   '+' or '/'.  Note that c is evaluated more than once. */

#define IS_BASE64(c)                    \
    (((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z') ||      \
     (c) == '/' || (c) == '+')
3388
/* Base-64 value (0..63) of c, which must be a base-64 character:
   'A'-'Z' map to 0-25, 'a'-'z' to 26-51, '0'-'9' to 52-61, '+' to 62,
   and anything else (i.e. '/') to 63.  c is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
3396
/* Base-64 character for the low 6 bits of n (higher bits are ignored). */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 63])
3401
/* DECODE_DIRECT: true if this byte, encountered in a UTF-7 string outside
 * a base64 section, decodes as itself.  Decoding is permissive: every
 * value up to 127 qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
3409
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the character code (0..127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; values outside 1..127 are never direct and
 *            are rejected before the table is indexed.
 * directO  : nonzero to encode Set O characters as themselves.
 * directWS : nonzero to encode whitespace as itself.
 *
 * The flag arguments are parenthesized so that compound expressions such
 * as `a || b` can be passed safely.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3455
3456PyObject *
3457PyUnicode_DecodeUTF7(const char *s,
3458                     Py_ssize_t size,
3459                     const char *errors)
3460{
3461    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3462}
3463
3464/* The decoder.  The only state we preserve is our read position,
3465 * i.e. how many characters we have consumed.  So if we end in the
3466 * middle of a shift sequence we have to back off the read position
3467 * and the output to the beginning of the sequence, otherwise we lose
3468 * all the shift state (seen bits, number of bits seen, high
3469 * surrogate). */
3470
3471PyObject *
3472PyUnicode_DecodeUTF7Stateful(const char *s,
3473                             Py_ssize_t size,
3474                             const char *errors,
3475                             Py_ssize_t *consumed)
3476{
3477    const char *starts = s;
3478    Py_ssize_t startinpos;
3479    Py_ssize_t endinpos;
3480    Py_ssize_t outpos;
3481    const char *e;
3482    PyUnicodeObject *unicode;
3483    Py_UNICODE *p;
3484    const char *errmsg = "";
3485    int inShift = 0;
3486    Py_UNICODE *shiftOutStart;
3487    unsigned int base64bits = 0;
3488    unsigned long base64buffer = 0;
3489    Py_UNICODE surrogate = 0;
3490    PyObject *errorHandler = NULL;
3491    PyObject *exc = NULL;
3492
3493    unicode = _PyUnicode_New(size);
3494    if (!unicode)
3495        return NULL;
3496    if (size == 0) {
3497        if (consumed)
3498            *consumed = 0;
3499        return (PyObject *)unicode;
3500    }
3501
3502    p = PyUnicode_AS_UNICODE(unicode);
3503    shiftOutStart = p;
3504    e = s + size;
3505
3506    while (s < e) {
3507        Py_UNICODE ch;
3508      restart:
3509        ch = (unsigned char) *s;
3510
3511        if (inShift) { /* in a base-64 section */
3512            if (IS_BASE64(ch)) { /* consume a base-64 character */
3513                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3514                base64bits += 6;
3515                s++;
3516                if (base64bits >= 16) {
3517                    /* we have enough bits for a UTF-16 value */
3518                    Py_UNICODE outCh = (Py_UNICODE)
3519                                       (base64buffer >> (base64bits-16));
3520                    base64bits -= 16;
3521                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3522                    if (surrogate) {
3523                        /* expecting a second surrogate */
3524                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3525#ifdef Py_UNICODE_WIDE
3526                            *p++ = (((surrogate & 0x3FF)<<10)
3527                                    | (outCh & 0x3FF)) + 0x10000;
3528#else
3529                            *p++ = surrogate;
3530                            *p++ = outCh;
3531#endif
3532                            surrogate = 0;
3533                        }
3534                        else {
3535                            surrogate = 0;
3536                            errmsg = "second surrogate missing";
3537                            goto utf7Error;
3538                        }
3539                    }
3540                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3541                        /* first surrogate */
3542                        surrogate = outCh;
3543                    }
3544                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3545                        errmsg = "unexpected second surrogate";
3546                        goto utf7Error;
3547                    }
3548                    else {
3549                        *p++ = outCh;
3550                    }
3551                }
3552            }
3553            else { /* now leaving a base-64 section */
3554                inShift = 0;
3555                s++;
3556                if (surrogate) {
3557                    errmsg = "second surrogate missing at end of shift sequence";
3558                    goto utf7Error;
3559                }
3560                if (base64bits > 0) { /* left-over bits */
3561                    if (base64bits >= 6) {
3562                        /* We've seen at least one base-64 character */
3563                        errmsg = "partial character in shift sequence";
3564                        goto utf7Error;
3565                    }
3566                    else {
3567                        /* Some bits remain; they should be zero */
3568                        if (base64buffer != 0) {
3569                            errmsg = "non-zero padding bits in shift sequence";
3570                            goto utf7Error;
3571                        }
3572                    }
3573                }
3574                if (ch != '-') {
3575                    /* '-' is absorbed; other terminating
3576                       characters are preserved */
3577                    *p++ = ch;
3578                }
3579            }
3580        }
3581        else if ( ch == '+' ) {
3582            startinpos = s-starts;
3583            s++; /* consume '+' */
3584            if (s < e && *s == '-') { /* '+-' encodes '+' */
3585                s++;
3586                *p++ = '+';
3587            }
3588            else { /* begin base64-encoded section */
3589                inShift = 1;
3590                shiftOutStart = p;
3591                base64bits = 0;
3592            }
3593        }
3594        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3595            *p++ = ch;
3596            s++;
3597        }
3598        else {
3599            startinpos = s-starts;
3600            s++;
3601            errmsg = "unexpected special character";
3602            goto utf7Error;
3603        }
3604        continue;
3605utf7Error:
3606        outpos = p-PyUnicode_AS_UNICODE(unicode);
3607        endinpos = s-starts;
3608        if (unicode_decode_call_errorhandler(
3609                errors, &errorHandler,
3610                "utf7", errmsg,
3611                &starts, &e, &startinpos, &endinpos, &exc, &s,
3612                &unicode, &outpos, &p))
3613            goto onError;
3614    }
3615
3616    /* end of string */
3617
3618    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3619        /* if we're in an inconsistent state, that's an error */
3620        if (surrogate ||
3621                (base64bits >= 6) ||
3622                (base64bits > 0 && base64buffer != 0)) {
3623            outpos = p-PyUnicode_AS_UNICODE(unicode);
3624            endinpos = size;
3625            if (unicode_decode_call_errorhandler(
3626                    errors, &errorHandler,
3627                    "utf7", "unterminated shift sequence",
3628                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3629                    &unicode, &outpos, &p))
3630                goto onError;
3631            if (s < e)
3632                goto restart;
3633        }
3634    }
3635
3636    /* return state */
3637    if (consumed) {
3638        if (inShift) {
3639            p = shiftOutStart; /* back off output */
3640            *consumed = startinpos;
3641        }
3642        else {
3643            *consumed = s-starts;
3644        }
3645    }
3646
3647    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3648        goto onError;
3649
3650    Py_XDECREF(errorHandler);
3651    Py_XDECREF(exc);
3652    if (_PyUnicode_READY_REPLACE(&unicode)) {
3653        Py_DECREF(unicode);
3654        return NULL;
3655    }
3656    return (PyObject *)unicode;
3657
3658  onError:
3659    Py_XDECREF(errorHandler);
3660    Py_XDECREF(exc);
3661    Py_DECREF(unicode);
3662    return NULL;
3663}
3664
3665
3666PyObject *
3667PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3668                     Py_ssize_t size,
3669                     int base64SetO,
3670                     int base64WhiteSpace,
3671                     const char *errors)
3672{
3673    PyObject *v;
3674    /* It might be possible to tighten this worst case */
3675    Py_ssize_t allocated = 8 * size;
3676    int inShift = 0;
3677    Py_ssize_t i = 0;
3678    unsigned int base64bits = 0;
3679    unsigned long base64buffer = 0;
3680    char * out;
3681    char * start;
3682
3683    if (size == 0)
3684        return PyBytes_FromStringAndSize(NULL, 0);
3685
3686    if (allocated / 8 != size)
3687        return PyErr_NoMemory();
3688
3689    v = PyBytes_FromStringAndSize(NULL, allocated);
3690    if (v == NULL)
3691        return NULL;
3692
3693    start = out = PyBytes_AS_STRING(v);
3694    for (;i < size; ++i) {
3695        Py_UNICODE ch = s[i];
3696
3697        if (inShift) {
3698            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3699                /* shifting out */
3700                if (base64bits) { /* output remaining bits */
3701                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3702                    base64buffer = 0;
3703                    base64bits = 0;
3704                }
3705                inShift = 0;
3706                /* Characters not in the BASE64 set implicitly unshift the sequence
3707                   so no '-' is required, except if the character is itself a '-' */
3708                if (IS_BASE64(ch) || ch == '-') {
3709                    *out++ = '-';
3710                }
3711                *out++ = (char) ch;
3712            }
3713            else {
3714                goto encode_char;
3715            }
3716        }
3717        else { /* not in a shift sequence */
3718            if (ch == '+') {
3719                *out++ = '+';
3720                        *out++ = '-';
3721            }
3722            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3723                *out++ = (char) ch;
3724            }
3725            else {
3726                *out++ = '+';
3727                inShift = 1;
3728                goto encode_char;
3729            }
3730        }
3731        continue;
3732encode_char:
3733#ifdef Py_UNICODE_WIDE
3734        if (ch >= 0x10000) {
3735            /* code first surrogate */
3736            base64bits += 16;
3737            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3738            while (base64bits >= 6) {
3739                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3740                base64bits -= 6;
3741            }
3742            /* prepare second surrogate */
3743            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3744        }
3745#endif
3746        base64bits += 16;
3747        base64buffer = (base64buffer << 16) | ch;
3748        while (base64bits >= 6) {
3749            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3750            base64bits -= 6;
3751        }
3752    }
3753    if (base64bits)
3754        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3755    if (inShift)
3756        *out++ = '-';
3757    if (_PyBytes_Resize(&v, out - start) < 0)
3758        return NULL;
3759    return v;
3760}
3761
/* The helper macros used by the UTF-7 codec functions above are
   undefined here so they do not stay visible in the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
3767
3768/* --- UTF-8 Codec -------------------------------------------------------- */
3769
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.

   The table is read-only lookup data, hence `const`. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3791
3792PyObject *
3793PyUnicode_DecodeUTF8(const char *s,
3794                     Py_ssize_t size,
3795                     const char *errors)
3796{
3797    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3798}
3799
3800/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3801#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3802
3803/* Mask to quickly check whether a C 'long' contains a
3804   non-ASCII, UTF8-encoded char. */
3805#if (SIZEOF_LONG == 8)
3806# define ASCII_CHAR_MASK 0x8080808080808080L
3807#elif (SIZEOF_LONG == 4)
3808# define ASCII_CHAR_MASK 0x80808080L
3809#else
3810# error C 'long' size should be either 4 or 8!
3811#endif
3812
3813/* Scans a UTF-8 string and returns the maximum character to be expected,
3814   the size of the decoded unicode string and if any major errors were
3815   encountered.
3816
3817   This function does check basic UTF-8 sanity, it does however NOT CHECK
3818   if the string contains surrogates, and if all continuation bytes are
3819   within the correct ranges, these checks are performed in
3820   PyUnicode_DecodeUTF8Stateful.
3821
3822   If it sets has_errors to 1, it means the value of unicode_size and max_char
3823   will be bogus and you should not rely on useful information in them.
3824   */
3825static Py_UCS4
3826utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3827                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3828                                  int *has_errors)
3829{
3830    Py_ssize_t n;
3831    Py_ssize_t char_count = 0;
3832    Py_UCS4 max_char = 127, new_max;
3833    Py_UCS4 upper_bound;
3834    const unsigned char *p = (const unsigned char *)s;
3835    const unsigned char *end = p + string_size;
3836    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3837    int err = 0;
3838
3839    for (; p < end && !err; ++p, ++char_count) {
3840        /* Only check value if it's not a ASCII char... */
3841        if (*p < 0x80) {
3842            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3843               an explanation. */
3844            if (!((size_t) p & LONG_PTR_MASK)) {
3845                /* Help register allocation */
3846                register const unsigned char *_p = p;
3847                while (_p < aligned_end) {
3848                    unsigned long value = *(unsigned long *) _p;
3849                    if (value & ASCII_CHAR_MASK)
3850                        break;
3851                    _p += SIZEOF_LONG;
3852                    char_count += SIZEOF_LONG;
3853                }
3854                p = _p;
3855                if (p == end)
3856                    break;
3857            }
3858        }
3859        if (*p >= 0x80) {
3860            n = utf8_code_length[*p];
3861            new_max = max_char;
3862            switch (n) {
3863            /* invalid start byte */
3864            case 0:
3865                err = 1;
3866                break;
3867            case 2:
3868                /* Code points between 0x00FF and 0x07FF inclusive.
3869                   Approximate the upper bound of the code point,
3870                   if this flips over 255 we can be sure it will be more
3871                   than 255 and the string will need 2 bytes per code coint,
3872                   if it stays under or equal to 255, we can be sure 1 byte
3873                   is enough.
3874                   ((*p & 0b00011111) << 6) | 0b00111111 */
3875                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3876                if (max_char < upper_bound)
3877                    new_max = upper_bound;
3878                /* Ensure we track at least that we left ASCII space. */
3879                if (new_max < 128)
3880                    new_max = 128;
3881                break;
3882            case 3:
3883                /* Between 0x0FFF and 0xFFFF inclusive, so values are
3884                   always > 255 and <= 65535 and will always need 2 bytes. */
3885                if (max_char < 65535)
3886                    new_max = 65535;
3887                break;
3888            case 4:
3889                /* Code point will be above 0xFFFF for sure in this case. */
3890                new_max = 65537;
3891                break;
3892            /* Internal error, this should be caught by the first if */
3893            case 1:
3894            default:
3895                assert(0 && "Impossible case in utf8_max_char_and_size");
3896                err = 1;
3897            }
3898            /* Instead of number of overall bytes for this code point,
3899               n containts the number of following bytes: */
3900            --n;
3901            /* Check if the follow up chars are all valid continuation bytes */
3902            if (n >= 1) {
3903                const unsigned char *cont;
3904                if ((p + n) >= end) {
3905                    if (consumed == 0)
3906                        /* incomplete data, non-incremental decoding */
3907                        err = 1;
3908                    break;
3909                }
3910                for (cont = p + 1; cont < (p + n); ++cont) {
3911                    if ((*cont & 0xc0) != 0x80) {
3912                        err = 1;
3913                        break;
3914                    }
3915                }
3916                p += n;
3917            }
3918            else
3919                err = 1;
3920            max_char = new_max;
3921        }
3922    }
3923
3924    if (unicode_size)
3925        *unicode_size = char_count;
3926    if (has_errors)
3927        *has_errors = err;
3928    return max_char;
3929}
3930
/* Store `value` at position `index` of `buf`, selecting the element type
   from `kind`.  Similar to PyUnicode_WRITE, but PyUnicode_WCHAR_KIND is
   accepted too, in which case `buf` is written as a Py_UNICODE array
   (the wstr field of the legacy unicode representation).  Any kind other
   than the wchar, 1-byte and 2-byte ones is written as Py_UCS4. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int wkind_ = (kind); \
        switch (wkind_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
3945
3946PyObject *
3947PyUnicode_DecodeUTF8Stateful(const char *s,
3948                             Py_ssize_t size,
3949                             const char *errors,
3950                             Py_ssize_t *consumed)
3951{
3952    const char *starts = s;
3953    int n;
3954    int k;
3955    Py_ssize_t startinpos;
3956    Py_ssize_t endinpos;
3957    const char *e, *aligned_end;
3958    PyUnicodeObject *unicode;
3959    const char *errmsg = "";
3960    PyObject *errorHandler = NULL;
3961    PyObject *exc = NULL;
3962    Py_UCS4 maxchar = 0;
3963    Py_ssize_t unicode_size;
3964    Py_ssize_t i;
3965    int kind;
3966    void *data;
3967    int has_errors;
3968    Py_UNICODE *error_outptr;
3969#if SIZEOF_WCHAR_T == 2
3970    Py_ssize_t wchar_offset = 0;
3971#endif
3972
3973    if (size == 0) {
3974        if (consumed)
3975            *consumed = 0;
3976        return (PyObject *)PyUnicode_New(0, 0);
3977    }
3978    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3979                                                consumed, &has_errors);
3980    if (has_errors) {
3981        unicode = _PyUnicode_New(size);
3982        if (!unicode)
3983            return NULL;
3984        kind = PyUnicode_WCHAR_KIND;
3985        data = PyUnicode_AS_UNICODE(unicode);
3986        assert(data != NULL);
3987    }
3988    else {
3989        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3990        if (!unicode)
3991            return NULL;
3992        /* When the string is ASCII only, just use memcpy and return.
3993           unicode_size may be != size if there is an incomplete UTF-8
3994           sequence at the end of the ASCII block.  */
3995        if (maxchar < 128 && size == unicode_size) {
3996            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
3997            return (PyObject *)unicode;
3998        }
3999        kind = PyUnicode_KIND(unicode);
4000        data = PyUnicode_DATA(unicode);
4001    }
4002    /* Unpack UTF-8 encoded data */
4003    i = 0;
4004    e = s + size;
4005    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4006
4007    while (s < e) {
4008        Py_UCS4 ch = (unsigned char)*s;
4009
4010        if (ch < 0x80) {
4011            /* Fast path for runs of ASCII characters. Given that common UTF-8
4012               input will consist of an overwhelming majority of ASCII
4013               characters, we try to optimize for this case by checking
4014               as many characters as a C 'long' can contain.
4015               First, check if we can do an aligned read, as most CPUs have
4016               a penalty for unaligned reads.
4017            */
4018            if (!((size_t) s & LONG_PTR_MASK)) {
4019                /* Help register allocation */
4020                register const char *_s = s;
4021                register Py_ssize_t _i = i;
4022                while (_s < aligned_end) {
4023                    /* Read a whole long at a time (either 4 or 8 bytes),
4024                       and do a fast unrolled copy if it only contains ASCII
4025                       characters. */
4026                    unsigned long value = *(unsigned long *) _s;
4027                    if (value & ASCII_CHAR_MASK)
4028                        break;
4029                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4030                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4031                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4032                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4033#if (SIZEOF_LONG == 8)
4034                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4035                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4036                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4037                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4038#endif
4039                    _s += SIZEOF_LONG;
4040                    _i += SIZEOF_LONG;
4041                }
4042                s = _s;
4043                i = _i;
4044                if (s == e)
4045                    break;
4046                ch = (unsigned char)*s;
4047            }
4048        }
4049
4050        if (ch < 0x80) {
4051            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4052            s++;
4053            continue;
4054        }
4055
4056        n = utf8_code_length[ch];
4057
4058        if (s + n > e) {
4059            if (consumed)
4060                break;
4061            else {
4062                errmsg = "unexpected end of data";
4063                startinpos = s-starts;
4064                endinpos = startinpos+1;
4065                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4066                    endinpos++;
4067                goto utf8Error;
4068            }
4069        }
4070
4071        switch (n) {
4072
4073        case 0:
4074            errmsg = "invalid start byte";
4075            startinpos = s-starts;
4076            endinpos = startinpos+1;
4077            goto utf8Error;
4078
4079        case 1:
4080            errmsg = "internal error";
4081            startinpos = s-starts;
4082            endinpos = startinpos+1;
4083            goto utf8Error;
4084
4085        case 2:
4086            if ((s[1] & 0xc0) != 0x80) {
4087                errmsg = "invalid continuation byte";
4088                startinpos = s-starts;
4089                endinpos = startinpos + 1;
4090                goto utf8Error;
4091            }
4092            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4093            assert ((ch > 0x007F) && (ch <= 0x07FF));
4094            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4095            break;
4096
4097        case 3:
4098            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4099               will result in surrogates in range d800-dfff. Surrogates are
4100               not valid UTF-8 so they are rejected.
4101               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4102               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4103            if ((s[1] & 0xc0) != 0x80 ||
4104                (s[2] & 0xc0) != 0x80 ||
4105                ((unsigned char)s[0] == 0xE0 &&
4106                 (unsigned char)s[1] < 0xA0) ||
4107                ((unsigned char)s[0] == 0xED &&
4108                 (unsigned char)s[1] > 0x9F)) {
4109                errmsg = "invalid continuation byte";
4110                startinpos = s-starts;
4111                endinpos = startinpos + 1;
4112
4113                /* if s[1] first two bits are 1 and 0, then the invalid
4114                   continuation byte is s[2], so increment endinpos by 1,
4115                   if not, s[1] is invalid and endinpos doesn't need to
4116                   be incremented. */
4117                if ((s[1] & 0xC0) == 0x80)
4118                    endinpos++;
4119                goto utf8Error;
4120            }
4121            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4122            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4123            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4124            break;
4125
4126        case 4:
4127            if ((s[1] & 0xc0) != 0x80 ||
4128                (s[2] & 0xc0) != 0x80 ||
4129                (s[3] & 0xc0) != 0x80 ||
4130                ((unsigned char)s[0] == 0xF0 &&
4131                 (unsigned char)s[1] < 0x90) ||
4132                ((unsigned char)s[0] == 0xF4 &&
4133                 (unsigned char)s[1] > 0x8F)) {
4134                errmsg = "invalid continuation byte";
4135                startinpos = s-starts;
4136                endinpos = startinpos + 1;
4137                if ((s[1] & 0xC0) == 0x80) {
4138                    endinpos++;
4139                    if ((s[2] & 0xC0) == 0x80)
4140                        endinpos++;
4141                }
4142                goto utf8Error;
4143            }
4144            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4145                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4146            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4147
4148            /* If the string is flexible or we have native UCS-4, write
4149               directly.. */
4150            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4151                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4152
4153            else {
4154                /* compute and append the two surrogates: */
4155
4156                /* translate from 10000..10FFFF to 0..FFFF */
4157                ch -= 0x10000;
4158
4159                /* high surrogate = top 10 bits added to D800 */
4160                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4161                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4162
4163                /* low surrogate = bottom 10 bits added to DC00 */
4164                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4165                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4166            }
4167#if SIZEOF_WCHAR_T == 2
4168            wchar_offset++;
4169#endif
4170            break;
4171        }
4172        s += n;
4173        continue;
4174
4175      utf8Error:
4176        /* If this is not yet a resizable string, make it one.. */
4177        if (kind != PyUnicode_WCHAR_KIND) {
4178            const Py_UNICODE *u;
4179            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4180            if (!new_unicode)
4181                goto onError;
4182            u = PyUnicode_AsUnicode((PyObject *)unicode);
4183            if (!u)
4184                goto onError;
4185#if SIZEOF_WCHAR_T == 2
4186            i += wchar_offset;
4187#endif
4188            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4189            Py_DECREF(unicode);
4190            unicode = new_unicode;
4191            kind = 0;
4192            data = PyUnicode_AS_UNICODE(new_unicode);
4193            assert(data != NULL);
4194        }
4195        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4196        if (unicode_decode_call_errorhandler(
4197                errors, &errorHandler,
4198                "utf8", errmsg,
4199                &starts, &e, &startinpos, &endinpos, &exc, &s,
4200                &unicode, &i, &error_outptr))
4201            goto onError;
4202        /* Update data because unicode_decode_call_errorhandler might have
4203           re-created or resized the unicode object. */
4204        data = PyUnicode_AS_UNICODE(unicode);
4205        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4206    }
4207    /* Ensure the unicode_size calculation above was correct: */
4208    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4209
4210    if (consumed)
4211        *consumed = s-starts;
4212
4213    /* Adjust length and ready string when it contained errors and
4214       is of the old resizable kind. */
4215    if (kind == PyUnicode_WCHAR_KIND) {
4216        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4217            goto onError;
4218    }
4219
4220    Py_XDECREF(errorHandler);
4221    Py_XDECREF(exc);
4222    if (_PyUnicode_READY_REPLACE(&unicode)) {
4223        Py_DECREF(unicode);
4224        return NULL;
4225    }
4226    return (PyObject *)unicode;
4227
4228  onError:
4229    Py_XDECREF(errorHandler);
4230    Py_XDECREF(exc);
4231    Py_DECREF(unicode);
4232    return NULL;
4233}
4234
/* The write helper is only needed by the UTF-8 decoder above. */
#undef WRITE_FLEXIBLE_OR_WSTR
4236
4237#ifdef __APPLE__
4238
4239/* Simplified UTF-8 decoder using surrogateescape error handler,
4240   used to decode the command line arguments on Mac OS X. */
4241
4242wchar_t*
4243_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4244{
4245    int n;
4246    const char *e;
4247    wchar_t *unicode, *p;
4248
4249    /* Note: size will always be longer than the resulting Unicode
4250       character count */
4251    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4252        PyErr_NoMemory();
4253        return NULL;
4254    }
4255    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4256    if (!unicode)
4257        return NULL;
4258
4259    /* Unpack UTF-8 encoded data */
4260    p = unicode;
4261    e = s + size;
4262    while (s < e) {
4263        Py_UCS4 ch = (unsigned char)*s;
4264
4265        if (ch < 0x80) {
4266            *p++ = (wchar_t)ch;
4267            s++;
4268            continue;
4269        }
4270
4271        n = utf8_code_length[ch];
4272        if (s + n > e) {
4273            goto surrogateescape;
4274        }
4275
4276        switch (n) {
4277        case 0:
4278        case 1:
4279            goto surrogateescape;
4280
4281        case 2:
4282            if ((s[1] & 0xc0) != 0x80)
4283                goto surrogateescape;
4284            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4285            assert ((ch > 0x007F) && (ch <= 0x07FF));
4286            *p++ = (wchar_t)ch;
4287            break;
4288
4289        case 3:
4290            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4291               will result in surrogates in range d800-dfff. Surrogates are
4292               not valid UTF-8 so they are rejected.
4293               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4294               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4295            if ((s[1] & 0xc0) != 0x80 ||
4296                (s[2] & 0xc0) != 0x80 ||
4297                ((unsigned char)s[0] == 0xE0 &&
4298                 (unsigned char)s[1] < 0xA0) ||
4299                ((unsigned char)s[0] == 0xED &&
4300                 (unsigned char)s[1] > 0x9F)) {
4301
4302                goto surrogateescape;
4303            }
4304            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4305            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4306            *p++ = (wchar_t)ch;
4307            break;
4308
4309        case 4:
4310            if ((s[1] & 0xc0) != 0x80 ||
4311                (s[2] & 0xc0) != 0x80 ||
4312                (s[3] & 0xc0) != 0x80 ||
4313                ((unsigned char)s[0] == 0xF0 &&
4314                 (unsigned char)s[1] < 0x90) ||
4315                ((unsigned char)s[0] == 0xF4 &&
4316                 (unsigned char)s[1] > 0x8F)) {
4317                goto surrogateescape;
4318            }
4319            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4320                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4321            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4322
4323#if SIZEOF_WCHAR_T == 4
4324            *p++ = (wchar_t)ch;
4325#else
4326            /*  compute and append the two surrogates: */
4327
4328            /*  translate from 10000..10FFFF to 0..FFFF */
4329            ch -= 0x10000;
4330
4331            /*  high surrogate = top 10 bits added to D800 */
4332            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4333
4334            /*  low surrogate = bottom 10 bits added to DC00 */
4335            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4336#endif
4337            break;
4338        }
4339        s += n;
4340        continue;
4341
4342      surrogateescape:
4343        *p++ = 0xDC00 + ch;
4344        s++;
4345    }
4346    *p = L'\0';
4347    return unicode;
4348}
4349
4350#endif /* __APPLE__ */
4351
4352/* Primary internal function which creates utf8 encoded bytes objects.
4353
4354   Allocation strategy:  if the string is short, convert into a stack buffer
4355   and allocate exactly as much space needed at the end.  Else allocate the
4356   maximum possible needed (4 result bytes per Unicode character), and return
4357   the excess memory at the end.
4358*/
4359PyObject *
4360_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4361{
4362#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4363
4364    Py_ssize_t i;                /* index into s of next input byte */
4365    PyObject *result;            /* result string object */
4366    char *p;                     /* next free byte in output buffer */
4367    Py_ssize_t nallocated;      /* number of result bytes allocated */
4368    Py_ssize_t nneeded;            /* number of result bytes needed */
4369    char stackbuf[MAX_SHORT_UNICHARS * 4];
4370    PyObject *errorHandler = NULL;
4371    PyObject *exc = NULL;
4372    int kind;
4373    void *data;
4374    Py_ssize_t size;
4375    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4376#if SIZEOF_WCHAR_T == 2
4377    Py_ssize_t wchar_offset = 0;
4378#endif
4379
4380    if (!PyUnicode_Check(unicode)) {
4381        PyErr_BadArgument();
4382        return NULL;
4383    }
4384
4385    if (PyUnicode_READY(unicode) == -1)
4386        return NULL;
4387
4388    if (PyUnicode_UTF8(unicode))
4389        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4390                                         PyUnicode_UTF8_LENGTH(unicode));
4391
4392    kind = PyUnicode_KIND(unicode);
4393    data = PyUnicode_DATA(unicode);
4394    size = PyUnicode_GET_LENGTH(unicode);
4395
4396    assert(size >= 0);
4397
4398    if (size <= MAX_SHORT_UNICHARS) {
4399        /* Write into the stack buffer; nallocated can't overflow.
4400         * At the end, we'll allocate exactly as much heap space as it
4401         * turns out we need.
4402         */
4403        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4404        result = NULL;   /* will allocate after we're done */
4405        p = stackbuf;
4406    }
4407    else {
4408        /* Overallocate on the heap, and give the excess back at the end. */
4409        nallocated = size * 4;
4410        if (nallocated / 4 != size)  /* overflow! */
4411            return PyErr_NoMemory();
4412        result = PyBytes_FromStringAndSize(NULL, nallocated);
4413        if (result == NULL)
4414            return NULL;
4415        p = PyBytes_AS_STRING(result);
4416    }
4417
4418    for (i = 0; i < size;) {
4419        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4420
4421        if (ch < 0x80)
4422            /* Encode ASCII */
4423            *p++ = (char) ch;
4424
4425        else if (ch < 0x0800) {
4426            /* Encode Latin-1 */
4427            *p++ = (char)(0xc0 | (ch >> 6));
4428            *p++ = (char)(0x80 | (ch & 0x3f));
4429        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4430            Py_ssize_t newpos;
4431            PyObject *rep;
4432            Py_ssize_t repsize, k, startpos;
4433            startpos = i-1;
4434#if SIZEOF_WCHAR_T == 2
4435            startpos += wchar_offset;
4436#endif
4437            rep = unicode_encode_call_errorhandler(
4438                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4439                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4440                  &exc, startpos, startpos+1, &newpos);
4441            if (!rep)
4442                goto error;
4443
4444            if (PyBytes_Check(rep))
4445                repsize = PyBytes_GET_SIZE(rep);
4446            else
4447                repsize = PyUnicode_GET_SIZE(rep);
4448
4449            if (repsize > 4) {
4450                Py_ssize_t offset;
4451
4452                if (result == NULL)
4453                    offset = p - stackbuf;
4454                else
4455                    offset = p - PyBytes_AS_STRING(result);
4456
4457                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4458                    /* integer overflow */
4459                    PyErr_NoMemory();
4460                    goto error;
4461                }
4462                nallocated += repsize - 4;
4463                if (result != NULL) {
4464                    if (_PyBytes_Resize(&result, nallocated) < 0)
4465                        goto error;
4466                } else {
4467                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4468                    if (result == NULL)
4469                        goto error;
4470                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4471                }
4472                p = PyBytes_AS_STRING(result) + offset;
4473            }
4474
4475            if (PyBytes_Check(rep)) {
4476                char *prep = PyBytes_AS_STRING(rep);
4477                for(k = repsize; k > 0; k--)
4478                    *p++ = *prep++;
4479            } else /* rep is unicode */ {
4480                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4481                Py_UNICODE c;
4482
4483                for(k=0; k<repsize; k++) {
4484                    c = prep[k];
4485                    if (0x80 <= c) {
4486                        raise_encode_exception(&exc, "utf-8",
4487                                               PyUnicode_AS_UNICODE(unicode),
4488                                               size, i-1, i,
4489                                               "surrogates not allowed");
4490                        goto error;
4491                    }
4492                    *p++ = (char)prep[k];
4493                }
4494            }
4495            Py_DECREF(rep);
4496        } else if (ch < 0x10000) {
4497            *p++ = (char)(0xe0 | (ch >> 12));
4498            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4499            *p++ = (char)(0x80 | (ch & 0x3f));
4500        } else /* ch >= 0x10000 */ {
4501            /* Encode UCS4 Unicode ordinals */
4502            *p++ = (char)(0xf0 | (ch >> 18));
4503            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4504            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4505            *p++ = (char)(0x80 | (ch & 0x3f));
4506#if SIZEOF_WCHAR_T == 2
4507            wchar_offset++;
4508#endif
4509        }
4510    }
4511
4512    if (result == NULL) {
4513        /* This was stack allocated. */
4514        nneeded = p - stackbuf;
4515        assert(nneeded <= nallocated);
4516        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4517    }
4518    else {
4519        /* Cut back to size actually needed. */
4520        nneeded = p - PyBytes_AS_STRING(result);
4521        assert(nneeded <= nallocated);
4522        _PyBytes_Resize(&result, nneeded);
4523    }
4524
4525    Py_XDECREF(errorHandler);
4526    Py_XDECREF(exc);
4527    return result;
4528 error:
4529    Py_XDECREF(errorHandler);
4530    Py_XDECREF(exc);
4531    Py_XDECREF(result);
4532    return NULL;
4533
4534#undef MAX_SHORT_UNICHARS
4535}
4536
4537PyObject *
4538PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4539                     Py_ssize_t size,
4540                     const char *errors)
4541{
4542    PyObject *v, *unicode;
4543
4544    unicode = PyUnicode_FromUnicode(s, size);
4545    if (unicode == NULL)
4546        return NULL;
4547    v = _PyUnicode_AsUTF8String(unicode, errors);
4548    Py_DECREF(unicode);
4549    return v;
4550}
4551
4552PyObject *
4553PyUnicode_AsUTF8String(PyObject *unicode)
4554{
4555    return _PyUnicode_AsUTF8String(unicode, NULL);
4556}
4557
4558/* --- UTF-32 Codec ------------------------------------------------------- */
4559
4560PyObject *
4561PyUnicode_DecodeUTF32(const char *s,
4562                      Py_ssize_t size,
4563                      const char *errors,
4564                      int *byteorder)
4565{
4566    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4567}
4568
4569PyObject *
4570PyUnicode_DecodeUTF32Stateful(const char *s,
4571                              Py_ssize_t size,
4572                              const char *errors,
4573                              int *byteorder,
4574                              Py_ssize_t *consumed)
4575{
4576    const char *starts = s;
4577    Py_ssize_t startinpos;
4578    Py_ssize_t endinpos;
4579    Py_ssize_t outpos;
4580    PyUnicodeObject *unicode;
4581    Py_UNICODE *p;
4582#ifndef Py_UNICODE_WIDE
4583    int pairs = 0;
4584    const unsigned char *qq;
4585#else
4586    const int pairs = 0;
4587#endif
4588    const unsigned char *q, *e;
4589    int bo = 0;       /* assume native ordering by default */
4590    const char *errmsg = "";
4591    /* Offsets from q for retrieving bytes in the right order. */
4592#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4593    int iorder[] = {0, 1, 2, 3};
4594#else
4595    int iorder[] = {3, 2, 1, 0};
4596#endif
4597    PyObject *errorHandler = NULL;
4598    PyObject *exc = NULL;
4599
4600    q = (unsigned char *)s;
4601    e = q + size;
4602
4603    if (byteorder)
4604        bo = *byteorder;
4605
4606    /* Check for BOM marks (U+FEFF) in the input and adjust current
4607       byte order setting accordingly. In native mode, the leading BOM
4608       mark is skipped, in all other modes, it is copied to the output
4609       stream as-is (giving a ZWNBSP character). */
4610    if (bo == 0) {
4611        if (size >= 4) {
4612            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4613                (q[iorder[1]] << 8) | q[iorder[0]];
4614#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4615            if (bom == 0x0000FEFF) {
4616                q += 4;
4617                bo = -1;
4618            }
4619            else if (bom == 0xFFFE0000) {
4620                q += 4;
4621                bo = 1;
4622            }
4623#else
4624            if (bom == 0x0000FEFF) {
4625                q += 4;
4626                bo = 1;
4627            }
4628            else if (bom == 0xFFFE0000) {
4629                q += 4;
4630                bo = -1;
4631            }
4632#endif
4633        }
4634    }
4635
4636    if (bo == -1) {
4637        /* force LE */
4638        iorder[0] = 0;
4639        iorder[1] = 1;
4640        iorder[2] = 2;
4641        iorder[3] = 3;
4642    }
4643    else if (bo == 1) {
4644        /* force BE */
4645        iorder[0] = 3;
4646        iorder[1] = 2;
4647        iorder[2] = 1;
4648        iorder[3] = 0;
4649    }
4650
4651    /* On narrow builds we split characters outside the BMP into two
4652       codepoints => count how much extra space we need. */
4653#ifndef Py_UNICODE_WIDE
4654    for (qq = q; qq < e; qq += 4)
4655        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4656            pairs++;
4657#endif
4658
4659    /* This might be one to much, because of a BOM */
4660    unicode = _PyUnicode_New((size+3)/4+pairs);
4661    if (!unicode)
4662        return NULL;
4663    if (size == 0)
4664        return (PyObject *)unicode;
4665
4666    /* Unpack UTF-32 encoded data */
4667    p = PyUnicode_AS_UNICODE(unicode);
4668
4669    while (q < e) {
4670        Py_UCS4 ch;
4671        /* remaining bytes at the end? (size should be divisible by 4) */
4672        if (e-q<4) {
4673            if (consumed)
4674                break;
4675            errmsg = "truncated data";
4676            startinpos = ((const char *)q)-starts;
4677            endinpos = ((const char *)e)-starts;
4678            goto utf32Error;
4679            /* The remaining input chars are ignored if the callback
4680               chooses to skip the input */
4681        }
4682        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4683            (q[iorder[1]] << 8) | q[iorder[0]];
4684
4685        if (ch >= 0x110000)
4686        {
4687            errmsg = "codepoint not in range(0x110000)";
4688            startinpos = ((const char *)q)-starts;
4689            endinpos = startinpos+4;
4690            goto utf32Error;
4691        }
4692#ifndef Py_UNICODE_WIDE
4693        if (ch >= 0x10000)
4694        {
4695            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4696            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4697        }
4698        else
4699#endif
4700            *p++ = ch;
4701        q += 4;
4702        continue;
4703      utf32Error:
4704        outpos = p-PyUnicode_AS_UNICODE(unicode);
4705        if (unicode_decode_call_errorhandler(
4706                errors, &errorHandler,
4707                "utf32", errmsg,
4708                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4709                &unicode, &outpos, &p))
4710            goto onError;
4711    }
4712
4713    if (byteorder)
4714        *byteorder = bo;
4715
4716    if (consumed)
4717        *consumed = (const char *)q-starts;
4718
4719    /* Adjust length */
4720    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4721        goto onError;
4722
4723    Py_XDECREF(errorHandler);
4724    Py_XDECREF(exc);
4725    if (_PyUnicode_READY_REPLACE(&unicode)) {
4726        Py_DECREF(unicode);
4727        return NULL;
4728    }
4729    return (PyObject *)unicode;
4730
4731  onError:
4732    Py_DECREF(unicode);
4733    Py_XDECREF(errorHandler);
4734    Py_XDECREF(exc);
4735    return NULL;
4736}
4737
4738PyObject *
4739PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4740                      Py_ssize_t size,
4741                      const char *errors,
4742                      int byteorder)
4743{
4744    PyObject *v;
4745    unsigned char *p;
4746    Py_ssize_t nsize, bytesize;
4747#ifndef Py_UNICODE_WIDE
4748    Py_ssize_t i, pairs;
4749#else
4750    const int pairs = 0;
4751#endif
4752    /* Offsets from p for storing byte pairs in the right order. */
4753#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4754    int iorder[] = {0, 1, 2, 3};
4755#else
4756    int iorder[] = {3, 2, 1, 0};
4757#endif
4758
4759#define STORECHAR(CH)                           \
4760    do {                                        \
4761        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4762        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4763        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4764        p[iorder[0]] = (CH) & 0xff;             \
4765        p += 4;                                 \
4766    } while(0)
4767
4768    /* In narrow builds we can output surrogate pairs as one codepoint,
4769       so we need less space. */
4770#ifndef Py_UNICODE_WIDE
4771    for (i = pairs = 0; i < size-1; i++)
4772        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4773            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4774            pairs++;
4775#endif
4776    nsize = (size - pairs + (byteorder == 0));
4777    bytesize = nsize * 4;
4778    if (bytesize / 4 != nsize)
4779        return PyErr_NoMemory();
4780    v = PyBytes_FromStringAndSize(NULL, bytesize);
4781    if (v == NULL)
4782        return NULL;
4783
4784    p = (unsigned char *)PyBytes_AS_STRING(v);
4785    if (byteorder == 0)
4786        STORECHAR(0xFEFF);
4787    if (size == 0)
4788        goto done;
4789
4790    if (byteorder == -1) {
4791        /* force LE */
4792        iorder[0] = 0;
4793        iorder[1] = 1;
4794        iorder[2] = 2;
4795        iorder[3] = 3;
4796    }
4797    else if (byteorder == 1) {
4798        /* force BE */
4799        iorder[0] = 3;
4800        iorder[1] = 2;
4801        iorder[2] = 1;
4802        iorder[3] = 0;
4803    }
4804
4805    while (size-- > 0) {
4806        Py_UCS4 ch = *s++;
4807#ifndef Py_UNICODE_WIDE
4808        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4809            Py_UCS4 ch2 = *s;
4810            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4811                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4812                s++;
4813                size--;
4814            }
4815        }
4816#endif
4817        STORECHAR(ch);
4818    }
4819
4820  done:
4821    return v;
4822#undef STORECHAR
4823}
4824
4825PyObject *
4826PyUnicode_AsUTF32String(PyObject *unicode)
4827{
4828    if (!PyUnicode_Check(unicode)) {
4829        PyErr_BadArgument();
4830        return NULL;
4831    }
4832    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
4833                                 PyUnicode_GET_SIZE(unicode),
4834                                 NULL,
4835                                 0);
4836}
4837
4838/* --- UTF-16 Codec ------------------------------------------------------- */
4839
4840PyObject *
4841PyUnicode_DecodeUTF16(const char *s,
4842                      Py_ssize_t size,
4843                      const char *errors,
4844                      int *byteorder)
4845{
4846    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4847}
4848
4849/* Two masks for fast checking of whether a C 'long' may contain
4850   UTF16-encoded surrogate characters. This is an efficient heuristic,
4851   assuming that non-surrogate characters with a code point >= 0x8000 are
4852   rare in most input.
4853   FAST_CHAR_MASK is used when the input is in native byte ordering,
4854   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
4855*/
4856#if (SIZEOF_LONG == 8)
4857# define FAST_CHAR_MASK         0x8000800080008000L
4858# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4859#elif (SIZEOF_LONG == 4)
4860# define FAST_CHAR_MASK         0x80008000L
4861# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4862#else
4863# error C 'long' size should be either 4 or 8!
4864#endif
4865
4866PyObject *
4867PyUnicode_DecodeUTF16Stateful(const char *s,
4868                              Py_ssize_t size,
4869                              const char *errors,
4870                              int *byteorder,
4871                              Py_ssize_t *consumed)
4872{
4873    const char *starts = s;
4874    Py_ssize_t startinpos;
4875    Py_ssize_t endinpos;
4876    Py_ssize_t outpos;
4877    PyUnicodeObject *unicode;
4878    Py_UNICODE *p;
4879    const unsigned char *q, *e, *aligned_end;
4880    int bo = 0;       /* assume native ordering by default */
4881    int native_ordering = 0;
4882    const char *errmsg = "";
4883    /* Offsets from q for retrieving byte pairs in the right order. */
4884#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4885    int ihi = 1, ilo = 0;
4886#else
4887    int ihi = 0, ilo = 1;
4888#endif
4889    PyObject *errorHandler = NULL;
4890    PyObject *exc = NULL;
4891
4892    /* Note: size will always be longer than the resulting Unicode
4893       character count */
4894    unicode = _PyUnicode_New(size);
4895    if (!unicode)
4896        return NULL;
4897    if (size == 0)
4898        return (PyObject *)unicode;
4899
4900    /* Unpack UTF-16 encoded data */
4901    p = PyUnicode_AS_UNICODE(unicode);
4902    q = (unsigned char *)s;
4903    e = q + size - 1;
4904
4905    if (byteorder)
4906        bo = *byteorder;
4907
4908    /* Check for BOM marks (U+FEFF) in the input and adjust current
4909       byte order setting accordingly. In native mode, the leading BOM
4910       mark is skipped, in all other modes, it is copied to the output
4911       stream as-is (giving a ZWNBSP character). */
4912    if (bo == 0) {
4913        if (size >= 2) {
4914            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
4915#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4916            if (bom == 0xFEFF) {
4917                q += 2;
4918                bo = -1;
4919            }
4920            else if (bom == 0xFFFE) {
4921                q += 2;
4922                bo = 1;
4923            }
4924#else
4925            if (bom == 0xFEFF) {
4926                q += 2;
4927                bo = 1;
4928            }
4929            else if (bom == 0xFFFE) {
4930                q += 2;
4931                bo = -1;
4932            }
4933#endif
4934        }
4935    }
4936
4937    if (bo == -1) {
4938        /* force LE */
4939        ihi = 1;
4940        ilo = 0;
4941    }
4942    else if (bo == 1) {
4943        /* force BE */
4944        ihi = 0;
4945        ilo = 1;
4946    }
4947#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4948    native_ordering = ilo < ihi;
4949#else
4950    native_ordering = ilo > ihi;
4951#endif
4952
4953    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
4954    while (q < e) {
4955        Py_UNICODE ch;
4956        /* First check for possible aligned read of a C 'long'. Unaligned
4957           reads are more expensive, better to defer to another iteration. */
4958        if (!((size_t) q & LONG_PTR_MASK)) {
4959            /* Fast path for runs of non-surrogate chars. */
4960            register const unsigned char *_q = q;
4961            Py_UNICODE *_p = p;
4962            if (native_ordering) {
4963                /* Native ordering is simple: as long as the input cannot
4964                   possibly contain a surrogate char, do an unrolled copy
4965                   of several 16-bit code points to the target object.
4966                   The non-surrogate check is done on several input bytes
4967                   at a time (as many as a C 'long' can contain). */
4968                while (_q < aligned_end) {
4969                    unsigned long data = * (unsigned long *) _q;
4970                    if (data & FAST_CHAR_MASK)
4971                        break;
4972                    _p[0] = ((unsigned short *) _q)[0];
4973                    _p[1] = ((unsigned short *) _q)[1];
4974#if (SIZEOF_LONG == 8)
4975                    _p[2] = ((unsigned short *) _q)[2];
4976                    _p[3] = ((unsigned short *) _q)[3];
4977#endif
4978                    _q += SIZEOF_LONG;
4979                    _p += SIZEOF_LONG / 2;
4980                }
4981            }
4982            else {
4983                /* Byteswapped ordering is similar, but we must decompose
4984                   the copy bytewise, and take care of zero'ing out the
4985                   upper bytes if the target object is in 32-bit units
4986                   (that is, in UCS-4 builds). */
4987                while (_q < aligned_end) {
4988                    unsigned long data = * (unsigned long *) _q;
4989                    if (data & SWAPPED_FAST_CHAR_MASK)
4990                        break;
4991                    /* Zero upper bytes in UCS-4 builds */
4992#if (Py_UNICODE_SIZE > 2)
4993                    _p[0] = 0;
4994                    _p[1] = 0;
4995#if (SIZEOF_LONG == 8)
4996                    _p[2] = 0;
4997                    _p[3] = 0;
4998#endif
4999#endif
5000                    /* Issue #4916; UCS-4 builds on big endian machines must
5001                       fill the two last bytes of each 4-byte unit. */
5002#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5003# define OFF 2
5004#else
5005# define OFF 0
5006#endif
5007                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5008                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5009                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5010                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5011#if (SIZEOF_LONG == 8)
5012                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5013                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5014                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5015                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5016#endif
5017#undef OFF
5018                    _q += SIZEOF_LONG;
5019                    _p += SIZEOF_LONG / 2;
5020                }
5021            }
5022            p = _p;
5023            q = _q;
5024            if (q >= e)
5025                break;
5026        }
5027        ch = (q[ihi] << 8) | q[ilo];
5028
5029        q += 2;
5030
5031        if (ch < 0xD800 || ch > 0xDFFF) {
5032            *p++ = ch;
5033            continue;
5034        }
5035
5036        /* UTF-16 code pair: */
5037        if (q > e) {
5038            errmsg = "unexpected end of data";
5039            startinpos = (((const char *)q) - 2) - starts;
5040            endinpos = ((const char *)e) + 1 - starts;
5041            goto utf16Error;
5042        }
5043        if (0xD800 <= ch && ch <= 0xDBFF) {
5044            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5045            q += 2;
5046            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5047#ifndef Py_UNICODE_WIDE
5048                *p++ = ch;
5049                *p++ = ch2;
5050#else
5051                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5052#endif
5053                continue;
5054            }
5055            else {
5056                errmsg = "illegal UTF-16 surrogate";
5057                startinpos = (((const char *)q)-4)-starts;
5058                endinpos = startinpos+2;
5059                goto utf16Error;
5060            }
5061
5062        }
5063        errmsg = "illegal encoding";
5064        startinpos = (((const char *)q)-2)-starts;
5065        endinpos = startinpos+2;
5066        /* Fall through to report the error */
5067
5068      utf16Error:
5069        outpos = p - PyUnicode_AS_UNICODE(unicode);
5070        if (unicode_decode_call_errorhandler(
5071                errors,
5072                &errorHandler,
5073                "utf16", errmsg,
5074                &starts,
5075                (const char **)&e,
5076                &startinpos,
5077                &endinpos,
5078                &exc,
5079                (const char **)&q,
5080                &unicode,
5081                &outpos,
5082                &p))
5083            goto onError;
5084    }
5085    /* remaining byte at the end? (size should be even) */
5086    if (e == q) {
5087        if (!consumed) {
5088            errmsg = "truncated data";
5089            startinpos = ((const char *)q) - starts;
5090            endinpos = ((const char *)e) + 1 - starts;
5091            outpos = p - PyUnicode_AS_UNICODE(unicode);
5092            if (unicode_decode_call_errorhandler(
5093                    errors,
5094                    &errorHandler,
5095                    "utf16", errmsg,
5096                    &starts,
5097                    (const char **)&e,
5098                    &startinpos,
5099                    &endinpos,
5100                    &exc,
5101                    (const char **)&q,
5102                    &unicode,
5103                    &outpos,
5104                    &p))
5105                goto onError;
5106            /* The remaining input chars are ignored if the callback
5107               chooses to skip the input */
5108        }
5109    }
5110
5111    if (byteorder)
5112        *byteorder = bo;
5113
5114    if (consumed)
5115        *consumed = (const char *)q-starts;
5116
5117    /* Adjust length */
5118    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5119        goto onError;
5120
5121    Py_XDECREF(errorHandler);
5122    Py_XDECREF(exc);
5123    if (_PyUnicode_READY_REPLACE(&unicode)) {
5124        Py_DECREF(unicode);
5125        return NULL;
5126    }
5127    return (PyObject *)unicode;
5128
5129  onError:
5130    Py_DECREF(unicode);
5131    Py_XDECREF(errorHandler);
5132    Py_XDECREF(exc);
5133    return NULL;
5134}
5135
5136#undef FAST_CHAR_MASK
5137#undef SWAPPED_FAST_CHAR_MASK
5138
5139PyObject *
5140PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5141                      Py_ssize_t size,
5142                      const char *errors,
5143                      int byteorder)
5144{
5145    PyObject *v;
5146    unsigned char *p;
5147    Py_ssize_t nsize, bytesize;
5148#ifdef Py_UNICODE_WIDE
5149    Py_ssize_t i, pairs;
5150#else
5151    const int pairs = 0;
5152#endif
5153    /* Offsets from p for storing byte pairs in the right order. */
5154#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5155    int ihi = 1, ilo = 0;
5156#else
5157    int ihi = 0, ilo = 1;
5158#endif
5159
5160#define STORECHAR(CH)                           \
5161    do {                                        \
5162        p[ihi] = ((CH) >> 8) & 0xff;            \
5163        p[ilo] = (CH) & 0xff;                   \
5164        p += 2;                                 \
5165    } while(0)
5166
5167#ifdef Py_UNICODE_WIDE
5168    for (i = pairs = 0; i < size; i++)
5169        if (s[i] >= 0x10000)
5170            pairs++;
5171#endif
5172    /* 2 * (size + pairs + (byteorder == 0)) */
5173    if (size > PY_SSIZE_T_MAX ||
5174        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5175        return PyErr_NoMemory();
5176    nsize = size + pairs + (byteorder == 0);
5177    bytesize = nsize * 2;
5178    if (bytesize / 2 != nsize)
5179        return PyErr_NoMemory();
5180    v = PyBytes_FromStringAndSize(NULL, bytesize);
5181    if (v == NULL)
5182        return NULL;
5183
5184    p = (unsigned char *)PyBytes_AS_STRING(v);
5185    if (byteorder == 0)
5186        STORECHAR(0xFEFF);
5187    if (size == 0)
5188        goto done;
5189
5190    if (byteorder == -1) {
5191        /* force LE */
5192        ihi = 1;
5193        ilo = 0;
5194    }
5195    else if (byteorder == 1) {
5196        /* force BE */
5197        ihi = 0;
5198        ilo = 1;
5199    }
5200
5201    while (size-- > 0) {
5202        Py_UNICODE ch = *s++;
5203        Py_UNICODE ch2 = 0;
5204#ifdef Py_UNICODE_WIDE
5205        if (ch >= 0x10000) {
5206            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5207            ch  = 0xD800 | ((ch-0x10000) >> 10);
5208        }
5209#endif
5210        STORECHAR(ch);
5211        if (ch2)
5212            STORECHAR(ch2);
5213    }
5214
5215  done:
5216    return v;
5217#undef STORECHAR
5218}
5219
5220PyObject *
5221PyUnicode_AsUTF16String(PyObject *unicode)
5222{
5223    if (!PyUnicode_Check(unicode)) {
5224        PyErr_BadArgument();
5225        return NULL;
5226    }
5227    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5228                                 PyUnicode_GET_SIZE(unicode),
5229                                 NULL,
5230                                 0);
5231}
5232
5233/* --- Unicode Escape Codec ----------------------------------------------- */
5234
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns -1 if the decoded result is not guaranteed to be pure ASCII:
   `size` is negative, a byte above 127 is present, a backslash is the
   last byte, or an octal, \x, \u, \U or \N escape occurs.  Otherwise
   returns the length of the required buffer to hold the string:
     - a plain character counts as 1,
     - backslash + newline counts as 0,
     - a recognized single-character escape counts as 1,
     - an unrecognized escape counts as 2 (backslash and character).
   */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before forming `s + size`, so that no
       pointer before the start of the buffer is ever computed. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5289
5290/* Similar to PyUnicode_WRITE but either write into wstr field
5291   or treat string as ASCII. */
5292#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
5293    do { \
5294        if ((kind) != PyUnicode_WCHAR_KIND) \
5295            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
5296        else \
5297            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
5298    } while (0)
5299
5300#define WRITE_WSTR(buf, index, value) \
5301    assert(kind == PyUnicode_WCHAR_KIND), \
5302    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5303
5304
5305static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5306
5307PyObject *
5308PyUnicode_DecodeUnicodeEscape(const char *s,
5309                              Py_ssize_t size,
5310                              const char *errors)
5311{
5312    const char *starts = s;
5313    Py_ssize_t startinpos;
5314    Py_ssize_t endinpos;
5315    int j;
5316    PyUnicodeObject *v;
5317    Py_UNICODE *p;
5318    const char *end;
5319    char* message;
5320    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5321    PyObject *errorHandler = NULL;
5322    PyObject *exc = NULL;
5323    Py_ssize_t ascii_length;
5324    Py_ssize_t i;
5325    int kind;
5326    void *data;
5327
5328    ascii_length = length_of_escaped_ascii_string(s, size);
5329
5330    /* After length_of_escaped_ascii_string() there are two alternatives,
5331       either the string is pure ASCII with named escapes like \n, etc.
5332       and we determined it's exact size (common case)
5333       or it contains \x, \u, ... escape sequences.  then we create a
5334       legacy wchar string and resize it at the end of this function. */
5335    if (ascii_length >= 0) {
5336        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5337        if (!v)
5338            goto onError;
5339        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5340        kind = PyUnicode_1BYTE_KIND;
5341        data = PyUnicode_DATA(v);
5342    }
5343    else {
5344        /* Escaped strings will always be longer than the resulting
5345           Unicode string, so we start with size here and then reduce the
5346           length after conversion to the true value.
5347           (but if the error callback returns a long replacement string
5348           we'll have to allocate more space) */
5349        v = _PyUnicode_New(size);
5350        if (!v)
5351            goto onError;
5352        kind = PyUnicode_WCHAR_KIND;
5353        data = PyUnicode_AS_UNICODE(v);
5354    }
5355
5356    if (size == 0)
5357        return (PyObject *)v;
5358    i = 0;
5359    end = s + size;
5360
5361    while (s < end) {
5362        unsigned char c;
5363        Py_UNICODE x;
5364        int digits;
5365
5366        if (kind == PyUnicode_WCHAR_KIND) {
5367            assert(i < _PyUnicode_WSTR_LENGTH(v));
5368        }
5369        else {
5370            /* The only case in which i == ascii_length is a backslash
5371               followed by a newline. */
5372            assert(i <= ascii_length);
5373        }
5374
5375        /* Non-escape characters are interpreted as Unicode ordinals */
5376        if (*s != '\\') {
5377            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5378            continue;
5379        }
5380
5381        startinpos = s-starts;
5382        /* \ - Escapes */
5383        s++;
5384        c = *s++;
5385        if (s > end)
5386            c = '\0'; /* Invalid after \ */
5387
5388        if (kind == PyUnicode_WCHAR_KIND) {
5389            assert(i < _PyUnicode_WSTR_LENGTH(v));
5390        }
5391        else {
5392            /* The only case in which i == ascii_length is a backslash
5393               followed by a newline. */
5394            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5395        }
5396
5397        switch (c) {
5398
5399            /* \x escapes */
5400        case '\n': break;
5401        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5402        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5403        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5404        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5405        /* FF */
5406        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5407        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5408        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5409        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5410        /* VT */
5411        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5412        /* BEL, not classic C */
5413        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5414
5415            /* \OOO (octal) escapes */
5416        case '0': case '1': case '2': case '3':
5417        case '4': case '5': case '6': case '7':
5418            x = s[-1] - '0';
5419            if (s < end && '0' <= *s && *s <= '7') {
5420                x = (x<<3) + *s++ - '0';
5421                if (s < end && '0' <= *s && *s <= '7')
5422                    x = (x<<3) + *s++ - '0';
5423            }
5424            WRITE_WSTR(data, i++, x);
5425            break;
5426
5427            /* hex escapes */
5428            /* \xXX */
5429        case 'x':
5430            digits = 2;
5431            message = "truncated \\xXX escape";
5432            goto hexescape;
5433
5434            /* \uXXXX */
5435        case 'u':
5436            digits = 4;
5437            message = "truncated \\uXXXX escape";
5438            goto hexescape;
5439
5440            /* \UXXXXXXXX */
5441        case 'U':
5442            digits = 8;
5443            message = "truncated \\UXXXXXXXX escape";
5444        hexescape:
5445            chr = 0;
5446            p = PyUnicode_AS_UNICODE(v) + i;
5447            if (s+digits>end) {
5448                endinpos = size;
5449                if (unicode_decode_call_errorhandler(
5450                        errors, &errorHandler,
5451                        "unicodeescape", "end of string in escape sequence",
5452                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5453                        &v, &i, &p))
5454                    goto onError;
5455                data = PyUnicode_AS_UNICODE(v);
5456                goto nextByte;
5457            }
5458            for (j = 0; j < digits; ++j) {
5459                c = (unsigned char) s[j];
5460                if (!Py_ISXDIGIT(c)) {
5461                    endinpos = (s+j+1)-starts;
5462                    p = PyUnicode_AS_UNICODE(v) + i;
5463                    if (unicode_decode_call_errorhandler(
5464                            errors, &errorHandler,
5465                            "unicodeescape", message,
5466                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5467                            &v, &i, &p))
5468                        goto onError;
5469                    data = PyUnicode_AS_UNICODE(v);
5470                    goto nextByte;
5471                }
5472                chr = (chr<<4) & ~0xF;
5473                if (c >= '0' && c <= '9')
5474                    chr += c - '0';
5475                else if (c >= 'a' && c <= 'f')
5476                    chr += 10 + c - 'a';
5477                else
5478                    chr += 10 + c - 'A';
5479            }
5480            s += j;
5481            if (chr == 0xffffffff && PyErr_Occurred())
5482                /* _decoding_error will have already written into the
5483                   target buffer. */
5484                break;
5485        store:
5486            /* when we get here, chr is a 32-bit unicode character */
5487            if (chr <= 0xffff)
5488                /* UCS-2 character */
5489                WRITE_WSTR(data, i++, chr);
5490            else if (chr <= 0x10ffff) {
5491                /* UCS-4 character. Either store directly, or as
5492                   surrogate pair. */
5493#ifdef Py_UNICODE_WIDE
5494                WRITE_WSTR(data, i++, chr);
5495#else
5496                chr -= 0x10000L;
5497                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5498                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5499#endif
5500            } else {
5501                endinpos = s-starts;
5502                p = PyUnicode_AS_UNICODE(v) + i;
5503                if (unicode_decode_call_errorhandler(
5504                        errors, &errorHandler,
5505                        "unicodeescape", "illegal Unicode character",
5506                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5507                        &v, &i, &p))
5508                    goto onError;
5509                data = PyUnicode_AS_UNICODE(v);
5510            }
5511            break;
5512
5513            /* \N{name} */
5514        case 'N':
5515            message = "malformed \\N character escape";
5516            if (ucnhash_CAPI == NULL) {
5517                /* load the unicode data module */
5518                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5519                                                PyUnicodeData_CAPSULE_NAME, 1);
5520                if (ucnhash_CAPI == NULL)
5521                    goto ucnhashError;
5522            }
5523            if (*s == '{') {
5524                const char *start = s+1;
5525                /* look for the closing brace */
5526                while (*s != '}' && s < end)
5527                    s++;
5528                if (s > start && s < end && *s == '}') {
5529                    /* found a name.  look it up in the unicode database */
5530                    message = "unknown Unicode character name";
5531                    s++;
5532                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5533                                              &chr))
5534                        goto store;
5535                }
5536            }
5537            endinpos = s-starts;
5538            p = PyUnicode_AS_UNICODE(v) + i;
5539            if (unicode_decode_call_errorhandler(
5540                    errors, &errorHandler,
5541                    "unicodeescape", message,
5542                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5543                    &v, &i, &p))
5544                goto onError;
5545            data = PyUnicode_AS_UNICODE(v);
5546            break;
5547
5548        default:
5549            if (s > end) {
5550                assert(kind == PyUnicode_WCHAR_KIND);
5551                message = "\\ at end of string";
5552                s--;
5553                endinpos = s-starts;
5554                p = PyUnicode_AS_UNICODE(v) + i;
5555                if (unicode_decode_call_errorhandler(
5556                        errors, &errorHandler,
5557                        "unicodeescape", message,
5558                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5559                        &v, &i, &p))
5560                    goto onError;
5561                data = PyUnicode_AS_UNICODE(v);
5562            }
5563            else {
5564                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5565                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5566            }
5567            break;
5568        }
5569      nextByte:
5570        ;
5571    }
5572    /* Ensure the length prediction worked in case of ASCII strings */
5573    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5574
5575    if (kind == PyUnicode_WCHAR_KIND)
5576    {
5577        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5578            goto onError;
5579    }
5580    Py_XDECREF(errorHandler);
5581    Py_XDECREF(exc);
5582    if (_PyUnicode_READY_REPLACE(&v)) {
5583        Py_DECREF(v);
5584        return NULL;
5585    }
5586    return (PyObject *)v;
5587
5588  ucnhashError:
5589    PyErr_SetString(
5590        PyExc_UnicodeError,
5591        "\\N escapes not supported (can't load unicodedata module)"
5592        );
5593    Py_XDECREF(v);
5594    Py_XDECREF(errorHandler);
5595    Py_XDECREF(exc);
5596    return NULL;
5597
5598  onError:
5599    Py_XDECREF(v);
5600    Py_XDECREF(errorHandler);
5601    Py_XDECREF(exc);
5602    return NULL;
5603}
5604
5605#undef WRITE_ASCII_OR_WSTR
5606#undef WRITE_WSTR
5607
5608/* Return a Unicode-Escape string version of the Unicode object.
5609
5610   If quotes is true, the string is enclosed in u"" or u'' quotes as
5611   appropriate.
5612
5613*/
5614
5615static const char *hexdigits = "0123456789abcdef";
5616
5617PyObject *
5618PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5619                              Py_ssize_t size)
5620{
5621    PyObject *repr;
5622    char *p;
5623
5624#ifdef Py_UNICODE_WIDE
5625    const Py_ssize_t expandsize = 10;
5626#else
5627    const Py_ssize_t expandsize = 6;
5628#endif
5629
5630    /* XXX(nnorwitz): rather than over-allocating, it would be
5631       better to choose a different scheme.  Perhaps scan the
5632       first N-chars of the string and allocate based on that size.
5633    */
5634    /* Initial allocation is based on the longest-possible unichr
5635       escape.
5636
5637       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5638       unichr, so in this case it's the longest unichr escape. In
5639       narrow (UTF-16) builds this is five chars per source unichr
5640       since there are two unichrs in the surrogate pair, so in narrow
5641       (UTF-16) builds it's not the longest unichr escape.
5642
5643       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5644       so in the narrow (UTF-16) build case it's the longest unichr
5645       escape.
5646    */
5647
5648    if (size == 0)
5649        return PyBytes_FromStringAndSize(NULL, 0);
5650
5651    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5652        return PyErr_NoMemory();
5653
5654    repr = PyBytes_FromStringAndSize(NULL,
5655                                     2
5656                                     + expandsize*size
5657                                     + 1);
5658    if (repr == NULL)
5659        return NULL;
5660
5661    p = PyBytes_AS_STRING(repr);
5662
5663    while (size-- > 0) {
5664        Py_UNICODE ch = *s++;
5665
5666        /* Escape backslashes */
5667        if (ch == '\\') {
5668            *p++ = '\\';
5669            *p++ = (char) ch;
5670            continue;
5671        }
5672
5673#ifdef Py_UNICODE_WIDE
5674        /* Map 21-bit characters to '\U00xxxxxx' */
5675        else if (ch >= 0x10000) {
5676            *p++ = '\\';
5677            *p++ = 'U';
5678            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5679            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5680            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5681            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5682            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5683            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5684            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5685            *p++ = hexdigits[ch & 0x0000000F];
5686            continue;
5687        }
5688#else
5689        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5690        else if (ch >= 0xD800 && ch < 0xDC00) {
5691            Py_UNICODE ch2;
5692            Py_UCS4 ucs;
5693
5694            ch2 = *s++;
5695            size--;
5696            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5697                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5698                *p++ = '\\';
5699                *p++ = 'U';
5700                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5701                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5702                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5703                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5704                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5705                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5706                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5707                *p++ = hexdigits[ucs & 0x0000000F];
5708                continue;
5709            }
5710            /* Fall through: isolated surrogates are copied as-is */
5711            s--;
5712            size++;
5713        }
5714#endif
5715
5716        /* Map 16-bit characters to '\uxxxx' */
5717        if (ch >= 256) {
5718            *p++ = '\\';
5719            *p++ = 'u';
5720            *p++ = hexdigits[(ch >> 12) & 0x000F];
5721            *p++ = hexdigits[(ch >> 8) & 0x000F];
5722            *p++ = hexdigits[(ch >> 4) & 0x000F];
5723            *p++ = hexdigits[ch & 0x000F];
5724        }
5725
5726        /* Map special whitespace to '\t', \n', '\r' */
5727        else if (ch == '\t') {
5728            *p++ = '\\';
5729            *p++ = 't';
5730        }
5731        else if (ch == '\n') {
5732            *p++ = '\\';
5733            *p++ = 'n';
5734        }
5735        else if (ch == '\r') {
5736            *p++ = '\\';
5737            *p++ = 'r';
5738        }
5739
5740        /* Map non-printable US ASCII to '\xhh' */
5741        else if (ch < ' ' || ch >= 0x7F) {
5742            *p++ = '\\';
5743            *p++ = 'x';
5744            *p++ = hexdigits[(ch >> 4) & 0x000F];
5745            *p++ = hexdigits[ch & 0x000F];
5746        }
5747
5748        /* Copy everything else as-is */
5749        else
5750            *p++ = (char) ch;
5751    }
5752
5753    assert(p - PyBytes_AS_STRING(repr) > 0);
5754    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5755        return NULL;
5756    return repr;
5757}
5758
5759PyObject *
5760PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5761{
5762    PyObject *s;
5763    if (!PyUnicode_Check(unicode)) {
5764        PyErr_BadArgument();
5765        return NULL;
5766    }
5767    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5768                                      PyUnicode_GET_SIZE(unicode));
5769    return s;
5770}
5771
5772/* --- Raw Unicode Escape Codec ------------------------------------------- */
5773
5774PyObject *
5775PyUnicode_DecodeRawUnicodeEscape(const char *s,
5776                                 Py_ssize_t size,
5777                                 const char *errors)
5778{
5779    const char *starts = s;
5780    Py_ssize_t startinpos;
5781    Py_ssize_t endinpos;
5782    Py_ssize_t outpos;
5783    PyUnicodeObject *v;
5784    Py_UNICODE *p;
5785    const char *end;
5786    const char *bs;
5787    PyObject *errorHandler = NULL;
5788    PyObject *exc = NULL;
5789
5790    /* Escaped strings will always be longer than the resulting
5791       Unicode string, so we start with size here and then reduce the
5792       length after conversion to the true value. (But decoding error
5793       handler might have to resize the string) */
5794    v = _PyUnicode_New(size);
5795    if (v == NULL)
5796        goto onError;
5797    if (size == 0)
5798        return (PyObject *)v;
5799    p = PyUnicode_AS_UNICODE(v);
5800    end = s + size;
5801    while (s < end) {
5802        unsigned char c;
5803        Py_UCS4 x;
5804        int i;
5805        int count;
5806
5807        /* Non-escape characters are interpreted as Unicode ordinals */
5808        if (*s != '\\') {
5809            *p++ = (unsigned char)*s++;
5810            continue;
5811        }
5812        startinpos = s-starts;
5813
5814        /* \u-escapes are only interpreted iff the number of leading
5815           backslashes if odd */
5816        bs = s;
5817        for (;s < end;) {
5818            if (*s != '\\')
5819                break;
5820            *p++ = (unsigned char)*s++;
5821        }
5822        if (((s - bs) & 1) == 0 ||
5823            s >= end ||
5824            (*s != 'u' && *s != 'U')) {
5825            continue;
5826        }
5827        p--;
5828        count = *s=='u' ? 4 : 8;
5829        s++;
5830
5831        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5832        outpos = p-PyUnicode_AS_UNICODE(v);
5833        for (x = 0, i = 0; i < count; ++i, ++s) {
5834            c = (unsigned char)*s;
5835            if (!Py_ISXDIGIT(c)) {
5836                endinpos = s-starts;
5837                if (unicode_decode_call_errorhandler(
5838                        errors, &errorHandler,
5839                        "rawunicodeescape", "truncated \\uXXXX",
5840                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5841                        &v, &outpos, &p))
5842                    goto onError;
5843                goto nextByte;
5844            }
5845            x = (x<<4) & ~0xF;
5846            if (c >= '0' && c <= '9')
5847                x += c - '0';
5848            else if (c >= 'a' && c <= 'f')
5849                x += 10 + c - 'a';
5850            else
5851                x += 10 + c - 'A';
5852        }
5853        if (x <= 0xffff)
5854            /* UCS-2 character */
5855            *p++ = (Py_UNICODE) x;
5856        else if (x <= 0x10ffff) {
5857            /* UCS-4 character. Either store directly, or as
5858               surrogate pair. */
5859#ifdef Py_UNICODE_WIDE
5860            *p++ = (Py_UNICODE) x;
5861#else
5862            x -= 0x10000L;
5863            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5864            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
5865#endif
5866        } else {
5867            endinpos = s-starts;
5868            outpos = p-PyUnicode_AS_UNICODE(v);
5869            if (unicode_decode_call_errorhandler(
5870                    errors, &errorHandler,
5871                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5872                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5873                    &v, &outpos, &p))
5874                goto onError;
5875        }
5876      nextByte:
5877        ;
5878    }
5879    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5880        goto onError;
5881    Py_XDECREF(errorHandler);
5882    Py_XDECREF(exc);
5883    if (_PyUnicode_READY_REPLACE(&v)) {
5884        Py_DECREF(v);
5885        return NULL;
5886    }
5887    return (PyObject *)v;
5888
5889  onError:
5890    Py_XDECREF(v);
5891    Py_XDECREF(errorHandler);
5892    Py_XDECREF(exc);
5893    return NULL;
5894}
5895
5896PyObject *
5897PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5898                                 Py_ssize_t size)
5899{
5900    PyObject *repr;
5901    char *p;
5902    char *q;
5903
5904#ifdef Py_UNICODE_WIDE
5905    const Py_ssize_t expandsize = 10;
5906#else
5907    const Py_ssize_t expandsize = 6;
5908#endif
5909
5910    if (size > PY_SSIZE_T_MAX / expandsize)
5911        return PyErr_NoMemory();
5912
5913    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
5914    if (repr == NULL)
5915        return NULL;
5916    if (size == 0)
5917        return repr;
5918
5919    p = q = PyBytes_AS_STRING(repr);
5920    while (size-- > 0) {
5921        Py_UNICODE ch = *s++;
5922#ifdef Py_UNICODE_WIDE
5923        /* Map 32-bit characters to '\Uxxxxxxxx' */
5924        if (ch >= 0x10000) {
5925            *p++ = '\\';
5926            *p++ = 'U';
5927            *p++ = hexdigits[(ch >> 28) & 0xf];
5928            *p++ = hexdigits[(ch >> 24) & 0xf];
5929            *p++ = hexdigits[(ch >> 20) & 0xf];
5930            *p++ = hexdigits[(ch >> 16) & 0xf];
5931            *p++ = hexdigits[(ch >> 12) & 0xf];
5932            *p++ = hexdigits[(ch >> 8) & 0xf];
5933            *p++ = hexdigits[(ch >> 4) & 0xf];
5934            *p++ = hexdigits[ch & 15];
5935        }
5936        else
5937#else
5938            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5939            if (ch >= 0xD800 && ch < 0xDC00) {
5940                Py_UNICODE ch2;
5941                Py_UCS4 ucs;
5942
5943                ch2 = *s++;
5944                size--;
5945                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5946                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5947                    *p++ = '\\';
5948                    *p++ = 'U';
5949                    *p++ = hexdigits[(ucs >> 28) & 0xf];
5950                    *p++ = hexdigits[(ucs >> 24) & 0xf];
5951                    *p++ = hexdigits[(ucs >> 20) & 0xf];
5952                    *p++ = hexdigits[(ucs >> 16) & 0xf];
5953                    *p++ = hexdigits[(ucs >> 12) & 0xf];
5954                    *p++ = hexdigits[(ucs >> 8) & 0xf];
5955                    *p++ = hexdigits[(ucs >> 4) & 0xf];
5956                    *p++ = hexdigits[ucs & 0xf];
5957                    continue;
5958                }
5959                /* Fall through: isolated surrogates are copied as-is */
5960                s--;
5961                size++;
5962            }
5963#endif
5964        /* Map 16-bit characters to '\uxxxx' */
5965        if (ch >= 256) {
5966            *p++ = '\\';
5967            *p++ = 'u';
5968            *p++ = hexdigits[(ch >> 12) & 0xf];
5969            *p++ = hexdigits[(ch >> 8) & 0xf];
5970            *p++ = hexdigits[(ch >> 4) & 0xf];
5971            *p++ = hexdigits[ch & 15];
5972        }
5973        /* Copy everything else as-is */
5974        else
5975            *p++ = (char) ch;
5976    }
5977    size = p - q;
5978
5979    assert(size > 0);
5980    if (_PyBytes_Resize(&repr, size) < 0)
5981        return NULL;
5982    return repr;
5983}
5984
5985PyObject *
5986PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5987{
5988    PyObject *s;
5989    if (!PyUnicode_Check(unicode)) {
5990        PyErr_BadArgument();
5991        return NULL;
5992    }
5993    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5994                                         PyUnicode_GET_SIZE(unicode));
5995
5996    return s;
5997}
5998
5999/* --- Unicode Internal Codec ------------------------------------------- */
6000
6001PyObject *
6002_PyUnicode_DecodeUnicodeInternal(const char *s,
6003                                 Py_ssize_t size,
6004                                 const char *errors)
6005{
6006    const char *starts = s;
6007    Py_ssize_t startinpos;
6008    Py_ssize_t endinpos;
6009    Py_ssize_t outpos;
6010    PyUnicodeObject *v;
6011    Py_UNICODE *p;
6012    const char *end;
6013    const char *reason;
6014    PyObject *errorHandler = NULL;
6015    PyObject *exc = NULL;
6016
6017#ifdef Py_UNICODE_WIDE
6018    Py_UNICODE unimax = PyUnicode_GetMax();
6019#endif
6020
6021    /* XXX overflow detection missing */
6022    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6023    if (v == NULL)
6024        goto onError;
6025    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6026       as string was created with the old API. */
6027    if (PyUnicode_GET_SIZE(v) == 0)
6028        return (PyObject *)v;
6029    p = PyUnicode_AS_UNICODE(v);
6030    end = s + size;
6031
6032    while (s < end) {
6033        memcpy(p, s, sizeof(Py_UNICODE));
6034        /* We have to sanity check the raw data, otherwise doom looms for
6035           some malformed UCS-4 data. */
6036        if (
6037#ifdef Py_UNICODE_WIDE
6038            *p > unimax || *p < 0 ||
6039#endif
6040            end-s < Py_UNICODE_SIZE
6041            )
6042        {
6043            startinpos = s - starts;
6044            if (end-s < Py_UNICODE_SIZE) {
6045                endinpos = end-starts;
6046                reason = "truncated input";
6047            }
6048            else {
6049                endinpos = s - starts + Py_UNICODE_SIZE;
6050                reason = "illegal code point (> 0x10FFFF)";
6051            }
6052            outpos = p - PyUnicode_AS_UNICODE(v);
6053            if (unicode_decode_call_errorhandler(
6054                    errors, &errorHandler,
6055                    "unicode_internal", reason,
6056                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6057                    &v, &outpos, &p)) {
6058                goto onError;
6059            }
6060        }
6061        else {
6062            p++;
6063            s += Py_UNICODE_SIZE;
6064        }
6065    }
6066
6067    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6068        goto onError;
6069    Py_XDECREF(errorHandler);
6070    Py_XDECREF(exc);
6071    if (_PyUnicode_READY_REPLACE(&v)) {
6072        Py_DECREF(v);
6073        return NULL;
6074    }
6075    return (PyObject *)v;
6076
6077  onError:
6078    Py_XDECREF(v);
6079    Py_XDECREF(errorHandler);
6080    Py_XDECREF(exc);
6081    return NULL;
6082}
6083
6084/* --- Latin-1 Codec ------------------------------------------------------ */
6085
6086PyObject *
6087PyUnicode_DecodeLatin1(const char *s,
6088                       Py_ssize_t size,
6089                       const char *errors)
6090{
6091    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6092    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6093}
6094
6095/* create or adjust a UnicodeEncodeError */
6096static void
6097make_encode_exception(PyObject **exceptionObject,
6098                      const char *encoding,
6099                      const Py_UNICODE *unicode, Py_ssize_t size,
6100                      Py_ssize_t startpos, Py_ssize_t endpos,
6101                      const char *reason)
6102{
6103    if (*exceptionObject == NULL) {
6104        *exceptionObject = PyUnicodeEncodeError_Create(
6105            encoding, unicode, size, startpos, endpos, reason);
6106    }
6107    else {
6108        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6109            goto onError;
6110        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6111            goto onError;
6112        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6113            goto onError;
6114        return;
6115      onError:
6116        Py_DECREF(*exceptionObject);
6117        *exceptionObject = NULL;
6118    }
6119}
6120
6121/* raises a UnicodeEncodeError */
6122static void
6123raise_encode_exception(PyObject **exceptionObject,
6124                       const char *encoding,
6125                       const Py_UNICODE *unicode, Py_ssize_t size,
6126                       Py_ssize_t startpos, Py_ssize_t endpos,
6127                       const char *reason)
6128{
6129    make_encode_exception(exceptionObject,
6130                          encoding, unicode, size, startpos, endpos, reason);
6131    if (*exceptionObject != NULL)
6132        PyCodec_StrictErrors(*exceptionObject);
6133}
6134
6135/* error handling callback helper:
6136   build arguments, call the callback and check the arguments,
6137   put the result into newpos and return the replacement string, which
6138   has to be freed by the caller */
6139static PyObject *
6140unicode_encode_call_errorhandler(const char *errors,
6141                                 PyObject **errorHandler,
6142                                 const char *encoding, const char *reason,
6143                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6144                                 Py_ssize_t startpos, Py_ssize_t endpos,
6145                                 Py_ssize_t *newpos)
6146{
6147    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6148
6149    PyObject *restuple;
6150    PyObject *resunicode;
6151
6152    if (*errorHandler == NULL) {
6153        *errorHandler = PyCodec_LookupError(errors);
6154        if (*errorHandler == NULL)
6155            return NULL;
6156    }
6157
6158    make_encode_exception(exceptionObject,
6159                          encoding, unicode, size, startpos, endpos, reason);
6160    if (*exceptionObject == NULL)
6161        return NULL;
6162
6163    restuple = PyObject_CallFunctionObjArgs(
6164        *errorHandler, *exceptionObject, NULL);
6165    if (restuple == NULL)
6166        return NULL;
6167    if (!PyTuple_Check(restuple)) {
6168        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6169        Py_DECREF(restuple);
6170        return NULL;
6171    }
6172    if (!PyArg_ParseTuple(restuple, argparse,
6173                          &resunicode, newpos)) {
6174        Py_DECREF(restuple);
6175        return NULL;
6176    }
6177    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6178        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6179        Py_DECREF(restuple);
6180        return NULL;
6181    }
6182    if (*newpos<0)
6183        *newpos = size+*newpos;
6184    if (*newpos<0 || *newpos>size) {
6185        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6186        Py_DECREF(restuple);
6187        return NULL;
6188    }
6189    Py_INCREF(resunicode);
6190    Py_DECREF(restuple);
6191    return resunicode;
6192}
6193
6194static PyObject *
6195unicode_encode_ucs1(const Py_UNICODE *p,
6196                    Py_ssize_t size,
6197                    const char *errors,
6198                    int limit)
6199{
6200    /* output object */
6201    PyObject *res;
6202    /* pointers to the beginning and end+1 of input */
6203    const Py_UNICODE *startp = p;
6204    const Py_UNICODE *endp = p + size;
6205    /* pointer to the beginning of the unencodable characters */
6206    /* const Py_UNICODE *badp = NULL; */
6207    /* pointer into the output */
6208    char *str;
6209    /* current output position */
6210    Py_ssize_t ressize;
6211    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6212    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6213    PyObject *errorHandler = NULL;
6214    PyObject *exc = NULL;
6215    /* the following variable is used for caching string comparisons
6216     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6217    int known_errorHandler = -1;
6218
6219    /* allocate enough for a simple encoding without
6220       replacements, if we need more, we'll resize */
6221    if (size == 0)
6222        return PyBytes_FromStringAndSize(NULL, 0);
6223    res = PyBytes_FromStringAndSize(NULL, size);
6224    if (res == NULL)
6225        return NULL;
6226    str = PyBytes_AS_STRING(res);
6227    ressize = size;
6228
6229    while (p<endp) {
6230        Py_UNICODE c = *p;
6231
6232        /* can we encode this? */
6233        if (c<limit) {
6234            /* no overflow check, because we know that the space is enough */
6235            *str++ = (char)c;
6236            ++p;
6237        }
6238        else {
6239            Py_ssize_t unicodepos = p-startp;
6240            Py_ssize_t requiredsize;
6241            PyObject *repunicode;
6242            Py_ssize_t repsize;
6243            Py_ssize_t newpos;
6244            Py_ssize_t respos;
6245            Py_UNICODE *uni2;
6246            /* startpos for collecting unencodable chars */
6247            const Py_UNICODE *collstart = p;
6248            const Py_UNICODE *collend = p;
6249            /* find all unecodable characters */
6250            while ((collend < endp) && ((*collend)>=limit))
6251                ++collend;
6252            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6253            if (known_errorHandler==-1) {
6254                if ((errors==NULL) || (!strcmp(errors, "strict")))
6255                    known_errorHandler = 1;
6256                else if (!strcmp(errors, "replace"))
6257                    known_errorHandler = 2;
6258                else if (!strcmp(errors, "ignore"))
6259                    known_errorHandler = 3;
6260                else if (!strcmp(errors, "xmlcharrefreplace"))
6261                    known_errorHandler = 4;
6262                else
6263                    known_errorHandler = 0;
6264            }
6265            switch (known_errorHandler) {
6266            case 1: /* strict */
6267                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6268                goto onError;
6269            case 2: /* replace */
6270                while (collstart++<collend)
6271                    *str++ = '?'; /* fall through */
6272            case 3: /* ignore */
6273                p = collend;
6274                break;
6275            case 4: /* xmlcharrefreplace */
6276                respos = str - PyBytes_AS_STRING(res);
6277                /* determine replacement size (temporarily (mis)uses p) */
6278                for (p = collstart, repsize = 0; p < collend; ++p) {
6279                    if (*p<10)
6280                        repsize += 2+1+1;
6281                    else if (*p<100)
6282                        repsize += 2+2+1;
6283                    else if (*p<1000)
6284                        repsize += 2+3+1;
6285                    else if (*p<10000)
6286                        repsize += 2+4+1;
6287#ifndef Py_UNICODE_WIDE
6288                    else
6289                        repsize += 2+5+1;
6290#else
6291                    else if (*p<100000)
6292                        repsize += 2+5+1;
6293                    else if (*p<1000000)
6294                        repsize += 2+6+1;
6295                    else
6296                        repsize += 2+7+1;
6297#endif
6298                }
6299                requiredsize = respos+repsize+(endp-collend);
6300                if (requiredsize > ressize) {
6301                    if (requiredsize<2*ressize)
6302                        requiredsize = 2*ressize;
6303                    if (_PyBytes_Resize(&res, requiredsize))
6304                        goto onError;
6305                    str = PyBytes_AS_STRING(res) + respos;
6306                    ressize = requiredsize;
6307                }
6308                /* generate replacement (temporarily (mis)uses p) */
6309                for (p = collstart; p < collend; ++p) {
6310                    str += sprintf(str, "&#%d;", (int)*p);
6311                }
6312                p = collend;
6313                break;
6314            default:
6315                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6316                                                              encoding, reason, startp, size, &exc,
6317                                                              collstart-startp, collend-startp, &newpos);
6318                if (repunicode == NULL)
6319                    goto onError;
6320                if (PyBytes_Check(repunicode)) {
6321                    /* Directly copy bytes result to output. */
6322                    repsize = PyBytes_Size(repunicode);
6323                    if (repsize > 1) {
6324                        /* Make room for all additional bytes. */
6325                        respos = str - PyBytes_AS_STRING(res);
6326                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6327                            Py_DECREF(repunicode);
6328                            goto onError;
6329                        }
6330                        str = PyBytes_AS_STRING(res) + respos;
6331                        ressize += repsize-1;
6332                    }
6333                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6334                    str += repsize;
6335                    p = startp + newpos;
6336                    Py_DECREF(repunicode);
6337                    break;
6338                }
6339                /* need more space? (at least enough for what we
6340                   have+the replacement+the rest of the string, so
6341                   we won't have to check space for encodable characters) */
6342                respos = str - PyBytes_AS_STRING(res);
6343                repsize = PyUnicode_GET_SIZE(repunicode);
6344                requiredsize = respos+repsize+(endp-collend);
6345                if (requiredsize > ressize) {
6346                    if (requiredsize<2*ressize)
6347                        requiredsize = 2*ressize;
6348                    if (_PyBytes_Resize(&res, requiredsize)) {
6349                        Py_DECREF(repunicode);
6350                        goto onError;
6351                    }
6352                    str = PyBytes_AS_STRING(res) + respos;
6353                    ressize = requiredsize;
6354                }
6355                /* check if there is anything unencodable in the replacement
6356                   and copy it to the output */
6357                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6358                    c = *uni2;
6359                    if (c >= limit) {
6360                        raise_encode_exception(&exc, encoding, startp, size,
6361                                               unicodepos, unicodepos+1, reason);
6362                        Py_DECREF(repunicode);
6363                        goto onError;
6364                    }
6365                    *str = (char)c;
6366                }
6367                p = startp + newpos;
6368                Py_DECREF(repunicode);
6369            }
6370        }
6371    }
6372    /* Resize if we allocated to much */
6373    size = str - PyBytes_AS_STRING(res);
6374    if (size < ressize) { /* If this falls res will be NULL */
6375        assert(size >= 0);
6376        if (_PyBytes_Resize(&res, size) < 0)
6377            goto onError;
6378    }
6379
6380    Py_XDECREF(errorHandler);
6381    Py_XDECREF(exc);
6382    return res;
6383
6384  onError:
6385    Py_XDECREF(res);
6386    Py_XDECREF(errorHandler);
6387    Py_XDECREF(exc);
6388    return NULL;
6389}
6390
/* Encode the size code units starting at p as Latin-1 bytes.

   This is unicode_encode_ucs1() with a limit of 256: code units below the
   limit are stored as single bytes, anything at or above it goes through
   the error handling selected by errors.  Returns whatever
   unicode_encode_ucs1() returns (NULL on error). */
PyObject *
PyUnicode_EncodeLatin1(const Py_UNICODE *p,
                       Py_ssize_t size,
                       const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 256);
}
6398
6399PyObject *
6400_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6401{
6402    if (!PyUnicode_Check(unicode)) {
6403        PyErr_BadArgument();
6404        return NULL;
6405    }
6406    if (PyUnicode_READY(unicode) == -1)
6407        return NULL;
6408    /* Fast path: if it is a one-byte string, construct
6409       bytes object directly. */
6410    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6411        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6412                                         PyUnicode_GET_LENGTH(unicode));
6413    /* Non-Latin-1 characters present. Defer to above function to
6414       raise the exception. */
6415    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6416                                  PyUnicode_GET_SIZE(unicode),
6417                                  errors);
6418}
6419
/* Encode a str object as Latin-1 bytes, passing NULL as the errors
   argument to _PyUnicode_AsLatin1String(). */
PyObject*
PyUnicode_AsLatin1String(PyObject *unicode)
{
    return _PyUnicode_AsLatin1String(unicode, NULL);
}
6425
6426/* --- 7-bit ASCII Codec -------------------------------------------------- */
6427
/* Decode size bytes at s as 7-bit ASCII and return a new str object.

   Bytes >= 128 are reported to the error handler named by errors through
   unicode_decode_call_errorhandler().  Returns NULL on failure. */
PyObject *
PyUnicode_DecodeASCII(const char *s,
                      Py_ssize_t size,
                      const char *errors)
{
    const char *starts = s;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    unsigned char* d;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_ssize_t i;

    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
    if (size == 1 && *(unsigned char*)s < 128)
        return PyUnicode_FromOrdinal(*(unsigned char*)s);

    /* Fast path. Assume the input actually *is* ASCII, and allocate
       a single-block Unicode object with that assumption. If there is
       an error, drop the object and start over. */
    v = (PyUnicodeObject*)PyUnicode_New(size, 127);
    if (v == NULL)
        goto onError;
    d = PyUnicode_1BYTE_DATA(v);
    /* Copy bytes until the first non-ASCII one (or the end of input). */
    for (i = 0; i < size; i++) {
        unsigned char ch = ((unsigned char*)s)[i];
        if (ch < 128)
            d[i] = ch;
        else
            break;
    }
    /* The loop ran to completion: every byte was ASCII. */
    if (i == size)
        return (PyObject*)v;
    Py_DECREF(v); /* start over */

    /* Slow path: decode into a Py_UNICODE buffer, invoking the error
       handler for each offending byte. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    /* NOTE(review): size == 0 seems to be fully handled by the fast path
       above (the copy loop ends with i == size), so this check looks
       unreachable -- confirm before removing. */
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    while (s < e) {
        register unsigned char c = (unsigned char)*s;
        if (c < 128) {
            *p++ = c;
            ++s;
        }
        else {
            /* Report exactly one offending byte, [startinpos, endinpos). */
            startinpos = s-starts;
            endinpos = startinpos + 1;
            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
            /* The helper receives pointers to s, e, v and p; presumably it
               updates them to resume decoding -- see its definition. */
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "ascii", "ordinal not in range(128)",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &v, &outpos, &p))
                goto onError;
        }
    }
    /* Trim the result if fewer code units were written than allocated. */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    if (_PyUnicode_READY_REPLACE(&v)) {
        Py_DECREF(v);
        return NULL;
    }
    return (PyObject *)v;

  onError:
    Py_XDECREF(v);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return NULL;
}
6509
/* Encode the size code units starting at p as 7-bit ASCII bytes.

   This is unicode_encode_ucs1() with a limit of 128: code units below the
   limit are stored as single bytes, anything at or above it goes through
   the error handling selected by errors.  Returns whatever
   unicode_encode_ucs1() returns (NULL on error). */
PyObject *
PyUnicode_EncodeASCII(const Py_UNICODE *p,
                      Py_ssize_t size,
                      const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 128);
}
6517
6518PyObject *
6519_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6520{
6521    if (!PyUnicode_Check(unicode)) {
6522        PyErr_BadArgument();
6523        return NULL;
6524    }
6525    if (PyUnicode_READY(unicode) == -1)
6526        return NULL;
6527    /* Fast path: if it is an ASCII-only string, construct bytes object
6528       directly. Else defer to above function to raise the exception. */
6529    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6530        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6531                                         PyUnicode_GET_LENGTH(unicode));
6532    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6533                                 PyUnicode_GET_SIZE(unicode),
6534                                 errors);
6535}
6536
/* Encode a str object as ASCII bytes, passing NULL as the errors
   argument to _PyUnicode_AsASCIIString(). */
PyObject *
PyUnicode_AsASCIIString(PyObject *unicode)
{
    return _PyUnicode_AsASCIIString(unicode, NULL);
}
6542
6543#ifdef HAVE_MBCS
6544
6545/* --- MBCS codecs for Windows -------------------------------------------- */
6546
6547#if SIZEOF_INT < SIZEOF_SIZE_T
6548#define NEED_RETRY
6549#endif
6550
6551/* XXX This code is limited to "true" double-byte encodings, as
6552   a) it assumes an incomplete character consists of a single byte, and
6553   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6554   encodings, see IsDBCSLeadByteEx documentation. */
6555
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that begins
   a character, 0 otherwise.

   A byte that IsDBCSLeadByte() accepts is only treated as a real lead
   byte when CharPrev() cannot step back from it, when the previous
   character does not start with a lead byte, or when the previous
   character is exactly two bytes long. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *pos = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*pos))
        return 0;

    before = CharPrev(s, pos);
    if (before == pos)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return pos - before == 2;
}
6567
/*
 * Decode an MBCS string into a unicode object.
 *
 * If *v is NULL a new unicode object is created; otherwise the existing
 * object is resized and the decoded text is appended to it.
 *
 * If 'final' is set, a trailing lead byte is converted too; otherwise it
 * is left out of the conversion and not counted as consumed.
 *
 * 'errors' may be NULL or "strict" (invalid input is an error) or
 * "ignore"; any other value raises ValueError.
 *
 * Returns the number of bytes consumed on success, -1 (with an exception
 * set) otherwise.
 */
static int
decode_mbcs(PyUnicodeObject **v,
            const char *s, /* MBCS string */
            int size, /* sizeof MBCS string */
            int final,
            const char *errors)
{
    Py_UNICODE *p;
    Py_ssize_t n;
    DWORD usize;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0)
        flags = MB_ERR_INVALID_CHARS;
    else if (strcmp(errors, "ignore")==0)
        flags = 0;
    else {
        PyErr_Format(PyExc_ValueError,
                     "mbcs encoding does not support errors='%s'",
                     errors);
        return -1;
    }

    /* Skip trailing lead-byte unless 'final' is set */
    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
        --size;

    /* First get the size of the result */
    if (size > 0) {
        /* A NULL output buffer with length 0 asks only for the size. */
        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
        if (usize==0)
            goto mbcs_decode_error;
    } else
        usize = 0;

    if (*v == NULL) {
        /* Create unicode object */
        *v = _PyUnicode_New(usize);
        if (*v == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend unicode object */
        n = PyUnicode_GET_SIZE(*v);
        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (usize > 0) {
        /* Write after the n code units that were already present. */
        p = PyUnicode_AS_UNICODE(*v) + n;
        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
            goto mbcs_decode_error;
        }
    }
    return size;

mbcs_decode_error:
    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
       we raise a UnicodeDecodeError - else it is a 'generic'
       windows error
     */
    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
        /* Ideally, we should get reason from FormatMessage - this
           is the Windows 2000 English version of the message
        */
        PyObject *exc = NULL;
        const char *reason = "No mapping for the Unicode character exists "
                             "in the target multi-byte code page.";
        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
        if (exc != NULL) {
            /* Hand the exception to the strict handler, then drop our
               reference to it. */
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
    } else {
        PyErr_SetFromWindowsErrWithFilename(0, NULL);
    }
    return -1;
}
6655
6656PyObject *
6657PyUnicode_DecodeMBCSStateful(const char *s,
6658                             Py_ssize_t size,
6659                             const char *errors,
6660                             Py_ssize_t *consumed)
6661{
6662    PyUnicodeObject *v = NULL;
6663    int done;
6664
6665    if (consumed)
6666        *consumed = 0;
6667
6668#ifdef NEED_RETRY
6669  retry:
6670    if (size > INT_MAX)
6671        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6672    else
6673#endif
6674        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6675
6676    if (done < 0) {
6677        Py_XDECREF(v);
6678        return NULL;
6679    }
6680
6681    if (consumed)
6682        *consumed += done;
6683
6684#ifdef NEED_RETRY
6685    if (size > INT_MAX) {
6686        s += done;
6687        size -= done;
6688        goto retry;
6689    }
6690#endif
6691    if (_PyUnicode_READY_REPLACE(&v)) {
6692        Py_DECREF(v);
6693        return NULL;
6694    }
6695    return (PyObject *)v;
6696}
6697
/* Decode size bytes of MBCS data at s in one go: calls
   PyUnicode_DecodeMBCSStateful() with consumed == NULL. */
PyObject *
PyUnicode_DecodeMBCS(const char *s,
                     Py_ssize_t size,
                     const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
6705
/*
 * Convert unicode into a bytes object (MBCS).
 *
 * If *repr is NULL a new bytes object is created; otherwise the existing
 * object is resized and the encoded bytes are appended to it.
 *
 * 'errors' may be NULL or "strict" (use of the default character is
 * reported as an encoding error) or "replace"; any other value raises
 * ValueError.
 *
 * Returns 0 on success, -1 (with an exception set) otherwise.
 */
static int
encode_mbcs(PyObject **repr,
            const Py_UNICODE *p, /* unicode */
            int size, /* size of unicode */
            const char* errors)
{
    BOOL usedDefaultChar = FALSE;
    BOOL *pusedDefaultChar;
    int mbcssize;
    Py_ssize_t n;
    PyObject *exc = NULL;
    DWORD flags;

    assert(size >= 0);

    /* check and handle 'errors' arg */
    if (errors==NULL || strcmp(errors, "strict")==0) {
        flags = WC_NO_BEST_FIT_CHARS;
        /* Ask the API to tell us when it had to substitute a character. */
        pusedDefaultChar = &usedDefaultChar;
    } else if (strcmp(errors, "replace")==0) {
        flags = 0;
        pusedDefaultChar = NULL;
    } else {
         PyErr_Format(PyExc_ValueError,
                      "mbcs encoding does not support errors='%s'",
                      errors);
         return -1;
    }

    /* First get the size of the result */
    if (size > 0) {
        /* A NULL output buffer with length 0 asks only for the size. */
        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
                                       NULL, pusedDefaultChar);
        if (mbcssize == 0) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        /* If we used a default char, then we failed! */
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    } else {
        mbcssize = 0;
    }

    if (*repr == NULL) {
        /* Create string object */
        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
        if (*repr == NULL)
            return -1;
        n = 0;
    }
    else {
        /* Extend string object */
        n = PyBytes_Size(*repr);
        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
            return -1;
    }

    /* Do the conversion */
    if (size > 0) {
        /* Write after the n bytes that were already present. */
        char *s = PyBytes_AS_STRING(*repr) + n;
        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
                                     NULL, pusedDefaultChar)) {
            PyErr_SetFromWindowsErrWithFilename(0, NULL);
            return -1;
        }
        if (pusedDefaultChar && *pusedDefaultChar)
            goto mbcs_encode_error;
    }
    return 0;

mbcs_encode_error:
    /* The whole input is reported with start == end == 0; the position of
       the offending character is not identified here. */
    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
    Py_XDECREF(exc);
    return -1;
}
6786
6787PyObject *
6788PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6789                     Py_ssize_t size,
6790                     const char *errors)
6791{
6792    PyObject *repr = NULL;
6793    int ret;
6794
6795#ifdef NEED_RETRY
6796  retry:
6797    if (size > INT_MAX)
6798        ret = encode_mbcs(&repr, p, INT_MAX, errors);
6799    else
6800#endif
6801        ret = encode_mbcs(&repr, p, (int)size, errors);
6802
6803    if (ret < 0) {
6804        Py_XDECREF(repr);
6805        return NULL;
6806    }
6807
6808#ifdef NEED_RETRY
6809    if (size > INT_MAX) {
6810        p += INT_MAX;
6811        size -= INT_MAX;
6812        goto retry;
6813    }
6814#endif
6815
6816    return repr;
6817}
6818
6819PyObject *
6820PyUnicode_AsMBCSString(PyObject *unicode)
6821{
6822    if (!PyUnicode_Check(unicode)) {
6823        PyErr_BadArgument();
6824        return NULL;
6825    }
6826    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
6827                                PyUnicode_GET_SIZE(unicode),
6828                                NULL);
6829}
6830
6831#undef NEED_RETRY
6832
6833#endif /* HAVE_MBCS */
6834
6835/* --- Character Mapping Codec -------------------------------------------- */
6836
6837PyObject *
6838PyUnicode_DecodeCharmap(const char *s,
6839                        Py_ssize_t size,
6840                        PyObject *mapping,
6841                        const char *errors)
6842{
6843    const char *starts = s;
6844    Py_ssize_t startinpos;
6845    Py_ssize_t endinpos;
6846    Py_ssize_t outpos;
6847    const char *e;
6848    PyUnicodeObject *v;
6849    Py_UNICODE *p;
6850    Py_ssize_t extrachars = 0;
6851    PyObject *errorHandler = NULL;
6852    PyObject *exc = NULL;
6853    Py_UNICODE *mapstring = NULL;
6854    Py_ssize_t maplen = 0;
6855
6856    /* Default to Latin-1 */
6857    if (mapping == NULL)
6858        return PyUnicode_DecodeLatin1(s, size, errors);
6859
6860    v = _PyUnicode_New(size);
6861    if (v == NULL)
6862        goto onError;
6863    if (size == 0)
6864        return (PyObject *)v;
6865    p = PyUnicode_AS_UNICODE(v);
6866    e = s + size;
6867    if (PyUnicode_CheckExact(mapping)) {
6868        mapstring = PyUnicode_AS_UNICODE(mapping);
6869        maplen = PyUnicode_GET_SIZE(mapping);
6870        while (s < e) {
6871            unsigned char ch = *s;
6872            Py_UNICODE x = 0xfffe; /* illegal value */
6873
6874            if (ch < maplen)
6875                x = mapstring[ch];
6876
6877            if (x == 0xfffe) {
6878                /* undefined mapping */
6879                outpos = p-PyUnicode_AS_UNICODE(v);
6880                startinpos = s-starts;
6881                endinpos = startinpos+1;
6882                if (unicode_decode_call_errorhandler(
6883                        errors, &errorHandler,
6884                        "charmap", "character maps to <undefined>",
6885                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6886                        &v, &outpos, &p)) {
6887                    goto onError;
6888                }
6889                continue;
6890            }
6891            *p++ = x;
6892            ++s;
6893        }
6894    }
6895    else {
6896        while (s < e) {
6897            unsigned char ch = *s;
6898            PyObject *w, *x;
6899
6900            /* Get mapping (char ordinal -> integer, Unicode char or None) */
6901            w = PyLong_FromLong((long)ch);
6902            if (w == NULL)
6903                goto onError;
6904            x = PyObject_GetItem(mapping, w);
6905            Py_DECREF(w);
6906            if (x == NULL) {
6907                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6908                    /* No mapping found means: mapping is undefined. */
6909                    PyErr_Clear();
6910                    x = Py_None;
6911                    Py_INCREF(x);
6912                } else
6913                    goto onError;
6914            }
6915
6916            /* Apply mapping */
6917            if (PyLong_Check(x)) {
6918                long value = PyLong_AS_LONG(x);
6919                if (value < 0 || value > 65535) {
6920                    PyErr_SetString(PyExc_TypeError,
6921                                    "character mapping must be in range(65536)");
6922                    Py_DECREF(x);
6923                    goto onError;
6924                }
6925                *p++ = (Py_UNICODE)value;
6926            }
6927            else if (x == Py_None) {
6928                /* undefined mapping */
6929                outpos = p-PyUnicode_AS_UNICODE(v);
6930                startinpos = s-starts;
6931                endinpos = startinpos+1;
6932                if (unicode_decode_call_errorhandler(
6933                        errors, &errorHandler,
6934                        "charmap", "character maps to <undefined>",
6935                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6936                        &v, &outpos, &p)) {
6937                    Py_DECREF(x);
6938                    goto onError;
6939                }
6940                Py_DECREF(x);
6941                continue;
6942            }
6943            else if (PyUnicode_Check(x)) {
6944                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
6945
6946                if (targetsize == 1)
6947                    /* 1-1 mapping */
6948                    *p++ = *PyUnicode_AS_UNICODE(x);
6949
6950                else if (targetsize > 1) {
6951                    /* 1-n mapping */
6952                    if (targetsize > extrachars) {
6953                        /* resize first */
6954                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6955                        Py_ssize_t needed = (targetsize - extrachars) + \
6956                            (targetsize << 2);
6957                        extrachars += needed;
6958                        /* XXX overflow detection missing */
6959                        if (PyUnicode_Resize((PyObject**)&v,
6960                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
6961                            Py_DECREF(x);
6962                            goto onError;
6963                        }
6964                        p = PyUnicode_AS_UNICODE(v) + oldpos;
6965                    }
6966                    Py_UNICODE_COPY(p,
6967                                    PyUnicode_AS_UNICODE(x),
6968                                    targetsize);
6969                    p += targetsize;
6970                    extrachars -= targetsize;
6971                }
6972                /* 1-0 mapping: skip the character */
6973            }
6974            else {
6975                /* wrong return value */
6976                PyErr_SetString(PyExc_TypeError,
6977                                "character mapping must return integer, None or str");
6978                Py_DECREF(x);
6979                goto onError;
6980            }
6981            Py_DECREF(x);
6982            ++s;
6983        }
6984    }
6985    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6986        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6987            goto onError;
6988    Py_XDECREF(errorHandler);
6989    Py_XDECREF(exc);
6990    if (_PyUnicode_READY_REPLACE(&v)) {
6991        Py_DECREF(v);
6992        return NULL;
6993    }
6994    return (PyObject *)v;
6995
6996  onError:
6997    Py_XDECREF(errorHandler);
6998    Py_XDECREF(exc);
6999    Py_XDECREF(v);
7000    return NULL;
7001}
7002
7003/* Charmap encoding: the lookup table */
7004
/* Three-level lookup trie used for charmap encoding of code points up to
   0xFFFF.

   The object is allocated with extra room after the struct: level23 is
   declared with one element but actually holds 16*count2 bytes of level-2
   entries followed by 128*count3 bytes of level-3 entries. */
struct encoding_map {
    PyObject_HEAD
    unsigned char level1[32];   /* indexed by c >> 11; 0xFF = no level-2 block */
    int count2, count3;         /* number of level-2 / level-3 blocks */
    unsigned char level23[1];   /* level-2 blocks, then level-3 blocks */
};
7011
7012static PyObject*
7013encoding_map_size(PyObject *obj, PyObject* args)
7014{
7015    struct encoding_map *map = (struct encoding_map*)obj;
7016    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7017                           128*map->count3);
7018}
7019
/* Method table for EncodingMap objects: a single no-argument size()
   method, terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7025
/* tp_dealloc for EncodingMap objects: the object is a single allocation
   (see PyUnicode_BuildEncodingMap), so freeing it releases everything. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
7031
/* Type object for the encoding trie built by PyUnicode_BuildEncodingMap().
   Only tp_dealloc and tp_methods are provided; tp_new is 0, and instances
   are created in C with PyObject_MALLOC + PyObject_Init. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7075
7076PyObject*
7077PyUnicode_BuildEncodingMap(PyObject* string)
7078{
7079    PyObject *result;
7080    struct encoding_map *mresult;
7081    int i;
7082    int need_dict = 0;
7083    unsigned char level1[32];
7084    unsigned char level2[512];
7085    unsigned char *mlevel1, *mlevel2, *mlevel3;
7086    int count2 = 0, count3 = 0;
7087    int kind;
7088    void *data;
7089    Py_UCS4 ch;
7090
7091    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7092        PyErr_BadArgument();
7093        return NULL;
7094    }
7095    kind = PyUnicode_KIND(string);
7096    data = PyUnicode_DATA(string);
7097    memset(level1, 0xFF, sizeof level1);
7098    memset(level2, 0xFF, sizeof level2);
7099
7100    /* If there isn't a one-to-one mapping of NULL to \0,
7101       or if there are non-BMP characters, we need to use
7102       a mapping dictionary. */
7103    if (PyUnicode_READ(kind, data, 0) != 0)
7104        need_dict = 1;
7105    for (i = 1; i < 256; i++) {
7106        int l1, l2;
7107        ch = PyUnicode_READ(kind, data, i);
7108        if (ch == 0 || ch > 0xFFFF) {
7109            need_dict = 1;
7110            break;
7111        }
7112        if (ch == 0xFFFE)
7113            /* unmapped character */
7114            continue;
7115        l1 = ch >> 11;
7116        l2 = ch >> 7;
7117        if (level1[l1] == 0xFF)
7118            level1[l1] = count2++;
7119        if (level2[l2] == 0xFF)
7120            level2[l2] = count3++;
7121    }
7122
7123    if (count2 >= 0xFF || count3 >= 0xFF)
7124        need_dict = 1;
7125
7126    if (need_dict) {
7127        PyObject *result = PyDict_New();
7128        PyObject *key, *value;
7129        if (!result)
7130            return NULL;
7131        for (i = 0; i < 256; i++) {
7132            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7133            value = PyLong_FromLong(i);
7134            if (!key || !value)
7135                goto failed1;
7136            if (PyDict_SetItem(result, key, value) == -1)
7137                goto failed1;
7138            Py_DECREF(key);
7139            Py_DECREF(value);
7140        }
7141        return result;
7142      failed1:
7143        Py_XDECREF(key);
7144        Py_XDECREF(value);
7145        Py_DECREF(result);
7146        return NULL;
7147    }
7148
7149    /* Create a three-level trie */
7150    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7151                             16*count2 + 128*count3 - 1);
7152    if (!result)
7153        return PyErr_NoMemory();
7154    PyObject_Init(result, &EncodingMapType);
7155    mresult = (struct encoding_map*)result;
7156    mresult->count2 = count2;
7157    mresult->count3 = count3;
7158    mlevel1 = mresult->level1;
7159    mlevel2 = mresult->level23;
7160    mlevel3 = mresult->level23 + 16*count2;
7161    memcpy(mlevel1, level1, 32);
7162    memset(mlevel2, 0xFF, 16*count2);
7163    memset(mlevel3, 0, 128*count3);
7164    count3 = 0;
7165    for (i = 1; i < 256; i++) {
7166        int o1, o2, o3, i2, i3;
7167        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7168            /* unmapped character */
7169            continue;
7170        o1 = PyUnicode_READ(kind, data, i)>>11;
7171        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7172        i2 = 16*mlevel1[o1] + o2;
7173        if (mlevel2[i2] == 0xFF)
7174            mlevel2[i2] = count3++;
7175        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7176        i3 = 128*mlevel2[i2] + o3;
7177        mlevel3[i3] = i;
7178    }
7179    return result;
7180}
7181
7182static int
7183encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7184{
7185    struct encoding_map *map = (struct encoding_map*)mapping;
7186    int l1 = c>>11;
7187    int l2 = (c>>7) & 0xF;
7188    int l3 = c & 0x7F;
7189    int i;
7190
7191#ifdef Py_UNICODE_WIDE
7192    if (c > 0xFFFF) {
7193        return -1;
7194    }
7195#endif
7196    if (c == 0)
7197        return 0;
7198    /* level 1*/
7199    i = map->level1[l1];
7200    if (i == 0xFF) {
7201        return -1;
7202    }
7203    /* level 2*/
7204    i = map->level23[16*i+l2];
7205    if (i == 0xFF) {
7206        return -1;
7207    }
7208    /* level 3 */
7209    i = map->level23[16*map->count2 + 128*i + l3];
7210    if (i == 0) {
7211        return -1;
7212    }
7213    return i;
7214}
7215
7216/* Lookup the character ch in the mapping. If the character
7217   can't be found, Py_None is returned (or NULL, if another
7218   error occurred). */
7219static PyObject *
7220charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7221{
7222    PyObject *w = PyLong_FromLong((long)c);
7223    PyObject *x;
7224
7225    if (w == NULL)
7226        return NULL;
7227    x = PyObject_GetItem(mapping, w);
7228    Py_DECREF(w);
7229    if (x == NULL) {
7230        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7231            /* No mapping found means: mapping is undefined. */
7232            PyErr_Clear();
7233            x = Py_None;
7234            Py_INCREF(x);
7235            return x;
7236        } else
7237            return NULL;
7238    }
7239    else if (x == Py_None)
7240        return x;
7241    else if (PyLong_Check(x)) {
7242        long value = PyLong_AS_LONG(x);
7243        if (value < 0 || value > 255) {
7244            PyErr_SetString(PyExc_TypeError,
7245                            "character mapping must be in range(256)");
7246            Py_DECREF(x);
7247            return NULL;
7248        }
7249        return x;
7250    }
7251    else if (PyBytes_Check(x))
7252        return x;
7253    else {
7254        /* wrong return value */
7255        PyErr_Format(PyExc_TypeError,
7256                     "character mapping must return integer, bytes or None, not %.400s",
7257                     x->ob_type->tp_name);
7258        Py_DECREF(x);
7259        return NULL;
7260    }
7261}
7262
7263static int
7264charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7265{
7266    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7267    /* exponentially overallocate to minimize reallocations */
7268    if (requiredsize < 2*outsize)
7269        requiredsize = 2*outsize;
7270    if (_PyBytes_Resize(outobj, requiredsize))
7271        return -1;
7272    return 0;
7273}
7274
/* Outcome of an attempt to encode a single character via a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7278/* lookup the character, put the result in the output string and adjust
7279   various state variables. Resize the output bytes object if not enough
7280   space is available. Return a new reference to the object that
7281   was put in the output buffer, or Py_None, if the mapping was undefined
7282   (in which case no character was written) or NULL, if a
7283   reallocation error occurred. The caller must decref the result */
7284static charmapencode_result
7285charmapencode_output(Py_UNICODE c, PyObject *mapping,
7286                     PyObject **outobj, Py_ssize_t *outpos)
7287{
7288    PyObject *rep;
7289    char *outstart;
7290    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7291
7292    if (Py_TYPE(mapping) == &EncodingMapType) {
7293        int res = encoding_map_lookup(c, mapping);
7294        Py_ssize_t requiredsize = *outpos+1;
7295        if (res == -1)
7296            return enc_FAILED;
7297        if (outsize<requiredsize)
7298            if (charmapencode_resize(outobj, outpos, requiredsize))
7299                return enc_EXCEPTION;
7300        outstart = PyBytes_AS_STRING(*outobj);
7301        outstart[(*outpos)++] = (char)res;
7302        return enc_SUCCESS;
7303    }
7304
7305    rep = charmapencode_lookup(c, mapping);
7306    if (rep==NULL)
7307        return enc_EXCEPTION;
7308    else if (rep==Py_None) {
7309        Py_DECREF(rep);
7310        return enc_FAILED;
7311    } else {
7312        if (PyLong_Check(rep)) {
7313            Py_ssize_t requiredsize = *outpos+1;
7314            if (outsize<requiredsize)
7315                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7316                    Py_DECREF(rep);
7317                    return enc_EXCEPTION;
7318                }
7319            outstart = PyBytes_AS_STRING(*outobj);
7320            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7321        }
7322        else {
7323            const char *repchars = PyBytes_AS_STRING(rep);
7324            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7325            Py_ssize_t requiredsize = *outpos+repsize;
7326            if (outsize<requiredsize)
7327                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7328                    Py_DECREF(rep);
7329                    return enc_EXCEPTION;
7330                }
7331            outstart = PyBytes_AS_STRING(*outobj);
7332            memcpy(outstart + *outpos, repchars, repsize);
7333            *outpos += repsize;
7334        }
7335    }
7336    Py_DECREF(rep);
7337    return enc_SUCCESS;
7338}
7339
7340/* handle an error in PyUnicode_EncodeCharmap
7341   Return 0 on success, -1 on error */
7342static int
7343charmap_encoding_error(
7344    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7345    PyObject **exceptionObject,
7346    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7347    PyObject **res, Py_ssize_t *respos)
7348{
7349    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7350    Py_ssize_t repsize;
7351    Py_ssize_t newpos;
7352    Py_UNICODE *uni2;
7353    /* startpos for collecting unencodable chars */
7354    Py_ssize_t collstartpos = *inpos;
7355    Py_ssize_t collendpos = *inpos+1;
7356    Py_ssize_t collpos;
7357    char *encoding = "charmap";
7358    char *reason = "character maps to <undefined>";
7359    charmapencode_result x;
7360
7361    /* find all unencodable characters */
7362    while (collendpos < size) {
7363        PyObject *rep;
7364        if (Py_TYPE(mapping) == &EncodingMapType) {
7365            int res = encoding_map_lookup(p[collendpos], mapping);
7366            if (res != -1)
7367                break;
7368            ++collendpos;
7369            continue;
7370        }
7371
7372        rep = charmapencode_lookup(p[collendpos], mapping);
7373        if (rep==NULL)
7374            return -1;
7375        else if (rep!=Py_None) {
7376            Py_DECREF(rep);
7377            break;
7378        }
7379        Py_DECREF(rep);
7380        ++collendpos;
7381    }
7382    /* cache callback name lookup
7383     * (if not done yet, i.e. it's the first error) */
7384    if (*known_errorHandler==-1) {
7385        if ((errors==NULL) || (!strcmp(errors, "strict")))
7386            *known_errorHandler = 1;
7387        else if (!strcmp(errors, "replace"))
7388            *known_errorHandler = 2;
7389        else if (!strcmp(errors, "ignore"))
7390            *known_errorHandler = 3;
7391        else if (!strcmp(errors, "xmlcharrefreplace"))
7392            *known_errorHandler = 4;
7393        else
7394            *known_errorHandler = 0;
7395    }
7396    switch (*known_errorHandler) {
7397    case 1: /* strict */
7398        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7399        return -1;
7400    case 2: /* replace */
7401        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7402            x = charmapencode_output('?', mapping, res, respos);
7403            if (x==enc_EXCEPTION) {
7404                return -1;
7405            }
7406            else if (x==enc_FAILED) {
7407                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7408                return -1;
7409            }
7410        }
7411        /* fall through */
7412    case 3: /* ignore */
7413        *inpos = collendpos;
7414        break;
7415    case 4: /* xmlcharrefreplace */
7416        /* generate replacement (temporarily (mis)uses p) */
7417        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7418            char buffer[2+29+1+1];
7419            char *cp;
7420            sprintf(buffer, "&#%d;", (int)p[collpos]);
7421            for (cp = buffer; *cp; ++cp) {
7422                x = charmapencode_output(*cp, mapping, res, respos);
7423                if (x==enc_EXCEPTION)
7424                    return -1;
7425                else if (x==enc_FAILED) {
7426                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7427                    return -1;
7428                }
7429            }
7430        }
7431        *inpos = collendpos;
7432        break;
7433    default:
7434        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7435                                                      encoding, reason, p, size, exceptionObject,
7436                                                      collstartpos, collendpos, &newpos);
7437        if (repunicode == NULL)
7438            return -1;
7439        if (PyBytes_Check(repunicode)) {
7440            /* Directly copy bytes result to output. */
7441            Py_ssize_t outsize = PyBytes_Size(*res);
7442            Py_ssize_t requiredsize;
7443            repsize = PyBytes_Size(repunicode);
7444            requiredsize = *respos + repsize;
7445            if (requiredsize > outsize)
7446                /* Make room for all additional bytes. */
7447                if (charmapencode_resize(res, respos, requiredsize)) {
7448                    Py_DECREF(repunicode);
7449                    return -1;
7450                }
7451            memcpy(PyBytes_AsString(*res) + *respos,
7452                   PyBytes_AsString(repunicode),  repsize);
7453            *respos += repsize;
7454            *inpos = newpos;
7455            Py_DECREF(repunicode);
7456            break;
7457        }
7458        /* generate replacement  */
7459        repsize = PyUnicode_GET_SIZE(repunicode);
7460        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7461            x = charmapencode_output(*uni2, mapping, res, respos);
7462            if (x==enc_EXCEPTION) {
7463                return -1;
7464            }
7465            else if (x==enc_FAILED) {
7466                Py_DECREF(repunicode);
7467                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7468                return -1;
7469            }
7470        }
7471        *inpos = newpos;
7472        Py_DECREF(repunicode);
7473    }
7474    return 0;
7475}
7476
7477PyObject *
7478PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7479                        Py_ssize_t size,
7480                        PyObject *mapping,
7481                        const char *errors)
7482{
7483    /* output object */
7484    PyObject *res = NULL;
7485    /* current input position */
7486    Py_ssize_t inpos = 0;
7487    /* current output position */
7488    Py_ssize_t respos = 0;
7489    PyObject *errorHandler = NULL;
7490    PyObject *exc = NULL;
7491    /* the following variable is used for caching string comparisons
7492     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7493     * 3=ignore, 4=xmlcharrefreplace */
7494    int known_errorHandler = -1;
7495
7496    /* Default to Latin-1 */
7497    if (mapping == NULL)
7498        return PyUnicode_EncodeLatin1(p, size, errors);
7499
7500    /* allocate enough for a simple encoding without
7501       replacements, if we need more, we'll resize */
7502    res = PyBytes_FromStringAndSize(NULL, size);
7503    if (res == NULL)
7504        goto onError;
7505    if (size == 0)
7506        return res;
7507
7508    while (inpos<size) {
7509        /* try to encode it */
7510        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7511        if (x==enc_EXCEPTION) /* error */
7512            goto onError;
7513        if (x==enc_FAILED) { /* unencodable character */
7514            if (charmap_encoding_error(p, size, &inpos, mapping,
7515                                       &exc,
7516                                       &known_errorHandler, &errorHandler, errors,
7517                                       &res, &respos)) {
7518                goto onError;
7519            }
7520        }
7521        else
7522            /* done with this character => adjust input position */
7523            ++inpos;
7524    }
7525
7526    /* Resize if we allocated to much */
7527    if (respos<PyBytes_GET_SIZE(res))
7528        if (_PyBytes_Resize(&res, respos) < 0)
7529            goto onError;
7530
7531    Py_XDECREF(exc);
7532    Py_XDECREF(errorHandler);
7533    return res;
7534
7535  onError:
7536    Py_XDECREF(res);
7537    Py_XDECREF(exc);
7538    Py_XDECREF(errorHandler);
7539    return NULL;
7540}
7541
7542PyObject *
7543PyUnicode_AsCharmapString(PyObject *unicode,
7544                          PyObject *mapping)
7545{
7546    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7547        PyErr_BadArgument();
7548        return NULL;
7549    }
7550    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7551                                   PyUnicode_GET_SIZE(unicode),
7552                                   mapping,
7553                                   NULL);
7554}
7555
7556/* create or adjust a UnicodeTranslateError */
7557static void
7558make_translate_exception(PyObject **exceptionObject,
7559                         PyObject *unicode,
7560                         Py_ssize_t startpos, Py_ssize_t endpos,
7561                         const char *reason)
7562{
7563    if (*exceptionObject == NULL) {
7564        *exceptionObject = _PyUnicodeTranslateError_Create(
7565            unicode, startpos, endpos, reason);
7566    }
7567    else {
7568        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7569            goto onError;
7570        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7571            goto onError;
7572        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7573            goto onError;
7574        return;
7575      onError:
7576        Py_DECREF(*exceptionObject);
7577        *exceptionObject = NULL;
7578    }
7579}
7580
7581/* raises a UnicodeTranslateError */
7582static void
7583raise_translate_exception(PyObject **exceptionObject,
7584                          PyObject *unicode,
7585                          Py_ssize_t startpos, Py_ssize_t endpos,
7586                          const char *reason)
7587{
7588    make_translate_exception(exceptionObject,
7589                             unicode, startpos, endpos, reason);
7590    if (*exceptionObject != NULL)
7591        PyCodec_StrictErrors(*exceptionObject);
7592}
7593
7594/* error handling callback helper:
7595   build arguments, call the callback and check the arguments,
7596   put the result into newpos and return the replacement string, which
7597   has to be freed by the caller */
7598static PyObject *
7599unicode_translate_call_errorhandler(const char *errors,
7600                                    PyObject **errorHandler,
7601                                    const char *reason,
7602                                    PyObject *unicode, PyObject **exceptionObject,
7603                                    Py_ssize_t startpos, Py_ssize_t endpos,
7604                                    Py_ssize_t *newpos)
7605{
7606    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7607
7608    Py_ssize_t i_newpos;
7609    PyObject *restuple;
7610    PyObject *resunicode;
7611
7612    if (*errorHandler == NULL) {
7613        *errorHandler = PyCodec_LookupError(errors);
7614        if (*errorHandler == NULL)
7615            return NULL;
7616    }
7617
7618    make_translate_exception(exceptionObject,
7619                             unicode, startpos, endpos, reason);
7620    if (*exceptionObject == NULL)
7621        return NULL;
7622
7623    restuple = PyObject_CallFunctionObjArgs(
7624        *errorHandler, *exceptionObject, NULL);
7625    if (restuple == NULL)
7626        return NULL;
7627    if (!PyTuple_Check(restuple)) {
7628        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7629        Py_DECREF(restuple);
7630        return NULL;
7631    }
7632    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7633                          &resunicode, &i_newpos)) {
7634        Py_DECREF(restuple);
7635        return NULL;
7636    }
7637    if (i_newpos<0)
7638        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7639    else
7640        *newpos = i_newpos;
7641    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7642        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7643        Py_DECREF(restuple);
7644        return NULL;
7645    }
7646    Py_INCREF(resunicode);
7647    Py_DECREF(restuple);
7648    return resunicode;
7649}
7650
7651/* Lookup the character ch in the mapping and put the result in result,
7652   which must be decrefed by the caller.
7653   Return 0 on success, -1 on error */
7654static int
7655charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7656{
7657    PyObject *w = PyLong_FromLong((long)c);
7658    PyObject *x;
7659
7660    if (w == NULL)
7661        return -1;
7662    x = PyObject_GetItem(mapping, w);
7663    Py_DECREF(w);
7664    if (x == NULL) {
7665        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7666            /* No mapping found means: use 1:1 mapping. */
7667            PyErr_Clear();
7668            *result = NULL;
7669            return 0;
7670        } else
7671            return -1;
7672    }
7673    else if (x == Py_None) {
7674        *result = x;
7675        return 0;
7676    }
7677    else if (PyLong_Check(x)) {
7678        long value = PyLong_AS_LONG(x);
7679        long max = PyUnicode_GetMax();
7680        if (value < 0 || value > max) {
7681            PyErr_Format(PyExc_TypeError,
7682                         "character mapping must be in range(0x%x)", max+1);
7683            Py_DECREF(x);
7684            return -1;
7685        }
7686        *result = x;
7687        return 0;
7688    }
7689    else if (PyUnicode_Check(x)) {
7690        *result = x;
7691        return 0;
7692    }
7693    else {
7694        /* wrong return value */
7695        PyErr_SetString(PyExc_TypeError,
7696                        "character mapping must return integer, None or str");
7697        Py_DECREF(x);
7698        return -1;
7699    }
7700}
7701/* ensure that *outobj is at least requiredsize characters long,
7702   if not reallocate and adjust various state variables.
7703   Return 0 on success, -1 on error */
7704static int
7705charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7706                               Py_ssize_t requiredsize)
7707{
7708    Py_ssize_t oldsize = *psize;
7709    if (requiredsize > oldsize) {
7710        /* exponentially overallocate to minimize reallocations */
7711        if (requiredsize < 2 * oldsize)
7712            requiredsize = 2 * oldsize;
7713        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7714        if (*outobj == 0)
7715            return -1;
7716        *psize = requiredsize;
7717    }
7718    return 0;
7719}
7720/* lookup the character, put the result in the output string and adjust
7721   various state variables. Return a new reference to the object that
7722   was put in the output buffer in *result, or Py_None, if the mapping was
7723   undefined (in which case no character was written).
7724   The called must decref result.
7725   Return 0 on success, -1 on error. */
7726static int
7727charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7728                        PyObject *mapping, Py_UCS4 **output,
7729                        Py_ssize_t *osize, Py_ssize_t *opos,
7730                        PyObject **res)
7731{
7732    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7733    if (charmaptranslate_lookup(curinp, mapping, res))
7734        return -1;
7735    if (*res==NULL) {
7736        /* not found => default to 1:1 mapping */
7737        (*output)[(*opos)++] = curinp;
7738    }
7739    else if (*res==Py_None)
7740        ;
7741    else if (PyLong_Check(*res)) {
7742        /* no overflow check, because we know that the space is enough */
7743        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7744    }
7745    else if (PyUnicode_Check(*res)) {
7746        Py_ssize_t repsize;
7747        if (PyUnicode_READY(*res) == -1)
7748            return -1;
7749        repsize = PyUnicode_GET_LENGTH(*res);
7750        if (repsize==1) {
7751            /* no overflow check, because we know that the space is enough */
7752            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7753        }
7754        else if (repsize!=0) {
7755            /* more than one character */
7756            Py_ssize_t requiredsize = *opos +
7757                (PyUnicode_GET_LENGTH(input) - ipos) +
7758                repsize - 1;
7759            Py_ssize_t i;
7760            if (charmaptranslate_makespace(output, osize, requiredsize))
7761                return -1;
7762            for(i = 0; i < repsize; i++)
7763                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7764        }
7765    }
7766    else
7767        return -1;
7768    return 0;
7769}
7770
7771PyObject *
7772_PyUnicode_TranslateCharmap(PyObject *input,
7773                            PyObject *mapping,
7774                            const char *errors)
7775{
7776    /* input object */
7777    char *idata;
7778    Py_ssize_t size, i;
7779    int kind;
7780    /* output buffer */
7781    Py_UCS4 *output = NULL;
7782    Py_ssize_t osize;
7783    PyObject *res;
7784    /* current output position */
7785    Py_ssize_t opos;
7786    char *reason = "character maps to <undefined>";
7787    PyObject *errorHandler = NULL;
7788    PyObject *exc = NULL;
7789    /* the following variable is used for caching string comparisons
7790     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7791     * 3=ignore, 4=xmlcharrefreplace */
7792    int known_errorHandler = -1;
7793
7794    if (mapping == NULL) {
7795        PyErr_BadArgument();
7796        return NULL;
7797    }
7798
7799    if (PyUnicode_READY(input) == -1)
7800        return NULL;
7801    idata = (char*)PyUnicode_DATA(input);
7802    kind = PyUnicode_KIND(input);
7803    size = PyUnicode_GET_LENGTH(input);
7804    i = 0;
7805
7806    if (size == 0) {
7807        Py_INCREF(input);
7808        return input;
7809    }
7810
7811    /* allocate enough for a simple 1:1 translation without
7812       replacements, if we need more, we'll resize */
7813    osize = size;
7814    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7815    opos = 0;
7816    if (output == NULL) {
7817        PyErr_NoMemory();
7818        goto onError;
7819    }
7820
7821    while (i<size) {
7822        /* try to encode it */
7823        PyObject *x = NULL;
7824        if (charmaptranslate_output(input, i, mapping,
7825                                    &output, &osize, &opos, &x)) {
7826            Py_XDECREF(x);
7827            goto onError;
7828        }
7829        Py_XDECREF(x);
7830        if (x!=Py_None) /* it worked => adjust input pointer */
7831            ++i;
7832        else { /* untranslatable character */
7833            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7834            Py_ssize_t repsize;
7835            Py_ssize_t newpos;
7836            Py_ssize_t uni2;
7837            /* startpos for collecting untranslatable chars */
7838            Py_ssize_t collstart = i;
7839            Py_ssize_t collend = i+1;
7840            Py_ssize_t coll;
7841
7842            /* find all untranslatable characters */
7843            while (collend < size) {
7844                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
7845                    goto onError;
7846                Py_XDECREF(x);
7847                if (x!=Py_None)
7848                    break;
7849                ++collend;
7850            }
7851            /* cache callback name lookup
7852             * (if not done yet, i.e. it's the first error) */
7853            if (known_errorHandler==-1) {
7854                if ((errors==NULL) || (!strcmp(errors, "strict")))
7855                    known_errorHandler = 1;
7856                else if (!strcmp(errors, "replace"))
7857                    known_errorHandler = 2;
7858                else if (!strcmp(errors, "ignore"))
7859                    known_errorHandler = 3;
7860                else if (!strcmp(errors, "xmlcharrefreplace"))
7861                    known_errorHandler = 4;
7862                else
7863                    known_errorHandler = 0;
7864            }
7865            switch (known_errorHandler) {
7866            case 1: /* strict */
7867                raise_translate_exception(&exc, input, collstart,
7868                                          collend, reason);
7869                goto onError;
7870            case 2: /* replace */
7871                /* No need to check for space, this is a 1:1 replacement */
7872                for (coll = collstart; coll<collend; coll++)
7873                    output[opos++] = '?';
7874                /* fall through */
7875            case 3: /* ignore */
7876                i = collend;
7877                break;
7878            case 4: /* xmlcharrefreplace */
7879                /* generate replacement (temporarily (mis)uses i) */
7880                for (i = collstart; i < collend; ++i) {
7881                    char buffer[2+29+1+1];
7882                    char *cp;
7883                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7884                    if (charmaptranslate_makespace(&output, &osize,
7885                                                   opos+strlen(buffer)+(size-collend)))
7886                        goto onError;
7887                    for (cp = buffer; *cp; ++cp)
7888                        output[opos++] = *cp;
7889                }
7890                i = collend;
7891                break;
7892            default:
7893                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
7894                                                                 reason, input, &exc,
7895                                                                 collstart, collend, &newpos);
7896                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
7897                    goto onError;
7898                /* generate replacement  */
7899                repsize = PyUnicode_GET_LENGTH(repunicode);
7900                if (charmaptranslate_makespace(&output, &osize,
7901                                               opos+repsize+(size-collend))) {
7902                    Py_DECREF(repunicode);
7903                    goto onError;
7904                }
7905                for (uni2 = 0; repsize-->0; ++uni2)
7906                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7907                i = newpos;
7908                Py_DECREF(repunicode);
7909            }
7910        }
7911    }
7912    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7913    if (!res)
7914        goto onError;
7915    PyMem_Free(output);
7916    Py_XDECREF(exc);
7917    Py_XDECREF(errorHandler);
7918    return res;
7919
7920  onError:
7921    PyMem_Free(output);
7922    Py_XDECREF(exc);
7923    Py_XDECREF(errorHandler);
7924    return NULL;
7925}
7926
7927/* Deprecated. Use PyUnicode_Translate instead. */
7928PyObject *
7929PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7930                           Py_ssize_t size,
7931                           PyObject *mapping,
7932                           const char *errors)
7933{
7934    PyObject *unicode = PyUnicode_FromUnicode(p, size);
7935    if (!unicode)
7936        return NULL;
7937    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7938}
7939
7940PyObject *
7941PyUnicode_Translate(PyObject *str,
7942                    PyObject *mapping,
7943                    const char *errors)
7944{
7945    PyObject *result;
7946
7947    str = PyUnicode_FromObject(str);
7948    if (str == NULL)
7949        goto onError;
7950    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
7951    Py_DECREF(str);
7952    return result;
7953
7954  onError:
7955    Py_XDECREF(str);
7956    return NULL;
7957}
7958
7959static Py_UCS4
7960fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7961{
7962    /* No need to call PyUnicode_READY(self) because this function is only
7963       called as a callback from fixup() which does it already. */
7964    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7965    const int kind = PyUnicode_KIND(self);
7966    void *data = PyUnicode_DATA(self);
7967    Py_UCS4 maxchar = 0, ch, fixed;
7968    Py_ssize_t i;
7969
7970    for (i = 0; i < len; ++i) {
7971        ch = PyUnicode_READ(kind, data, i);
7972        fixed = 0;
7973        if (ch > 127) {
7974            if (Py_UNICODE_ISSPACE(ch))
7975                fixed = ' ';
7976            else {
7977                const int decimal = Py_UNICODE_TODECIMAL(ch);
7978                if (decimal >= 0)
7979                    fixed = '0' + decimal;
7980            }
7981            if (fixed != 0) {
7982                if (fixed > maxchar)
7983                    maxchar = fixed;
7984                PyUnicode_WRITE(kind, data, i, fixed);
7985            }
7986            else if (ch > maxchar)
7987                maxchar = ch;
7988        }
7989        else if (ch > maxchar)
7990            maxchar = ch;
7991    }
7992
7993    return maxchar;
7994}
7995
7996PyObject *
7997_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
7998{
7999    if (!PyUnicode_Check(unicode)) {
8000        PyErr_BadInternalCall();
8001        return NULL;
8002    }
8003    if (PyUnicode_READY(unicode) == -1)
8004        return NULL;
8005    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8006        /* If the string is already ASCII, just return the same string */
8007        Py_INCREF(unicode);
8008        return unicode;
8009    }
8010    return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8011}
8012
8013PyObject *
8014PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8015                                  Py_ssize_t length)
8016{
8017    PyObject *result;
8018    Py_UNICODE *p; /* write pointer into result */
8019    Py_ssize_t i;
8020    /* Copy to a new string */
8021    result = (PyObject *)_PyUnicode_New(length);
8022    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8023    if (result == NULL)
8024        return result;
8025    p = PyUnicode_AS_UNICODE(result);
8026    /* Iterate over code points */
8027    for (i = 0; i < length; i++) {
8028        Py_UNICODE ch =s[i];
8029        if (ch > 127) {
8030            int decimal = Py_UNICODE_TODECIMAL(ch);
8031            if (decimal >= 0)
8032                p[i] = '0' + decimal;
8033        }
8034    }
8035    if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8036        Py_DECREF(result);
8037        return NULL;
8038    }
8039    return result;
8040}
8041/* --- Decimal Encoder ---------------------------------------------------- */
8042
8043int
8044PyUnicode_EncodeDecimal(Py_UNICODE *s,
8045                        Py_ssize_t length,
8046                        char *output,
8047                        const char *errors)
8048{
8049    Py_UNICODE *p, *end;
8050    PyObject *errorHandler = NULL;
8051    PyObject *exc = NULL;
8052    const char *encoding = "decimal";
8053    const char *reason = "invalid decimal Unicode string";
8054    /* the following variable is used for caching string comparisons
8055     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8056    int known_errorHandler = -1;
8057
8058    if (output == NULL) {
8059        PyErr_BadArgument();
8060        return -1;
8061    }
8062
8063    p = s;
8064    end = s + length;
8065    while (p < end) {
8066        register Py_UNICODE ch = *p;
8067        int decimal;
8068        PyObject *repunicode;
8069        Py_ssize_t repsize;
8070        Py_ssize_t newpos;
8071        Py_UNICODE *uni2;
8072        Py_UNICODE *collstart;
8073        Py_UNICODE *collend;
8074
8075        if (Py_UNICODE_ISSPACE(ch)) {
8076            *output++ = ' ';
8077            ++p;
8078            continue;
8079        }
8080        decimal = Py_UNICODE_TODECIMAL(ch);
8081        if (decimal >= 0) {
8082            *output++ = '0' + decimal;
8083            ++p;
8084            continue;
8085        }
8086        if (0 < ch && ch < 256) {
8087            *output++ = (char)ch;
8088            ++p;
8089            continue;
8090        }
8091        /* All other characters are considered unencodable */
8092        collstart = p;
8093        collend = p+1;
8094        while (collend < end) {
8095            if ((0 < *collend && *collend < 256) ||
8096                !Py_UNICODE_ISSPACE(*collend) ||
8097                Py_UNICODE_TODECIMAL(*collend))
8098                break;
8099        }
8100        /* cache callback name lookup
8101         * (if not done yet, i.e. it's the first error) */
8102        if (known_errorHandler==-1) {
8103            if ((errors==NULL) || (!strcmp(errors, "strict")))
8104                known_errorHandler = 1;
8105            else if (!strcmp(errors, "replace"))
8106                known_errorHandler = 2;
8107            else if (!strcmp(errors, "ignore"))
8108                known_errorHandler = 3;
8109            else if (!strcmp(errors, "xmlcharrefreplace"))
8110                known_errorHandler = 4;
8111            else
8112                known_errorHandler = 0;
8113        }
8114        switch (known_errorHandler) {
8115        case 1: /* strict */
8116            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8117            goto onError;
8118        case 2: /* replace */
8119            for (p = collstart; p < collend; ++p)
8120                *output++ = '?';
8121            /* fall through */
8122        case 3: /* ignore */
8123            p = collend;
8124            break;
8125        case 4: /* xmlcharrefreplace */
8126            /* generate replacement (temporarily (mis)uses p) */
8127            for (p = collstart; p < collend; ++p)
8128                output += sprintf(output, "&#%d;", (int)*p);
8129            p = collend;
8130            break;
8131        default:
8132            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8133                                                          encoding, reason, s, length, &exc,
8134                                                          collstart-s, collend-s, &newpos);
8135            if (repunicode == NULL)
8136                goto onError;
8137            if (!PyUnicode_Check(repunicode)) {
8138                /* Byte results not supported, since they have no decimal property. */
8139                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8140                Py_DECREF(repunicode);
8141                goto onError;
8142            }
8143            /* generate replacement  */
8144            repsize = PyUnicode_GET_SIZE(repunicode);
8145            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8146                Py_UNICODE ch = *uni2;
8147                if (Py_UNICODE_ISSPACE(ch))
8148                    *output++ = ' ';
8149                else {
8150                    decimal = Py_UNICODE_TODECIMAL(ch);
8151                    if (decimal >= 0)
8152                        *output++ = '0' + decimal;
8153                    else if (0 < ch && ch < 256)
8154                        *output++ = (char)ch;
8155                    else {
8156                        Py_DECREF(repunicode);
8157                        raise_encode_exception(&exc, encoding,
8158                                               s, length, collstart-s, collend-s, reason);
8159                        goto onError;
8160                    }
8161                }
8162            }
8163            p = s + newpos;
8164            Py_DECREF(repunicode);
8165        }
8166    }
8167    /* 0-terminate the output string */
8168    *output++ = '\0';
8169    Py_XDECREF(exc);
8170    Py_XDECREF(errorHandler);
8171    return 0;
8172
8173  onError:
8174    Py_XDECREF(exc);
8175    Py_XDECREF(errorHandler);
8176    return -1;
8177}
8178
8179/* --- Helpers ------------------------------------------------------------ */
8180
8181#include "stringlib/ucs1lib.h"
8182#include "stringlib/fastsearch.h"
8183#include "stringlib/partition.h"
8184#include "stringlib/split.h"
8185#include "stringlib/count.h"
8186#include "stringlib/find.h"
8187#include "stringlib/localeutil.h"
8188#include "stringlib/undef.h"
8189
8190#include "stringlib/ucs2lib.h"
8191#include "stringlib/fastsearch.h"
8192#include "stringlib/partition.h"
8193#include "stringlib/split.h"
8194#include "stringlib/count.h"
8195#include "stringlib/find.h"
8196#include "stringlib/localeutil.h"
8197#include "stringlib/undef.h"
8198
8199#include "stringlib/ucs4lib.h"
8200#include "stringlib/fastsearch.h"
8201#include "stringlib/partition.h"
8202#include "stringlib/split.h"
8203#include "stringlib/count.h"
8204#include "stringlib/find.h"
8205#include "stringlib/localeutil.h"
8206#include "stringlib/undef.h"
8207
8208static Py_ssize_t
8209any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8210                                  const Py_UCS1*, Py_ssize_t,
8211                                  Py_ssize_t, Py_ssize_t),
8212               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8213                                  const Py_UCS2*, Py_ssize_t,
8214                                  Py_ssize_t, Py_ssize_t),
8215               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8216                                  const Py_UCS4*, Py_ssize_t,
8217                                  Py_ssize_t, Py_ssize_t),
8218               PyObject* s1, PyObject* s2,
8219               Py_ssize_t start,
8220               Py_ssize_t end)
8221{
8222    int kind1, kind2, kind;
8223    void *buf1, *buf2;
8224    Py_ssize_t len1, len2, result;
8225
8226    kind1 = PyUnicode_KIND(s1);
8227    kind2 = PyUnicode_KIND(s2);
8228    kind = kind1 > kind2 ? kind1 : kind2;
8229    buf1 = PyUnicode_DATA(s1);
8230    buf2 = PyUnicode_DATA(s2);
8231    if (kind1 != kind)
8232        buf1 = _PyUnicode_AsKind(s1, kind);
8233    if (!buf1)
8234        return -2;
8235    if (kind2 != kind)
8236        buf2 = _PyUnicode_AsKind(s2, kind);
8237    if (!buf2) {
8238        if (kind1 != kind) PyMem_Free(buf1);
8239        return -2;
8240    }
8241    len1 = PyUnicode_GET_LENGTH(s1);
8242    len2 = PyUnicode_GET_LENGTH(s2);
8243
8244    switch(kind) {
8245    case PyUnicode_1BYTE_KIND:
8246        result = ucs1(buf1, len1, buf2, len2, start, end);
8247        break;
8248    case PyUnicode_2BYTE_KIND:
8249        result = ucs2(buf1, len1, buf2, len2, start, end);
8250        break;
8251    case PyUnicode_4BYTE_KIND:
8252        result = ucs4(buf1, len1, buf2, len2, start, end);
8253        break;
8254    default:
8255        assert(0); result = -2;
8256    }
8257
8258    if (kind1 != kind)
8259        PyMem_Free(buf1);
8260    if (kind2 != kind)
8261        PyMem_Free(buf2);
8262
8263    return result;
8264}
8265
8266Py_ssize_t
8267_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8268                                   Py_ssize_t n_buffer,
8269                                   void *digits, Py_ssize_t n_digits,
8270                                   Py_ssize_t min_width,
8271                                   const char *grouping,
8272                                   const char *thousands_sep)
8273{
8274    switch(kind) {
8275    case PyUnicode_1BYTE_KIND:
8276        return _PyUnicode_ucs1_InsertThousandsGrouping(
8277            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8278            min_width, grouping, thousands_sep);
8279    case PyUnicode_2BYTE_KIND:
8280        return _PyUnicode_ucs2_InsertThousandsGrouping(
8281            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8282            min_width, grouping, thousands_sep);
8283    case PyUnicode_4BYTE_KIND:
8284        return _PyUnicode_ucs4_InsertThousandsGrouping(
8285            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8286            min_width, grouping, thousands_sep);
8287    }
8288    assert(0);
8289    return -1;
8290}
8291
8292
8293#include "stringlib/unicodedefs.h"
8294#include "stringlib/fastsearch.h"
8295
8296#include "stringlib/count.h"
8297#include "stringlib/find.h"
8298
/* Helper macro to fix up start/end slice values in place.

   `end` is clipped to `len`; a negative `end` or `start` is taken
   relative to `len` and clamped at 0.  `start` is not clipped against
   `len`, so callers must still cope with start > end.

   `start` and `end` must be modifiable lvalues.  Every argument is
   parenthesized so that expression arguments (for example a conditional
   expression for `len`) expand correctly; `len` is evaluated more than
   once and should be free of side effects.  The macro expands to plain
   statements (not a do/while wrapper), so it must not be used as the
   unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
8313
8314Py_ssize_t
8315PyUnicode_Count(PyObject *str,
8316                PyObject *substr,
8317                Py_ssize_t start,
8318                Py_ssize_t end)
8319{
8320    Py_ssize_t result;
8321    PyUnicodeObject* str_obj;
8322    PyUnicodeObject* sub_obj;
8323    int kind1, kind2, kind;
8324    void *buf1 = NULL, *buf2 = NULL;
8325    Py_ssize_t len1, len2;
8326
8327    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8328    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8329        return -1;
8330    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8331    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8332        Py_DECREF(str_obj);
8333        return -1;
8334    }
8335
8336    kind1 = PyUnicode_KIND(str_obj);
8337    kind2 = PyUnicode_KIND(sub_obj);
8338    kind = kind1 > kind2 ? kind1 : kind2;
8339    buf1 = PyUnicode_DATA(str_obj);
8340    if (kind1 != kind)
8341        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8342    if (!buf1)
8343        goto onError;
8344    buf2 = PyUnicode_DATA(sub_obj);
8345    if (kind2 != kind)
8346        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8347    if (!buf2)
8348        goto onError;
8349    len1 = PyUnicode_GET_LENGTH(str_obj);
8350    len2 = PyUnicode_GET_LENGTH(sub_obj);
8351
8352    ADJUST_INDICES(start, end, len1);
8353    switch(kind) {
8354    case PyUnicode_1BYTE_KIND:
8355        result = ucs1lib_count(
8356            ((Py_UCS1*)buf1) + start, end - start,
8357            buf2, len2, PY_SSIZE_T_MAX
8358            );
8359        break;
8360    case PyUnicode_2BYTE_KIND:
8361        result = ucs2lib_count(
8362            ((Py_UCS2*)buf1) + start, end - start,
8363            buf2, len2, PY_SSIZE_T_MAX
8364            );
8365        break;
8366    case PyUnicode_4BYTE_KIND:
8367        result = ucs4lib_count(
8368            ((Py_UCS4*)buf1) + start, end - start,
8369            buf2, len2, PY_SSIZE_T_MAX
8370            );
8371        break;
8372    default:
8373        assert(0); result = 0;
8374    }
8375
8376    Py_DECREF(sub_obj);
8377    Py_DECREF(str_obj);
8378
8379    if (kind1 != kind)
8380        PyMem_Free(buf1);
8381    if (kind2 != kind)
8382        PyMem_Free(buf2);
8383
8384    return result;
8385  onError:
8386    Py_DECREF(sub_obj);
8387    Py_DECREF(str_obj);
8388    if (kind1 != kind && buf1)
8389        PyMem_Free(buf1);
8390    if (kind2 != kind && buf2)
8391        PyMem_Free(buf2);
8392    return -1;
8393}
8394
8395Py_ssize_t
8396PyUnicode_Find(PyObject *str,
8397               PyObject *sub,
8398               Py_ssize_t start,
8399               Py_ssize_t end,
8400               int direction)
8401{
8402    Py_ssize_t result;
8403
8404    str = PyUnicode_FromObject(str);
8405    if (!str || PyUnicode_READY(str) == -1)
8406        return -2;
8407    sub = PyUnicode_FromObject(sub);
8408    if (!sub || PyUnicode_READY(sub) == -1) {
8409        Py_DECREF(str);
8410        return -2;
8411    }
8412
8413    if (direction > 0)
8414        result = any_find_slice(
8415            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8416            str, sub, start, end
8417            );
8418    else
8419        result = any_find_slice(
8420            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8421            str, sub, start, end
8422            );
8423
8424    Py_DECREF(str);
8425    Py_DECREF(sub);
8426
8427    return result;
8428}
8429
8430Py_ssize_t
8431PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8432                   Py_ssize_t start, Py_ssize_t end,
8433                   int direction)
8434{
8435    char *result;
8436    int kind;
8437    if (PyUnicode_READY(str) == -1)
8438        return -2;
8439    if (start < 0 || end < 0) {
8440        PyErr_SetString(PyExc_IndexError, "string index out of range");
8441        return -2;
8442    }
8443    if (end > PyUnicode_GET_LENGTH(str))
8444        end = PyUnicode_GET_LENGTH(str);
8445    kind = PyUnicode_KIND(str);
8446    result = findchar(PyUnicode_1BYTE_DATA(str)
8447                      + PyUnicode_KIND_SIZE(kind, start),
8448                      kind,
8449                      end-start, ch, direction);
8450    if (!result)
8451        return -1;
8452    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8453}
8454
8455static int
8456tailmatch(PyUnicodeObject *self,
8457          PyUnicodeObject *substring,
8458          Py_ssize_t start,
8459          Py_ssize_t end,
8460          int direction)
8461{
8462    int kind_self;
8463    int kind_sub;
8464    void *data_self;
8465    void *data_sub;
8466    Py_ssize_t offset;
8467    Py_ssize_t i;
8468    Py_ssize_t end_sub;
8469
8470    if (PyUnicode_READY(self) == -1 ||
8471        PyUnicode_READY(substring) == -1)
8472        return 0;
8473
8474    if (PyUnicode_GET_LENGTH(substring) == 0)
8475        return 1;
8476
8477    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8478    end -= PyUnicode_GET_LENGTH(substring);
8479    if (end < start)
8480        return 0;
8481
8482    kind_self = PyUnicode_KIND(self);
8483    data_self = PyUnicode_DATA(self);
8484    kind_sub = PyUnicode_KIND(substring);
8485    data_sub = PyUnicode_DATA(substring);
8486    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8487
8488    if (direction > 0)
8489        offset = end;
8490    else
8491        offset = start;
8492
8493    if (PyUnicode_READ(kind_self, data_self, offset) ==
8494        PyUnicode_READ(kind_sub, data_sub, 0) &&
8495        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8496        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8497        /* If both are of the same kind, memcmp is sufficient */
8498        if (kind_self == kind_sub) {
8499            return ! memcmp((char *)data_self +
8500                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8501                            data_sub,
8502                            PyUnicode_GET_LENGTH(substring) *
8503                                PyUnicode_CHARACTER_SIZE(substring));
8504        }
8505        /* otherwise we have to compare each character by first accesing it */
8506        else {
8507            /* We do not need to compare 0 and len(substring)-1 because
8508               the if statement above ensured already that they are equal
8509               when we end up here. */
8510            // TODO: honor direction and do a forward or backwards search
8511            for (i = 1; i < end_sub; ++i) {
8512                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8513                    PyUnicode_READ(kind_sub, data_sub, i))
8514                    return 0;
8515            }
8516            return 1;
8517        }
8518    }
8519
8520    return 0;
8521}
8522
8523Py_ssize_t
8524PyUnicode_Tailmatch(PyObject *str,
8525                    PyObject *substr,
8526                    Py_ssize_t start,
8527                    Py_ssize_t end,
8528                    int direction)
8529{
8530    Py_ssize_t result;
8531
8532    str = PyUnicode_FromObject(str);
8533    if (str == NULL)
8534        return -1;
8535    substr = PyUnicode_FromObject(substr);
8536    if (substr == NULL) {
8537        Py_DECREF(str);
8538        return -1;
8539    }
8540
8541    result = tailmatch((PyUnicodeObject *)str,
8542                       (PyUnicodeObject *)substr,
8543                       start, end, direction);
8544    Py_DECREF(str);
8545    Py_DECREF(substr);
8546    return result;
8547}
8548
8549/* Apply fixfct filter to the Unicode object self and return a
8550   reference to the modified object */
8551
8552static PyObject *
8553fixup(PyUnicodeObject *self,
8554      Py_UCS4 (*fixfct)(PyUnicodeObject *s))
8555{
8556    PyObject *u;
8557    Py_UCS4 maxchar_old, maxchar_new = 0;
8558
8559    if (PyUnicode_READY(self) == -1)
8560        return NULL;
8561    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8562    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8563                      maxchar_old);
8564    if (u == NULL)
8565        return NULL;
8566
8567    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8568              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8569
8570    /* fix functions return the new maximum character in a string,
8571       if the kind of the resulting unicode object does not change,
8572       everything is fine.  Otherwise we need to change the string kind
8573       and re-run the fix function. */
8574    maxchar_new = fixfct((PyUnicodeObject*)u);
8575    if (maxchar_new == 0)
8576        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8577    else if (maxchar_new <= 127)
8578        maxchar_new = 127;
8579    else if (maxchar_new <= 255)
8580        maxchar_new = 255;
8581    else if (maxchar_new <= 65535)
8582        maxchar_new = 65535;
8583    else
8584        maxchar_new = 1114111; /* 0x10ffff */
8585
8586    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8587        /* fixfct should return TRUE if it modified the buffer. If
8588           FALSE, return a reference to the original buffer instead
8589           (to save space, not time) */
8590        Py_INCREF(self);
8591        Py_DECREF(u);
8592        return (PyObject*) self;
8593    }
8594    else if (maxchar_new == maxchar_old) {
8595        return u;
8596    }
8597    else {
8598        /* In case the maximum character changed, we need to
8599           convert the string to the new category. */
8600        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8601        if (v == NULL) {
8602            Py_DECREF(u);
8603            return NULL;
8604        }
8605        if (maxchar_new > maxchar_old) {
8606            /* If the maxchar increased so that the kind changed, not all
8607               characters are representable anymore and we need to fix the
8608               string again. This only happens in very few cases. */
8609            if (PyUnicode_CopyCharacters(v, 0,
8610                                         (PyObject*)self, 0,
8611                                         PyUnicode_GET_LENGTH(self)) < 0)
8612            {
8613                Py_DECREF(u);
8614                return NULL;
8615            }
8616            maxchar_old = fixfct((PyUnicodeObject*)v);
8617            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8618        }
8619        else {
8620            if (PyUnicode_CopyCharacters(v, 0,
8621                                         u, 0,
8622                                         PyUnicode_GET_LENGTH(self)) < 0)
8623            {
8624                Py_DECREF(u);
8625                return NULL;
8626            }
8627        }
8628
8629        Py_DECREF(u);
8630        return v;
8631    }
8632}
8633
8634static Py_UCS4
8635fixupper(PyUnicodeObject *self)
8636{
8637    /* No need to call PyUnicode_READY(self) because this function is only
8638       called as a callback from fixup() which does it already. */
8639    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8640    const int kind = PyUnicode_KIND(self);
8641    void *data = PyUnicode_DATA(self);
8642    int touched = 0;
8643    Py_UCS4 maxchar = 0;
8644    Py_ssize_t i;
8645
8646    for (i = 0; i < len; ++i) {
8647        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8648        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8649        if (up != ch) {
8650            if (up > maxchar)
8651                maxchar = up;
8652            PyUnicode_WRITE(kind, data, i, up);
8653            touched = 1;
8654        }
8655        else if (ch > maxchar)
8656            maxchar = ch;
8657    }
8658
8659    if (touched)
8660        return maxchar;
8661    else
8662        return 0;
8663}
8664
8665static Py_UCS4
8666fixlower(PyUnicodeObject *self)
8667{
8668    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8669    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8670    const int kind = PyUnicode_KIND(self);
8671    void *data = PyUnicode_DATA(self);
8672    int touched = 0;
8673    Py_UCS4 maxchar = 0;
8674    Py_ssize_t i;
8675
8676    for(i = 0; i < len; ++i) {
8677        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8678        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8679        if (lo != ch) {
8680            if (lo > maxchar)
8681                maxchar = lo;
8682            PyUnicode_WRITE(kind, data, i, lo);
8683            touched = 1;
8684        }
8685        else if (ch > maxchar)
8686            maxchar = ch;
8687    }
8688
8689    if (touched)
8690        return maxchar;
8691    else
8692        return 0;
8693}
8694
8695static Py_UCS4
8696fixswapcase(PyUnicodeObject *self)
8697{
8698    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8699    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8700    const int kind = PyUnicode_KIND(self);
8701    void *data = PyUnicode_DATA(self);
8702    int touched = 0;
8703    Py_UCS4 maxchar = 0;
8704    Py_ssize_t i;
8705
8706    for(i = 0; i < len; ++i) {
8707        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8708        Py_UCS4 nu = 0;
8709
8710        if (Py_UNICODE_ISUPPER(ch))
8711            nu = Py_UNICODE_TOLOWER(ch);
8712        else if (Py_UNICODE_ISLOWER(ch))
8713            nu = Py_UNICODE_TOUPPER(ch);
8714
8715        if (nu != 0) {
8716            if (nu > maxchar)
8717                maxchar = nu;
8718            PyUnicode_WRITE(kind, data, i, nu);
8719            touched = 1;
8720        }
8721        else if (ch > maxchar)
8722            maxchar = ch;
8723    }
8724
8725    if (touched)
8726        return maxchar;
8727    else
8728        return 0;
8729}
8730
8731static Py_UCS4
8732fixcapitalize(PyUnicodeObject *self)
8733{
8734    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8735    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8736    const int kind = PyUnicode_KIND(self);
8737    void *data = PyUnicode_DATA(self);
8738    int touched = 0;
8739    Py_UCS4 maxchar = 0;
8740    Py_ssize_t i = 0;
8741    Py_UCS4 ch;
8742
8743    if (len == 0)
8744        return 0;
8745
8746    ch = PyUnicode_READ(kind, data, i);
8747    if (!Py_UNICODE_ISUPPER(ch)) {
8748        maxchar = Py_UNICODE_TOUPPER(ch);
8749        PyUnicode_WRITE(kind, data, i, maxchar);
8750        touched = 1;
8751    }
8752    ++i;
8753    for(; i < len; ++i) {
8754        ch = PyUnicode_READ(kind, data, i);
8755        if (!Py_UNICODE_ISLOWER(ch)) {
8756            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8757            if (lo > maxchar)
8758                maxchar = lo;
8759            PyUnicode_WRITE(kind, data, i, lo);
8760            touched = 1;
8761        }
8762        else if (ch > maxchar)
8763            maxchar = ch;
8764    }
8765
8766    if (touched)
8767        return maxchar;
8768    else
8769        return 0;
8770}
8771
8772static Py_UCS4
8773fixtitle(PyUnicodeObject *self)
8774{
8775    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8776    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8777    const int kind = PyUnicode_KIND(self);
8778    void *data = PyUnicode_DATA(self);
8779    Py_UCS4 maxchar = 0;
8780    Py_ssize_t i = 0;
8781    int previous_is_cased;
8782
8783    /* Shortcut for single character strings */
8784    if (len == 1) {
8785        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8786        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8787        if (ti != ch) {
8788            PyUnicode_WRITE(kind, data, i, ti);
8789            return ti;
8790        }
8791        else
8792            return 0;
8793    }
8794    previous_is_cased = 0;
8795    for(; i < len; ++i) {
8796        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8797        Py_UCS4 nu;
8798
8799        if (previous_is_cased)
8800            nu = Py_UNICODE_TOLOWER(ch);
8801        else
8802            nu = Py_UNICODE_TOTITLE(ch);
8803
8804        if (nu > maxchar)
8805            maxchar = nu;
8806        PyUnicode_WRITE(kind, data, i, nu);
8807
8808        if (Py_UNICODE_ISLOWER(ch) ||
8809            Py_UNICODE_ISUPPER(ch) ||
8810            Py_UNICODE_ISTITLE(ch))
8811            previous_is_cased = 1;
8812        else
8813            previous_is_cased = 0;
8814    }
8815    return maxchar;
8816}
8817
8818PyObject *
8819PyUnicode_Join(PyObject *separator, PyObject *seq)
8820{
8821    PyObject *sep = NULL;
8822    Py_ssize_t seplen = 1;
8823    PyObject *res = NULL; /* the result */
8824    PyObject *fseq;          /* PySequence_Fast(seq) */
8825    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
8826    PyObject **items;
8827    PyObject *item;
8828    Py_ssize_t sz, i, res_offset;
8829    Py_UCS4 maxchar = 0;
8830    Py_UCS4 item_maxchar;
8831
8832    fseq = PySequence_Fast(seq, "");
8833    if (fseq == NULL) {
8834        return NULL;
8835    }
8836
8837    /* NOTE: the following code can't call back into Python code,
8838     * so we are sure that fseq won't be mutated.
8839     */
8840
8841    seqlen = PySequence_Fast_GET_SIZE(fseq);
8842    /* If empty sequence, return u"". */
8843    if (seqlen == 0) {
8844        res = PyUnicode_New(0, 0);
8845        goto Done;
8846    }
8847    items = PySequence_Fast_ITEMS(fseq);
8848    /* If singleton sequence with an exact Unicode, return that. */
8849    if (seqlen == 1) {
8850        item = items[0];
8851        if (PyUnicode_CheckExact(item)) {
8852            Py_INCREF(item);
8853            res = item;
8854            goto Done;
8855        }
8856    }
8857    else {
8858        /* Set up sep and seplen */
8859        if (separator == NULL) {
8860            /* fall back to a blank space separator */
8861            sep = PyUnicode_FromOrdinal(' ');
8862            if (!sep)
8863                goto onError;
8864        }
8865        else {
8866            if (!PyUnicode_Check(separator)) {
8867                PyErr_Format(PyExc_TypeError,
8868                             "separator: expected str instance,"
8869                             " %.80s found",
8870                             Py_TYPE(separator)->tp_name);
8871                goto onError;
8872            }
8873            if (PyUnicode_READY(separator))
8874                goto onError;
8875            sep = separator;
8876            seplen = PyUnicode_GET_LENGTH(separator);
8877            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8878            /* inc refcount to keep this code path symetric with the
8879               above case of a blank separator */
8880            Py_INCREF(sep);
8881        }
8882    }
8883
8884    /* There are at least two things to join, or else we have a subclass
8885     * of str in the sequence.
8886     * Do a pre-pass to figure out the total amount of space we'll
8887     * need (sz), and see whether all argument are strings.
8888     */
8889    sz = 0;
8890    for (i = 0; i < seqlen; i++) {
8891        const Py_ssize_t old_sz = sz;
8892        item = items[i];
8893        if (!PyUnicode_Check(item)) {
8894            PyErr_Format(PyExc_TypeError,
8895                         "sequence item %zd: expected str instance,"
8896                         " %.80s found",
8897                         i, Py_TYPE(item)->tp_name);
8898            goto onError;
8899        }
8900        if (PyUnicode_READY(item) == -1)
8901            goto onError;
8902        sz += PyUnicode_GET_LENGTH(item);
8903        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8904        if (item_maxchar > maxchar)
8905            maxchar = item_maxchar;
8906        if (i != 0)
8907            sz += seplen;
8908        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8909            PyErr_SetString(PyExc_OverflowError,
8910                            "join() result is too long for a Python string");
8911            goto onError;
8912        }
8913    }
8914
8915    res = PyUnicode_New(sz, maxchar);
8916    if (res == NULL)
8917        goto onError;
8918
8919    /* Catenate everything. */
8920    for (i = 0, res_offset = 0; i < seqlen; ++i) {
8921        Py_ssize_t itemlen, copied;
8922        item = items[i];
8923        /* Copy item, and maybe the separator. */
8924        if (i && seplen != 0) {
8925            copied = PyUnicode_CopyCharacters(res, res_offset,
8926                                              sep, 0, seplen);
8927            if (copied < 0)
8928                goto onError;
8929#ifdef Py_DEBUG
8930            res_offset += copied;
8931#else
8932            res_offset += seplen;
8933#endif
8934        }
8935        itemlen = PyUnicode_GET_LENGTH(item);
8936        if (itemlen != 0) {
8937            copied = PyUnicode_CopyCharacters(res, res_offset,
8938                                              item, 0, itemlen);
8939            if (copied < 0)
8940                goto onError;
8941#ifdef Py_DEBUG
8942            res_offset += copied;
8943#else
8944            res_offset += itemlen;
8945#endif
8946        }
8947    }
8948    assert(res_offset == PyUnicode_GET_LENGTH(res));
8949
8950  Done:
8951    Py_DECREF(fseq);
8952    Py_XDECREF(sep);
8953    return res;
8954
8955  onError:
8956    Py_DECREF(fseq);
8957    Py_XDECREF(sep);
8958    Py_XDECREF(res);
8959    return NULL;
8960}
8961
/* Store `value` into `length` consecutive characters of the buffer `data`
   (whose character width is given by `kind`), beginning at character
   index `start`.  1-byte buffers are filled with memset(); 2-byte buffers
   and, as the default case, all remaining kinds (treated as Py_UCS4) are
   filled with a loop.  `kind` must not be PyUnicode_WCHAR_KIND. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t filled_ = 0; \
        assert(kind != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * dst_ = (unsigned char *)((data)) + (start); \
            memset(dst_, (unsigned char)value, length); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * dst_ = (Py_UCS2 *)((data)) + (start); \
            while (filled_ < (length)) { \
                *dst_ = (value); \
                ++filled_; \
                ++dst_; \
            } \
            break; \
        } \
        default: { \
            Py_UCS4 * dst_ = (Py_UCS4 *)((data)) + (start); \
            while (filled_ < (length)) { \
                *dst_ = (value); \
                ++filled_; \
                ++dst_; \
            } \
            break; \
        } \
        } \
    } while (0)
8984
8985static PyUnicodeObject *
8986pad(PyUnicodeObject *self,
8987    Py_ssize_t left,
8988    Py_ssize_t right,
8989    Py_UCS4 fill)
8990{
8991    PyObject *u;
8992    Py_UCS4 maxchar;
8993    int kind;
8994    void *data;
8995
8996    if (left < 0)
8997        left = 0;
8998    if (right < 0)
8999        right = 0;
9000
9001    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9002        Py_INCREF(self);
9003        return self;
9004    }
9005
9006    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9007        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9008        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9009        return NULL;
9010    }
9011    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9012    if (fill > maxchar)
9013        maxchar = fill;
9014    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9015    if (!u)
9016        return NULL;
9017
9018    kind = PyUnicode_KIND(u);
9019    data = PyUnicode_DATA(u);
9020    if (left)
9021        FILL(kind, data, fill, 0, left);
9022    if (right)
9023        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9024    if (PyUnicode_CopyCharacters(u, left,
9025                                 (PyObject*)self, 0,
9026                                 _PyUnicode_LENGTH(self)) < 0)
9027    {
9028        Py_DECREF(u);
9029        return NULL;
9030    }
9031
9032    return (PyUnicodeObject*)u;
9033}
9034#undef FILL
9035
9036PyObject *
9037PyUnicode_Splitlines(PyObject *string, int keepends)
9038{
9039    PyObject *list;
9040
9041    string = PyUnicode_FromObject(string);
9042    if (string == NULL || PyUnicode_READY(string) == -1)
9043        return NULL;
9044
9045    switch(PyUnicode_KIND(string)) {
9046    case PyUnicode_1BYTE_KIND:
9047        list = ucs1lib_splitlines(
9048            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9049            PyUnicode_GET_LENGTH(string), keepends);
9050        break;
9051    case PyUnicode_2BYTE_KIND:
9052        list = ucs2lib_splitlines(
9053            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9054            PyUnicode_GET_LENGTH(string), keepends);
9055        break;
9056    case PyUnicode_4BYTE_KIND:
9057        list = ucs4lib_splitlines(
9058            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9059            PyUnicode_GET_LENGTH(string), keepends);
9060        break;
9061    default:
9062        assert(0);
9063        list = 0;
9064    }
9065    Py_DECREF(string);
9066    return list;
9067}
9068
9069static PyObject *
9070split(PyUnicodeObject *self,
9071      PyUnicodeObject *substring,
9072      Py_ssize_t maxcount)
9073{
9074    int kind1, kind2, kind;
9075    void *buf1, *buf2;
9076    Py_ssize_t len1, len2;
9077    PyObject* out;
9078
9079    if (maxcount < 0)
9080        maxcount = PY_SSIZE_T_MAX;
9081
9082    if (PyUnicode_READY(self) == -1)
9083        return NULL;
9084
9085    if (substring == NULL)
9086        switch(PyUnicode_KIND(self)) {
9087        case PyUnicode_1BYTE_KIND:
9088            return ucs1lib_split_whitespace(
9089                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9090                PyUnicode_GET_LENGTH(self), maxcount
9091                );
9092        case PyUnicode_2BYTE_KIND:
9093            return ucs2lib_split_whitespace(
9094                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9095                PyUnicode_GET_LENGTH(self), maxcount
9096                );
9097        case PyUnicode_4BYTE_KIND:
9098            return ucs4lib_split_whitespace(
9099                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9100                PyUnicode_GET_LENGTH(self), maxcount
9101                );
9102        default:
9103            assert(0);
9104            return NULL;
9105        }
9106
9107    if (PyUnicode_READY(substring) == -1)
9108        return NULL;
9109
9110    kind1 = PyUnicode_KIND(self);
9111    kind2 = PyUnicode_KIND(substring);
9112    kind = kind1 > kind2 ? kind1 : kind2;
9113    buf1 = PyUnicode_DATA(self);
9114    buf2 = PyUnicode_DATA(substring);
9115    if (kind1 != kind)
9116        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9117    if (!buf1)
9118        return NULL;
9119    if (kind2 != kind)
9120        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9121    if (!buf2) {
9122        if (kind1 != kind) PyMem_Free(buf1);
9123        return NULL;
9124    }
9125    len1 = PyUnicode_GET_LENGTH(self);
9126    len2 = PyUnicode_GET_LENGTH(substring);
9127
9128    switch(kind) {
9129    case PyUnicode_1BYTE_KIND:
9130        out = ucs1lib_split(
9131            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9132        break;
9133    case PyUnicode_2BYTE_KIND:
9134        out = ucs2lib_split(
9135            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9136        break;
9137    case PyUnicode_4BYTE_KIND:
9138        out = ucs4lib_split(
9139            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9140        break;
9141    default:
9142        out = NULL;
9143    }
9144    if (kind1 != kind)
9145        PyMem_Free(buf1);
9146    if (kind2 != kind)
9147        PyMem_Free(buf2);
9148    return out;
9149}
9150
9151static PyObject *
9152rsplit(PyUnicodeObject *self,
9153       PyUnicodeObject *substring,
9154       Py_ssize_t maxcount)
9155{
9156    int kind1, kind2, kind;
9157    void *buf1, *buf2;
9158    Py_ssize_t len1, len2;
9159    PyObject* out;
9160
9161    if (maxcount < 0)
9162        maxcount = PY_SSIZE_T_MAX;
9163
9164    if (PyUnicode_READY(self) == -1)
9165        return NULL;
9166
9167    if (substring == NULL)
9168        switch(PyUnicode_KIND(self)) {
9169        case PyUnicode_1BYTE_KIND:
9170            return ucs1lib_rsplit_whitespace(
9171                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9172                PyUnicode_GET_LENGTH(self), maxcount
9173                );
9174        case PyUnicode_2BYTE_KIND:
9175            return ucs2lib_rsplit_whitespace(
9176                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9177                PyUnicode_GET_LENGTH(self), maxcount
9178                );
9179        case PyUnicode_4BYTE_KIND:
9180            return ucs4lib_rsplit_whitespace(
9181                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9182                PyUnicode_GET_LENGTH(self), maxcount
9183                );
9184        default:
9185            assert(0);
9186            return NULL;
9187        }
9188
9189    if (PyUnicode_READY(substring) == -1)
9190        return NULL;
9191
9192    kind1 = PyUnicode_KIND(self);
9193    kind2 = PyUnicode_KIND(substring);
9194    kind = kind1 > kind2 ? kind1 : kind2;
9195    buf1 = PyUnicode_DATA(self);
9196    buf2 = PyUnicode_DATA(substring);
9197    if (kind1 != kind)
9198        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9199    if (!buf1)
9200        return NULL;
9201    if (kind2 != kind)
9202        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9203    if (!buf2) {
9204        if (kind1 != kind) PyMem_Free(buf1);
9205        return NULL;
9206    }
9207    len1 = PyUnicode_GET_LENGTH(self);
9208    len2 = PyUnicode_GET_LENGTH(substring);
9209
9210    switch(kind) {
9211    case PyUnicode_1BYTE_KIND:
9212        out = ucs1lib_rsplit(
9213            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9214        break;
9215    case PyUnicode_2BYTE_KIND:
9216        out = ucs2lib_rsplit(
9217            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9218        break;
9219    case PyUnicode_4BYTE_KIND:
9220        out = ucs4lib_rsplit(
9221            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9222        break;
9223    default:
9224        out = NULL;
9225    }
9226    if (kind1 != kind)
9227        PyMem_Free(buf1);
9228    if (kind2 != kind)
9229        PyMem_Free(buf2);
9230    return out;
9231}
9232
9233static Py_ssize_t
9234anylib_find(int kind, void *buf1, Py_ssize_t len1,
9235            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9236{
9237    switch(kind) {
9238    case PyUnicode_1BYTE_KIND:
9239        return ucs1lib_find(buf1, len1, buf2, len2, offset);
9240    case PyUnicode_2BYTE_KIND:
9241        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9242    case PyUnicode_4BYTE_KIND:
9243        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9244    }
9245    assert(0);
9246    return -1;
9247}
9248
9249static Py_ssize_t
9250anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9251             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9252{
9253        switch(kind) {
9254        case PyUnicode_1BYTE_KIND:
9255            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9256        case PyUnicode_2BYTE_KIND:
9257            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9258        case PyUnicode_4BYTE_KIND:
9259            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9260        }
9261        assert(0);
9262        return 0;
9263}
9264
9265static PyObject *
9266replace(PyObject *self, PyObject *str1,
9267        PyObject *str2, Py_ssize_t maxcount)
9268{
9269    PyObject *u;
9270    char *sbuf = PyUnicode_DATA(self);
9271    char *buf1 = PyUnicode_DATA(str1);
9272    char *buf2 = PyUnicode_DATA(str2);
9273    int srelease = 0, release1 = 0, release2 = 0;
9274    int skind = PyUnicode_KIND(self);
9275    int kind1 = PyUnicode_KIND(str1);
9276    int kind2 = PyUnicode_KIND(str2);
9277    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9278    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9279    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9280
9281    if (maxcount < 0)
9282        maxcount = PY_SSIZE_T_MAX;
9283    else if (maxcount == 0 || slen == 0)
9284        goto nothing;
9285
9286    if (skind < kind1)
9287        /* substring too wide to be present */
9288        goto nothing;
9289
9290    if (len1 == len2) {
9291        Py_ssize_t i;
9292        /* same length */
9293        if (len1 == 0)
9294            goto nothing;
9295        if (len1 == 1) {
9296            /* replace characters */
9297            Py_UCS4 u1, u2, maxchar;
9298            int mayshrink, rkind;
9299            u1 = PyUnicode_READ_CHAR(str1, 0);
9300            if (!findchar(sbuf, PyUnicode_KIND(self),
9301                          slen, u1, 1))
9302                goto nothing;
9303            u2 = PyUnicode_READ_CHAR(str2, 0);
9304            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9305            /* Replacing u1 with u2 may cause a maxchar reduction in the
9306               result string. */
9307            mayshrink = maxchar > 127;
9308            if (u2 > maxchar) {
9309                maxchar = u2;
9310                mayshrink = 0;
9311            }
9312            u = PyUnicode_New(slen, maxchar);
9313            if (!u)
9314                goto error;
9315            if (PyUnicode_CopyCharacters(u, 0,
9316                                         (PyObject*)self, 0, slen) < 0)
9317            {
9318                Py_DECREF(u);
9319                return NULL;
9320            }
9321            rkind = PyUnicode_KIND(u);
9322            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9323                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9324                    if (--maxcount < 0)
9325                        break;
9326                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9327                }
9328            if (mayshrink) {
9329                PyObject *tmp = u;
9330                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9331                                              PyUnicode_GET_LENGTH(tmp));
9332                Py_DECREF(tmp);
9333            }
9334        } else {
9335            int rkind = skind;
9336            char *res;
9337            if (kind1 < rkind) {
9338                /* widen substring */
9339                buf1 = _PyUnicode_AsKind(str1, rkind);
9340                if (!buf1) goto error;
9341                release1 = 1;
9342            }
9343            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9344            if (i < 0)
9345                goto nothing;
9346            if (rkind > kind2) {
9347                /* widen replacement */
9348                buf2 = _PyUnicode_AsKind(str2, rkind);
9349                if (!buf2) goto error;
9350                release2 = 1;
9351            }
9352            else if (rkind < kind2) {
9353                /* widen self and buf1 */
9354                rkind = kind2;
9355                if (release1) PyMem_Free(buf1);
9356                sbuf = _PyUnicode_AsKind(self, rkind);
9357                if (!sbuf) goto error;
9358                srelease = 1;
9359                buf1 = _PyUnicode_AsKind(str1, rkind);
9360                if (!buf1) goto error;
9361                release1 = 1;
9362            }
9363            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9364            if (!res) {
9365                PyErr_NoMemory();
9366                goto error;
9367            }
9368            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9369            /* change everything in-place, starting with this one */
9370            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9371                   buf2,
9372                   PyUnicode_KIND_SIZE(rkind, len2));
9373            i += len1;
9374
9375            while ( --maxcount > 0) {
9376                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9377                                slen-i,
9378                                buf1, len1, i);
9379                if (i == -1)
9380                    break;
9381                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9382                       buf2,
9383                       PyUnicode_KIND_SIZE(rkind, len2));
9384                i += len1;
9385            }
9386
9387            u = PyUnicode_FromKindAndData(rkind, res, slen);
9388            PyMem_Free(res);
9389            if (!u) goto error;
9390        }
9391    } else {
9392
9393        Py_ssize_t n, i, j, ires;
9394        Py_ssize_t product, new_size;
9395        int rkind = skind;
9396        char *res;
9397
9398        if (kind1 < rkind) {
9399            buf1 = _PyUnicode_AsKind(str1, rkind);
9400            if (!buf1) goto error;
9401            release1 = 1;
9402        }
9403        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9404        if (n == 0)
9405            goto nothing;
9406        if (kind2 < rkind) {
9407            buf2 = _PyUnicode_AsKind(str2, rkind);
9408            if (!buf2) goto error;
9409            release2 = 1;
9410        }
9411        else if (kind2 > rkind) {
9412            rkind = kind2;
9413            sbuf = _PyUnicode_AsKind(self, rkind);
9414            if (!sbuf) goto error;
9415            srelease = 1;
9416            if (release1) PyMem_Free(buf1);
9417            buf1 = _PyUnicode_AsKind(str1, rkind);
9418            if (!buf1) goto error;
9419            release1 = 1;
9420        }
9421        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9422           PyUnicode_GET_LENGTH(str1))); */
9423        product = n * (len2-len1);
9424        if ((product / (len2-len1)) != n) {
9425                PyErr_SetString(PyExc_OverflowError,
9426                                "replace string is too long");
9427                goto error;
9428        }
9429        new_size = slen + product;
9430        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9431            PyErr_SetString(PyExc_OverflowError,
9432                            "replace string is too long");
9433            goto error;
9434        }
9435        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9436        if (!res)
9437            goto error;
9438        ires = i = 0;
9439        if (len1 > 0) {
9440            while (n-- > 0) {
9441                /* look for next match */
9442                j = anylib_find(rkind,
9443                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
9444                                slen-i, buf1, len1, i);
9445                if (j == -1)
9446                    break;
9447                else if (j > i) {
9448                    /* copy unchanged part [i:j] */
9449                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9450                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9451                           PyUnicode_KIND_SIZE(rkind, j-i));
9452                    ires += j - i;
9453                }
9454                /* copy substitution string */
9455                if (len2 > 0) {
9456                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9457                           buf2,
9458                           PyUnicode_KIND_SIZE(rkind, len2));
9459                    ires += len2;
9460                }
9461                i = j + len1;
9462            }
9463            if (i < slen)
9464                /* copy tail [i:] */
9465                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9466                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9467                       PyUnicode_KIND_SIZE(rkind, slen-i));
9468        } else {
9469            /* interleave */
9470            while (n > 0) {
9471                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9472                       buf2,
9473                       PyUnicode_KIND_SIZE(rkind, len2));
9474                ires += len2;
9475                if (--n <= 0)
9476                    break;
9477                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9478                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9479                       PyUnicode_KIND_SIZE(rkind, 1));
9480                ires++;
9481                i++;
9482            }
9483            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9484                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9485                   PyUnicode_KIND_SIZE(rkind, slen-i));
9486        }
9487        u = PyUnicode_FromKindAndData(rkind, res, new_size);
9488        PyMem_Free(res);
9489    }
9490    if (srelease)
9491        PyMem_FREE(sbuf);
9492    if (release1)
9493        PyMem_FREE(buf1);
9494    if (release2)
9495        PyMem_FREE(buf2);
9496    return u;
9497
9498  nothing:
9499    /* nothing to replace; return original string (when possible) */
9500    if (srelease)
9501        PyMem_FREE(sbuf);
9502    if (release1)
9503        PyMem_FREE(buf1);
9504    if (release2)
9505        PyMem_FREE(buf2);
9506    if (PyUnicode_CheckExact(self)) {
9507        Py_INCREF(self);
9508        return (PyObject *) self;
9509    }
9510    return PyUnicode_Copy(self);
9511  error:
9512    if (srelease && sbuf)
9513        PyMem_FREE(sbuf);
9514    if (release1 && buf1)
9515        PyMem_FREE(buf1);
9516    if (release2 && buf2)
9517        PyMem_FREE(buf2);
9518    return NULL;
9519}
9520
9521/* --- Unicode Object Methods --------------------------------------------- */
9522
9523PyDoc_STRVAR(title__doc__,
9524             "S.title() -> str\n\
9525\n\
9526Return a titlecased version of S, i.e. words start with title case\n\
9527characters, all remaining cased characters have lower case.");
9528
9529static PyObject*
9530unicode_title(PyUnicodeObject *self)
9531{
9532    return fixup(self, fixtitle);
9533}
9534
9535PyDoc_STRVAR(capitalize__doc__,
9536             "S.capitalize() -> str\n\
9537\n\
9538Return a capitalized version of S, i.e. make the first character\n\
9539have upper case and the rest lower case.");
9540
9541static PyObject*
9542unicode_capitalize(PyUnicodeObject *self)
9543{
9544    return fixup(self, fixcapitalize);
9545}
9546
/* Disabled implementation of a capwords() method; the whole section is
   compiled out by the `#if 0` below. */
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split `self` on whitespace, run fixup(..., fixcapitalize) on every
   word, and join the words again with PyUnicode_Join(NULL, list).
   Returns NULL if the split or any fixup() call fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* Release the old word explicitly before storing the
           capitalized one in its slot. */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Reached on success as well as on failure; on the failure path
       `item` is NULL, which becomes the return value. */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
9584
9585/* Argument converter.  Coerces to a single unicode character */
9586
9587static int
9588convert_uc(PyObject *obj, void *addr)
9589{
9590    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9591    PyObject *uniobj;
9592
9593    uniobj = PyUnicode_FromObject(obj);
9594    if (uniobj == NULL) {
9595        PyErr_SetString(PyExc_TypeError,
9596                        "The fill character cannot be converted to Unicode");
9597        return 0;
9598    }
9599    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9600        PyErr_SetString(PyExc_TypeError,
9601                        "The fill character must be exactly one character long");
9602        Py_DECREF(uniobj);
9603        return 0;
9604    }
9605    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9606    Py_DECREF(uniobj);
9607    return 1;
9608}
9609
9610PyDoc_STRVAR(center__doc__,
9611             "S.center(width[, fillchar]) -> str\n\
9612\n\
9613Return S centered in a string of length width. Padding is\n\
9614done using the specified fill character (default is a space)");
9615
9616static PyObject *
9617unicode_center(PyUnicodeObject *self, PyObject *args)
9618{
9619    Py_ssize_t marg, left;
9620    Py_ssize_t width;
9621    Py_UCS4 fillchar = ' ';
9622
9623    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9624        return NULL;
9625
9626    if (PyUnicode_READY(self) == -1)
9627        return NULL;
9628
9629    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9630        Py_INCREF(self);
9631        return (PyObject*) self;
9632    }
9633
9634    marg = width - _PyUnicode_LENGTH(self);
9635    left = marg / 2 + (marg & width & 1);
9636
9637    return (PyObject*) pad(self, left, marg - left, fillchar);
9638}
9639
9640#if 0
9641
9642/* This code should go into some future Unicode collation support
9643   module. The basic comparison should compare ordinals on a naive
9644   basis (this is what Java does and thus Jython too). */
9645
9646/* speedy UTF-16 code point order comparison */
9647/* gleaned from: */
9648/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9649
9650static short utf16Fixup[32] =
9651{
9652    0, 0, 0, 0, 0, 0, 0, 0,
9653    0, 0, 0, 0, 0, 0, 0, 0,
9654    0, 0, 0, 0, 0, 0, 0, 0,
9655    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
9656};
9657
9658static int
9659unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9660{
9661    Py_ssize_t len1, len2;
9662
9663    Py_UNICODE *s1 = str1->str;
9664    Py_UNICODE *s2 = str2->str;
9665
9666    len1 = str1->_base._base.length;
9667    len2 = str2->_base._base.length;
9668
9669    while (len1 > 0 && len2 > 0) {
9670        Py_UNICODE c1, c2;
9671
9672        c1 = *s1++;
9673        c2 = *s2++;
9674
9675        if (c1 > (1<<11) * 26)
9676            c1 += utf16Fixup[c1>>11];
9677        if (c2 > (1<<11) * 26)
9678            c2 += utf16Fixup[c2>>11];
9679        /* now c1 and c2 are in UTF-32-compatible order */
9680
9681        if (c1 != c2)
9682            return (c1 < c2) ? -1 : 1;
9683
9684        len1--; len2--;
9685    }
9686
9687    return (len1 < len2) ? -1 : (len1 != len2);
9688}
9689
9690#else
9691
9692/* This function assumes that str1 and str2 are readied by the caller. */
9693
9694static int
9695unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9696{
9697    int kind1, kind2;
9698    void *data1, *data2;
9699    Py_ssize_t len1, len2, i;
9700
9701    kind1 = PyUnicode_KIND(str1);
9702    kind2 = PyUnicode_KIND(str2);
9703    data1 = PyUnicode_DATA(str1);
9704    data2 = PyUnicode_DATA(str2);
9705    len1 = PyUnicode_GET_LENGTH(str1);
9706    len2 = PyUnicode_GET_LENGTH(str2);
9707
9708    for (i = 0; i < len1 && i < len2; ++i) {
9709        Py_UCS4 c1, c2;
9710        c1 = PyUnicode_READ(kind1, data1, i);
9711        c2 = PyUnicode_READ(kind2, data2, i);
9712
9713        if (c1 != c2)
9714            return (c1 < c2) ? -1 : 1;
9715    }
9716
9717    return (len1 < len2) ? -1 : (len1 != len2);
9718}
9719
9720#endif
9721
9722int
9723PyUnicode_Compare(PyObject *left, PyObject *right)
9724{
9725    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9726        if (PyUnicode_READY(left) == -1 ||
9727            PyUnicode_READY(right) == -1)
9728            return -1;
9729        return unicode_compare((PyUnicodeObject *)left,
9730                               (PyUnicodeObject *)right);
9731    }
9732    PyErr_Format(PyExc_TypeError,
9733                 "Can't compare %.100s and %.100s",
9734                 left->ob_type->tp_name,
9735                 right->ob_type->tp_name);
9736    return -1;
9737}
9738
9739int
9740PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9741{
9742    Py_ssize_t i;
9743    int kind;
9744    void *data;
9745    Py_UCS4 chr;
9746
9747    assert(_PyUnicode_CHECK(uni));
9748    if (PyUnicode_READY(uni) == -1)
9749        return -1;
9750    kind = PyUnicode_KIND(uni);
9751    data = PyUnicode_DATA(uni);
9752    /* Compare Unicode string and source character set string */
9753    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9754        if (chr != str[i])
9755            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9756    /* This check keeps Python strings that end in '\0' from comparing equal
9757     to C strings identical up to that point. */
9758    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9759        return 1; /* uni is longer */
9760    if (str[i])
9761        return -1; /* str is longer */
9762    return 0;
9763}
9764
9765
9766#define TEST_COND(cond)                         \
9767    ((cond) ? Py_True : Py_False)
9768
9769PyObject *
9770PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
9771{
9772    int result;
9773
9774    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9775        PyObject *v;
9776        if (PyUnicode_READY(left) == -1 ||
9777            PyUnicode_READY(right) == -1)
9778            return NULL;
9779        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9780            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
9781            if (op == Py_EQ) {
9782                Py_INCREF(Py_False);
9783                return Py_False;
9784            }
9785            if (op == Py_NE) {
9786                Py_INCREF(Py_True);
9787                return Py_True;
9788            }
9789        }
9790        if (left == right)
9791            result = 0;
9792        else
9793            result = unicode_compare((PyUnicodeObject *)left,
9794                                     (PyUnicodeObject *)right);
9795
9796        /* Convert the return value to a Boolean */
9797        switch (op) {
9798        case Py_EQ:
9799            v = TEST_COND(result == 0);
9800            break;
9801        case Py_NE:
9802            v = TEST_COND(result != 0);
9803            break;
9804        case Py_LE:
9805            v = TEST_COND(result <= 0);
9806            break;
9807        case Py_GE:
9808            v = TEST_COND(result >= 0);
9809            break;
9810        case Py_LT:
9811            v = TEST_COND(result == -1);
9812            break;
9813        case Py_GT:
9814            v = TEST_COND(result == 1);
9815            break;
9816        default:
9817            PyErr_BadArgument();
9818            return NULL;
9819        }
9820        Py_INCREF(v);
9821        return v;
9822    }
9823
9824    Py_RETURN_NOTIMPLEMENTED;
9825}
9826
9827int
9828PyUnicode_Contains(PyObject *container, PyObject *element)
9829{
9830    PyObject *str, *sub;
9831    int kind1, kind2, kind;
9832    void *buf1, *buf2;
9833    Py_ssize_t len1, len2;
9834    int result;
9835
9836    /* Coerce the two arguments */
9837    sub = PyUnicode_FromObject(element);
9838    if (!sub) {
9839        PyErr_Format(PyExc_TypeError,
9840                     "'in <string>' requires string as left operand, not %s",
9841                     element->ob_type->tp_name);
9842        return -1;
9843    }
9844    if (PyUnicode_READY(sub) == -1)
9845        return -1;
9846
9847    str = PyUnicode_FromObject(container);
9848    if (!str || PyUnicode_READY(str) == -1) {
9849        Py_DECREF(sub);
9850        return -1;
9851    }
9852
9853    kind1 = PyUnicode_KIND(str);
9854    kind2 = PyUnicode_KIND(sub);
9855    kind = kind1 > kind2 ? kind1 : kind2;
9856    buf1 = PyUnicode_DATA(str);
9857    buf2 = PyUnicode_DATA(sub);
9858    if (kind1 != kind)
9859        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9860    if (!buf1) {
9861        Py_DECREF(sub);
9862        return -1;
9863    }
9864    if (kind2 != kind)
9865        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9866    if (!buf2) {
9867        Py_DECREF(sub);
9868        if (kind1 != kind) PyMem_Free(buf1);
9869        return -1;
9870    }
9871    len1 = PyUnicode_GET_LENGTH(str);
9872    len2 = PyUnicode_GET_LENGTH(sub);
9873
9874    switch(kind) {
9875    case PyUnicode_1BYTE_KIND:
9876        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9877        break;
9878    case PyUnicode_2BYTE_KIND:
9879        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9880        break;
9881    case PyUnicode_4BYTE_KIND:
9882        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9883        break;
9884    default:
9885        result = -1;
9886        assert(0);
9887    }
9888
9889    Py_DECREF(str);
9890    Py_DECREF(sub);
9891
9892    if (kind1 != kind)
9893        PyMem_Free(buf1);
9894    if (kind2 != kind)
9895        PyMem_Free(buf2);
9896
9897    return result;
9898}
9899
9900/* Concat to string or Unicode object giving a new Unicode object. */
9901
9902PyObject *
9903PyUnicode_Concat(PyObject *left, PyObject *right)
9904{
9905    PyObject *u = NULL, *v = NULL, *w;
9906    Py_UCS4 maxchar;
9907
9908    /* Coerce the two arguments */
9909    u = PyUnicode_FromObject(left);
9910    if (u == NULL)
9911        goto onError;
9912    v = PyUnicode_FromObject(right);
9913    if (v == NULL)
9914        goto onError;
9915
9916    /* Shortcuts */
9917    if (v == unicode_empty) {
9918        Py_DECREF(v);
9919        return u;
9920    }
9921    if (u == unicode_empty) {
9922        Py_DECREF(u);
9923        return v;
9924    }
9925
9926    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
9927    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
9928
9929    /* Concat the two Unicode strings */
9930    w = PyUnicode_New(
9931        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9932        maxchar);
9933    if (w == NULL)
9934        goto onError;
9935    if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9936        goto onError;
9937    if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
9938                                 v, 0,
9939                                 PyUnicode_GET_LENGTH(v)) < 0)
9940        goto onError;
9941    Py_DECREF(u);
9942    Py_DECREF(v);
9943    return w;
9944
9945  onError:
9946    Py_XDECREF(u);
9947    Py_XDECREF(v);
9948    return NULL;
9949}
9950
9951void
9952PyUnicode_Append(PyObject **p_left, PyObject *right)
9953{
9954    PyObject *left, *res;
9955
9956    if (p_left == NULL) {
9957        if (!PyErr_Occurred())
9958            PyErr_BadInternalCall();
9959        return;
9960    }
9961    left = *p_left;
9962    if (right == NULL || !PyUnicode_Check(left)) {
9963        if (!PyErr_Occurred())
9964            PyErr_BadInternalCall();
9965        goto error;
9966    }
9967
9968    if (PyUnicode_CheckExact(left) && left != unicode_empty
9969        && PyUnicode_CheckExact(right) && right != unicode_empty
9970        && unicode_resizable(left)
9971        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9972            || _PyUnicode_WSTR(left) != NULL))
9973    {
9974        Py_ssize_t left_len, right_len, new_len;
9975#ifdef Py_DEBUG
9976        Py_ssize_t copied;
9977#endif
9978
9979        if (PyUnicode_READY(left))
9980            goto error;
9981        if (PyUnicode_READY(right))
9982            goto error;
9983
9984        /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9985        if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9986        {
9987            left_len = PyUnicode_GET_LENGTH(left);
9988            right_len = PyUnicode_GET_LENGTH(right);
9989            if (left_len > PY_SSIZE_T_MAX - right_len) {
9990                PyErr_SetString(PyExc_OverflowError,
9991                                "strings are too large to concat");
9992                goto error;
9993            }
9994            new_len = left_len + right_len;
9995
9996            /* Now we own the last reference to 'left', so we can resize it
9997             * in-place.
9998             */
9999            if (unicode_resize(&left, new_len) != 0) {
10000                /* XXX if _PyUnicode_Resize() fails, 'left' has been
10001                 * deallocated so it cannot be put back into
10002                 * 'variable'.  The MemoryError is raised when there
10003                 * is no value in 'variable', which might (very
10004                 * remotely) be a cause of incompatibilities.
10005                 */
10006                goto error;
10007            }
10008            /* copy 'right' into the newly allocated area of 'left' */
10009#ifdef Py_DEBUG
10010            copied = PyUnicode_CopyCharacters(left, left_len,
10011                                              right, 0,
10012                                              right_len);
10013            assert(0 <= copied);
10014#else
10015            PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10016#endif
10017            *p_left = left;
10018            return;
10019        }
10020    }
10021
10022    res = PyUnicode_Concat(left, right);
10023    if (res == NULL)
10024        goto error;
10025    Py_DECREF(left);
10026    *p_left = res;
10027    return;
10028
10029error:
10030    Py_DECREF(*p_left);
10031    *p_left = NULL;
10032}
10033
10034void
10035PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10036{
10037    PyUnicode_Append(pleft, right);
10038    Py_XDECREF(right);
10039}
10040
10041PyDoc_STRVAR(count__doc__,
10042             "S.count(sub[, start[, end]]) -> int\n\
10043\n\
10044Return the number of non-overlapping occurrences of substring sub in\n\
10045string S[start:end].  Optional arguments start and end are\n\
10046interpreted as in slice notation.");
10047
10048static PyObject *
10049unicode_count(PyUnicodeObject *self, PyObject *args)
10050{
10051    PyUnicodeObject *substring;
10052    Py_ssize_t start = 0;
10053    Py_ssize_t end = PY_SSIZE_T_MAX;
10054    PyObject *result;
10055    int kind1, kind2, kind;
10056    void *buf1, *buf2;
10057    Py_ssize_t len1, len2, iresult;
10058
10059    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10060                                            &start, &end))
10061        return NULL;
10062
10063    kind1 = PyUnicode_KIND(self);
10064    kind2 = PyUnicode_KIND(substring);
10065    kind = kind1 > kind2 ? kind1 : kind2;
10066    buf1 = PyUnicode_DATA(self);
10067    buf2 = PyUnicode_DATA(substring);
10068    if (kind1 != kind)
10069        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10070    if (!buf1) {
10071        Py_DECREF(substring);
10072        return NULL;
10073    }
10074    if (kind2 != kind)
10075        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10076    if (!buf2) {
10077        Py_DECREF(substring);
10078        if (kind1 != kind) PyMem_Free(buf1);
10079        return NULL;
10080    }
10081    len1 = PyUnicode_GET_LENGTH(self);
10082    len2 = PyUnicode_GET_LENGTH(substring);
10083
10084    ADJUST_INDICES(start, end, len1);
10085    switch(kind) {
10086    case PyUnicode_1BYTE_KIND:
10087        iresult = ucs1lib_count(
10088            ((Py_UCS1*)buf1) + start, end - start,
10089            buf2, len2, PY_SSIZE_T_MAX
10090            );
10091        break;
10092    case PyUnicode_2BYTE_KIND:
10093        iresult = ucs2lib_count(
10094            ((Py_UCS2*)buf1) + start, end - start,
10095            buf2, len2, PY_SSIZE_T_MAX
10096            );
10097        break;
10098    case PyUnicode_4BYTE_KIND:
10099        iresult = ucs4lib_count(
10100            ((Py_UCS4*)buf1) + start, end - start,
10101            buf2, len2, PY_SSIZE_T_MAX
10102            );
10103        break;
10104    default:
10105        assert(0); iresult = 0;
10106    }
10107
10108    result = PyLong_FromSsize_t(iresult);
10109
10110    if (kind1 != kind)
10111        PyMem_Free(buf1);
10112    if (kind2 != kind)
10113        PyMem_Free(buf2);
10114
10115    Py_DECREF(substring);
10116
10117    return result;
10118}
10119
10120PyDoc_STRVAR(encode__doc__,
10121             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10122\n\
10123Encode S using the codec registered for encoding. Default encoding\n\
10124is 'utf-8'. errors may be given to set a different error\n\
10125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10127'xmlcharrefreplace' as well as any other name registered with\n\
10128codecs.register_error that can handle UnicodeEncodeErrors.");
10129
10130static PyObject *
10131unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10132{
10133    static char *kwlist[] = {"encoding", "errors", 0};
10134    char *encoding = NULL;
10135    char *errors = NULL;
10136
10137    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10138                                     kwlist, &encoding, &errors))
10139        return NULL;
10140    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10141}
10142
10143PyDoc_STRVAR(expandtabs__doc__,
10144             "S.expandtabs([tabsize]) -> str\n\
10145\n\
10146Return a copy of S where all tab characters are expanded using spaces.\n\
10147If tabsize is not given, a tab size of 8 characters is assumed.");
10148
10149static PyObject*
10150unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10151{
10152    Py_UNICODE *e;
10153    Py_UNICODE *p;
10154    Py_UNICODE *q;
10155    Py_UNICODE *qe;
10156    Py_ssize_t i, j, incr, wstr_length;
10157    PyUnicodeObject *u;
10158    int tabsize = 8;
10159
10160    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10161        return NULL;
10162
10163    if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10164        return NULL;
10165
10166    /* First pass: determine size of output string */
10167    i = 0; /* chars up to and including most recent \n or \r */
10168    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
10169    e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10170    for (p = _PyUnicode_WSTR(self); p < e; p++)
10171        if (*p == '\t') {
10172            if (tabsize > 0) {
10173                incr = tabsize - (j % tabsize); /* cannot overflow */
10174                if (j > PY_SSIZE_T_MAX - incr)
10175                    goto overflow1;
10176                j += incr;
10177            }
10178        }
10179        else {
10180            if (j > PY_SSIZE_T_MAX - 1)
10181                goto overflow1;
10182            j++;
10183            if (*p == '\n' || *p == '\r') {
10184                if (i > PY_SSIZE_T_MAX - j)
10185                    goto overflow1;
10186                i += j;
10187                j = 0;
10188            }
10189        }
10190
10191    if (i > PY_SSIZE_T_MAX - j)
10192        goto overflow1;
10193
10194    /* Second pass: create output string and fill it */
10195    u = _PyUnicode_New(i + j);
10196    if (!u)
10197        return NULL;
10198
10199    j = 0; /* same as in first pass */
10200    q = _PyUnicode_WSTR(u); /* next output char */
10201    qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
10202
10203    for (p = _PyUnicode_WSTR(self); p < e; p++)
10204        if (*p == '\t') {
10205            if (tabsize > 0) {
10206                i = tabsize - (j % tabsize);
10207                j += i;
10208                while (i--) {
10209                    if (q >= qe)
10210                        goto overflow2;
10211                    *q++ = ' ';
10212                }
10213            }
10214        }
10215        else {
10216            if (q >= qe)
10217                goto overflow2;
10218            *q++ = *p;
10219            j++;
10220            if (*p == '\n' || *p == '\r')
10221                j = 0;
10222        }
10223
10224    if (_PyUnicode_READY_REPLACE(&u)) {
10225        Py_DECREF(u);
10226        return NULL;
10227    }
10228    return (PyObject*) u;
10229
10230  overflow2:
10231    Py_DECREF(u);
10232  overflow1:
10233    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10234    return NULL;
10235}
10236
10237PyDoc_STRVAR(find__doc__,
10238             "S.find(sub[, start[, end]]) -> int\n\
10239\n\
10240Return the lowest index in S where substring sub is found,\n\
10241such that sub is contained within S[start:end].  Optional\n\
10242arguments start and end are interpreted as in slice notation.\n\
10243\n\
10244Return -1 on failure.");
10245
10246static PyObject *
10247unicode_find(PyObject *self, PyObject *args)
10248{
10249    PyUnicodeObject *substring;
10250    Py_ssize_t start;
10251    Py_ssize_t end;
10252    Py_ssize_t result;
10253
10254    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10255                                            &start, &end))
10256        return NULL;
10257
10258    if (PyUnicode_READY(self) == -1)
10259        return NULL;
10260    if (PyUnicode_READY(substring) == -1)
10261        return NULL;
10262
10263    result = any_find_slice(
10264        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10265        self, (PyObject*)substring, start, end
10266        );
10267
10268    Py_DECREF(substring);
10269
10270    if (result == -2)
10271        return NULL;
10272
10273    return PyLong_FromSsize_t(result);
10274}
10275
10276static PyObject *
10277unicode_getitem(PyObject *self, Py_ssize_t index)
10278{
10279    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10280    if (ch == (Py_UCS4)-1)
10281        return NULL;
10282    return PyUnicode_FromOrdinal(ch);
10283}
10284
10285/* Believe it or not, this produces the same value for ASCII strings
10286   as bytes_hash(). */
10287static Py_hash_t
10288unicode_hash(PyUnicodeObject *self)
10289{
10290    Py_ssize_t len;
10291    Py_uhash_t x;
10292
10293    if (_PyUnicode_HASH(self) != -1)
10294        return _PyUnicode_HASH(self);
10295    if (PyUnicode_READY(self) == -1)
10296        return -1;
10297    len = PyUnicode_GET_LENGTH(self);
10298
10299    /* The hash function as a macro, gets expanded three times below. */
10300#define HASH(P) \
10301    x = (Py_uhash_t)*P << 7; \
10302    while (--len >= 0) \
10303        x = (1000003*x) ^ (Py_uhash_t)*P++;
10304
10305    switch (PyUnicode_KIND(self)) {
10306    case PyUnicode_1BYTE_KIND: {
10307        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10308        HASH(c);
10309        break;
10310    }
10311    case PyUnicode_2BYTE_KIND: {
10312        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10313        HASH(s);
10314        break;
10315    }
10316    default: {
10317        Py_UCS4 *l;
10318        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10319               "Impossible switch case in unicode_hash");
10320        l = PyUnicode_4BYTE_DATA(self);
10321        HASH(l);
10322        break;
10323    }
10324    }
10325    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10326
10327    if (x == -1)
10328        x = -2;
10329    _PyUnicode_HASH(self) = x;
10330    return x;
10331}
10332#undef HASH
10333
10334PyDoc_STRVAR(index__doc__,
10335             "S.index(sub[, start[, end]]) -> int\n\
10336\n\
10337Like S.find() but raise ValueError when the substring is not found.");
10338
10339static PyObject *
10340unicode_index(PyObject *self, PyObject *args)
10341{
10342    Py_ssize_t result;
10343    PyUnicodeObject *substring;
10344    Py_ssize_t start;
10345    Py_ssize_t end;
10346
10347    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10348                                            &start, &end))
10349        return NULL;
10350
10351    if (PyUnicode_READY(self) == -1)
10352        return NULL;
10353    if (PyUnicode_READY(substring) == -1)
10354        return NULL;
10355
10356    result = any_find_slice(
10357        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10358        self, (PyObject*)substring, start, end
10359        );
10360
10361    Py_DECREF(substring);
10362
10363    if (result == -2)
10364        return NULL;
10365
10366    if (result < 0) {
10367        PyErr_SetString(PyExc_ValueError, "substring not found");
10368        return NULL;
10369    }
10370
10371    return PyLong_FromSsize_t(result);
10372}
10373
10374PyDoc_STRVAR(islower__doc__,
10375             "S.islower() -> bool\n\
10376\n\
10377Return True if all cased characters in S are lowercase and there is\n\
10378at least one cased character in S, False otherwise.");
10379
10380static PyObject*
10381unicode_islower(PyUnicodeObject *self)
10382{
10383    Py_ssize_t i, length;
10384    int kind;
10385    void *data;
10386    int cased;
10387
10388    if (PyUnicode_READY(self) == -1)
10389        return NULL;
10390    length = PyUnicode_GET_LENGTH(self);
10391    kind = PyUnicode_KIND(self);
10392    data = PyUnicode_DATA(self);
10393
10394    /* Shortcut for single character strings */
10395    if (length == 1)
10396        return PyBool_FromLong(
10397            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10398
10399    /* Special case for empty strings */
10400    if (length == 0)
10401        return PyBool_FromLong(0);
10402
10403    cased = 0;
10404    for (i = 0; i < length; i++) {
10405        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10406
10407        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10408            return PyBool_FromLong(0);
10409        else if (!cased && Py_UNICODE_ISLOWER(ch))
10410            cased = 1;
10411    }
10412    return PyBool_FromLong(cased);
10413}
10414
10415PyDoc_STRVAR(isupper__doc__,
10416             "S.isupper() -> bool\n\
10417\n\
10418Return True if all cased characters in S are uppercase and there is\n\
10419at least one cased character in S, False otherwise.");
10420
10421static PyObject*
10422unicode_isupper(PyUnicodeObject *self)
10423{
10424    Py_ssize_t i, length;
10425    int kind;
10426    void *data;
10427    int cased;
10428
10429    if (PyUnicode_READY(self) == -1)
10430        return NULL;
10431    length = PyUnicode_GET_LENGTH(self);
10432    kind = PyUnicode_KIND(self);
10433    data = PyUnicode_DATA(self);
10434
10435    /* Shortcut for single character strings */
10436    if (length == 1)
10437        return PyBool_FromLong(
10438            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10439
10440    /* Special case for empty strings */
10441    if (length == 0)
10442        return PyBool_FromLong(0);
10443
10444    cased = 0;
10445    for (i = 0; i < length; i++) {
10446        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10447
10448        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10449            return PyBool_FromLong(0);
10450        else if (!cased && Py_UNICODE_ISUPPER(ch))
10451            cased = 1;
10452    }
10453    return PyBool_FromLong(cased);
10454}
10455
10456PyDoc_STRVAR(istitle__doc__,
10457             "S.istitle() -> bool\n\
10458\n\
10459Return True if S is a titlecased string and there is at least one\n\
10460character in S, i.e. upper- and titlecase characters may only\n\
10461follow uncased characters and lowercase characters only cased ones.\n\
10462Return False otherwise.");
10463
10464static PyObject*
10465unicode_istitle(PyUnicodeObject *self)
10466{
10467    Py_ssize_t i, length;
10468    int kind;
10469    void *data;
10470    int cased, previous_is_cased;
10471
10472    if (PyUnicode_READY(self) == -1)
10473        return NULL;
10474    length = PyUnicode_GET_LENGTH(self);
10475    kind = PyUnicode_KIND(self);
10476    data = PyUnicode_DATA(self);
10477
10478    /* Shortcut for single character strings */
10479    if (length == 1) {
10480        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10481        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10482                               (Py_UNICODE_ISUPPER(ch) != 0));
10483    }
10484
10485    /* Special case for empty strings */
10486    if (length == 0)
10487        return PyBool_FromLong(0);
10488
10489    cased = 0;
10490    previous_is_cased = 0;
10491    for (i = 0; i < length; i++) {
10492        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10493
10494        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10495            if (previous_is_cased)
10496                return PyBool_FromLong(0);
10497            previous_is_cased = 1;
10498            cased = 1;
10499        }
10500        else if (Py_UNICODE_ISLOWER(ch)) {
10501            if (!previous_is_cased)
10502                return PyBool_FromLong(0);
10503            previous_is_cased = 1;
10504            cased = 1;
10505        }
10506        else
10507            previous_is_cased = 0;
10508    }
10509    return PyBool_FromLong(cased);
10510}
10511
10512PyDoc_STRVAR(isspace__doc__,
10513             "S.isspace() -> bool\n\
10514\n\
10515Return True if all characters in S are whitespace\n\
10516and there is at least one character in S, False otherwise.");
10517
10518static PyObject*
10519unicode_isspace(PyUnicodeObject *self)
10520{
10521    Py_ssize_t i, length;
10522    int kind;
10523    void *data;
10524
10525    if (PyUnicode_READY(self) == -1)
10526        return NULL;
10527    length = PyUnicode_GET_LENGTH(self);
10528    kind = PyUnicode_KIND(self);
10529    data = PyUnicode_DATA(self);
10530
10531    /* Shortcut for single character strings */
10532    if (length == 1)
10533        return PyBool_FromLong(
10534            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10535
10536    /* Special case for empty strings */
10537    if (length == 0)
10538        return PyBool_FromLong(0);
10539
10540    for (i = 0; i < length; i++) {
10541        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10542        if (!Py_UNICODE_ISSPACE(ch))
10543            return PyBool_FromLong(0);
10544    }
10545    return PyBool_FromLong(1);
10546}
10547
10548PyDoc_STRVAR(isalpha__doc__,
10549             "S.isalpha() -> bool\n\
10550\n\
10551Return True if all characters in S are alphabetic\n\
10552and there is at least one character in S, False otherwise.");
10553
10554static PyObject*
10555unicode_isalpha(PyUnicodeObject *self)
10556{
10557    Py_ssize_t i, length;
10558    int kind;
10559    void *data;
10560
10561    if (PyUnicode_READY(self) == -1)
10562        return NULL;
10563    length = PyUnicode_GET_LENGTH(self);
10564    kind = PyUnicode_KIND(self);
10565    data = PyUnicode_DATA(self);
10566
10567    /* Shortcut for single character strings */
10568    if (length == 1)
10569        return PyBool_FromLong(
10570            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10571
10572    /* Special case for empty strings */
10573    if (length == 0)
10574        return PyBool_FromLong(0);
10575
10576    for (i = 0; i < length; i++) {
10577        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10578            return PyBool_FromLong(0);
10579    }
10580    return PyBool_FromLong(1);
10581}
10582
10583PyDoc_STRVAR(isalnum__doc__,
10584             "S.isalnum() -> bool\n\
10585\n\
10586Return True if all characters in S are alphanumeric\n\
10587and there is at least one character in S, False otherwise.");
10588
10589static PyObject*
10590unicode_isalnum(PyUnicodeObject *self)
10591{
10592    int kind;
10593    void *data;
10594    Py_ssize_t len, i;
10595
10596    if (PyUnicode_READY(self) == -1)
10597        return NULL;
10598
10599    kind = PyUnicode_KIND(self);
10600    data = PyUnicode_DATA(self);
10601    len = PyUnicode_GET_LENGTH(self);
10602
10603    /* Shortcut for single character strings */
10604    if (len == 1) {
10605        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10606        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10607    }
10608
10609    /* Special case for empty strings */
10610    if (len == 0)
10611        return PyBool_FromLong(0);
10612
10613    for (i = 0; i < len; i++) {
10614        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10615        if (!Py_UNICODE_ISALNUM(ch))
10616            return PyBool_FromLong(0);
10617    }
10618    return PyBool_FromLong(1);
10619}
10620
10621PyDoc_STRVAR(isdecimal__doc__,
10622             "S.isdecimal() -> bool\n\
10623\n\
10624Return True if there are only decimal characters in S,\n\
10625False otherwise.");
10626
10627static PyObject*
10628unicode_isdecimal(PyUnicodeObject *self)
10629{
10630    Py_ssize_t i, length;
10631    int kind;
10632    void *data;
10633
10634    if (PyUnicode_READY(self) == -1)
10635        return NULL;
10636    length = PyUnicode_GET_LENGTH(self);
10637    kind = PyUnicode_KIND(self);
10638    data = PyUnicode_DATA(self);
10639
10640    /* Shortcut for single character strings */
10641    if (length == 1)
10642        return PyBool_FromLong(
10643            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10644
10645    /* Special case for empty strings */
10646    if (length == 0)
10647        return PyBool_FromLong(0);
10648
10649    for (i = 0; i < length; i++) {
10650        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10651            return PyBool_FromLong(0);
10652    }
10653    return PyBool_FromLong(1);
10654}
10655
10656PyDoc_STRVAR(isdigit__doc__,
10657             "S.isdigit() -> bool\n\
10658\n\
10659Return True if all characters in S are digits\n\
10660and there is at least one character in S, False otherwise.");
10661
10662static PyObject*
10663unicode_isdigit(PyUnicodeObject *self)
10664{
10665    Py_ssize_t i, length;
10666    int kind;
10667    void *data;
10668
10669    if (PyUnicode_READY(self) == -1)
10670        return NULL;
10671    length = PyUnicode_GET_LENGTH(self);
10672    kind = PyUnicode_KIND(self);
10673    data = PyUnicode_DATA(self);
10674
10675    /* Shortcut for single character strings */
10676    if (length == 1) {
10677        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10678        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10679    }
10680
10681    /* Special case for empty strings */
10682    if (length == 0)
10683        return PyBool_FromLong(0);
10684
10685    for (i = 0; i < length; i++) {
10686        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10687            return PyBool_FromLong(0);
10688    }
10689    return PyBool_FromLong(1);
10690}
10691
10692PyDoc_STRVAR(isnumeric__doc__,
10693             "S.isnumeric() -> bool\n\
10694\n\
10695Return True if there are only numeric characters in S,\n\
10696False otherwise.");
10697
10698static PyObject*
10699unicode_isnumeric(PyUnicodeObject *self)
10700{
10701    Py_ssize_t i, length;
10702    int kind;
10703    void *data;
10704
10705    if (PyUnicode_READY(self) == -1)
10706        return NULL;
10707    length = PyUnicode_GET_LENGTH(self);
10708    kind = PyUnicode_KIND(self);
10709    data = PyUnicode_DATA(self);
10710
10711    /* Shortcut for single character strings */
10712    if (length == 1)
10713        return PyBool_FromLong(
10714            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10715
10716    /* Special case for empty strings */
10717    if (length == 0)
10718        return PyBool_FromLong(0);
10719
10720    for (i = 0; i < length; i++) {
10721        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10722            return PyBool_FromLong(0);
10723    }
10724    return PyBool_FromLong(1);
10725}
10726
10727int
10728PyUnicode_IsIdentifier(PyObject *self)
10729{
10730    int kind;
10731    void *data;
10732    Py_ssize_t i;
10733    Py_UCS4 first;
10734
10735    if (PyUnicode_READY(self) == -1) {
10736        Py_FatalError("identifier not ready");
10737        return 0;
10738    }
10739
10740    /* Special case for empty strings */
10741    if (PyUnicode_GET_LENGTH(self) == 0)
10742        return 0;
10743    kind = PyUnicode_KIND(self);
10744    data = PyUnicode_DATA(self);
10745
10746    /* PEP 3131 says that the first character must be in
10747       XID_Start and subsequent characters in XID_Continue,
10748       and for the ASCII range, the 2.x rules apply (i.e
10749       start with letters and underscore, continue with
10750       letters, digits, underscore). However, given the current
10751       definition of XID_Start and XID_Continue, it is sufficient
10752       to check just for these, except that _ must be allowed
10753       as starting an identifier.  */
10754    first = PyUnicode_READ(kind, data, 0);
10755    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
10756        return 0;
10757
10758    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
10759        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
10760            return 0;
10761    return 1;
10762}
10763
10764PyDoc_STRVAR(isidentifier__doc__,
10765             "S.isidentifier() -> bool\n\
10766\n\
10767Return True if S is a valid identifier according\n\
10768to the language definition.");
10769
10770static PyObject*
10771unicode_isidentifier(PyObject *self)
10772{
10773    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10774}
10775
10776PyDoc_STRVAR(isprintable__doc__,
10777             "S.isprintable() -> bool\n\
10778\n\
10779Return True if all characters in S are considered\n\
10780printable in repr() or S is empty, False otherwise.");
10781
10782static PyObject*
10783unicode_isprintable(PyObject *self)
10784{
10785    Py_ssize_t i, length;
10786    int kind;
10787    void *data;
10788
10789    if (PyUnicode_READY(self) == -1)
10790        return NULL;
10791    length = PyUnicode_GET_LENGTH(self);
10792    kind = PyUnicode_KIND(self);
10793    data = PyUnicode_DATA(self);
10794
10795    /* Shortcut for single character strings */
10796    if (length == 1)
10797        return PyBool_FromLong(
10798            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
10799
10800    for (i = 0; i < length; i++) {
10801        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
10802            Py_RETURN_FALSE;
10803        }
10804    }
10805    Py_RETURN_TRUE;
10806}
10807
10808PyDoc_STRVAR(join__doc__,
10809             "S.join(iterable) -> str\n\
10810\n\
10811Return a string which is the concatenation of the strings in the\n\
10812iterable.  The separator between elements is S.");
10813
10814static PyObject*
10815unicode_join(PyObject *self, PyObject *data)
10816{
10817    return PyUnicode_Join(self, data);
10818}
10819
10820static Py_ssize_t
10821unicode_length(PyUnicodeObject *self)
10822{
10823    if (PyUnicode_READY(self) == -1)
10824        return -1;
10825    return PyUnicode_GET_LENGTH(self);
10826}
10827
10828PyDoc_STRVAR(ljust__doc__,
10829             "S.ljust(width[, fillchar]) -> str\n\
10830\n\
10831Return S left-justified in a Unicode string of length width. Padding is\n\
10832done using the specified fill character (default is a space).");
10833
10834static PyObject *
10835unicode_ljust(PyUnicodeObject *self, PyObject *args)
10836{
10837    Py_ssize_t width;
10838    Py_UCS4 fillchar = ' ';
10839
10840    if (PyUnicode_READY(self) == -1)
10841        return NULL;
10842
10843    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
10844        return NULL;
10845
10846    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10847        Py_INCREF(self);
10848        return (PyObject*) self;
10849    }
10850
10851    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
10852}
10853
10854PyDoc_STRVAR(lower__doc__,
10855             "S.lower() -> str\n\
10856\n\
10857Return a copy of the string S converted to lowercase.");
10858
10859static PyObject*
10860unicode_lower(PyUnicodeObject *self)
10861{
10862    return fixup(self, fixlower);
10863}
10864
/* Selectors for the strip family of functions.  Each value doubles as an
   index into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selectors above */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip",
};

/* Method name for selector i: its format string past the "|O:" prefix */
#define STRIPNAME(i) (&stripformat[i][3])
10873
10874/* externally visible for str.strip(unicode) */
10875PyObject *
10876_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10877{
10878    void *data;
10879    int kind;
10880    Py_ssize_t i, j, len;
10881    BLOOM_MASK sepmask;
10882
10883    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10884        return NULL;
10885
10886    kind = PyUnicode_KIND(self);
10887    data = PyUnicode_DATA(self);
10888    len = PyUnicode_GET_LENGTH(self);
10889    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10890                              PyUnicode_DATA(sepobj),
10891                              PyUnicode_GET_LENGTH(sepobj));
10892
10893    i = 0;
10894    if (striptype != RIGHTSTRIP) {
10895        while (i < len &&
10896               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
10897            i++;
10898        }
10899    }
10900
10901    j = len;
10902    if (striptype != LEFTSTRIP) {
10903        do {
10904            j--;
10905        } while (j >= i &&
10906                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
10907        j++;
10908    }
10909
10910    return PyUnicode_Substring((PyObject*)self, i, j);
10911}
10912
10913PyObject*
10914PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10915{
10916    unsigned char *data;
10917    int kind;
10918    Py_ssize_t length;
10919
10920    if (PyUnicode_READY(self) == -1)
10921        return NULL;
10922
10923    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10924
10925    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
10926    {
10927        if (PyUnicode_CheckExact(self)) {
10928            Py_INCREF(self);
10929            return self;
10930        }
10931        else
10932            return PyUnicode_Copy(self);
10933    }
10934
10935    length = end - start;
10936    if (length == 1)
10937        return unicode_getitem(self, start);
10938
10939    if (start < 0 || end < 0) {
10940        PyErr_SetString(PyExc_IndexError, "string index out of range");
10941        return NULL;
10942    }
10943
10944    kind = PyUnicode_KIND(self);
10945    data = PyUnicode_1BYTE_DATA(self);
10946    return PyUnicode_FromKindAndData(kind,
10947                                     data + PyUnicode_KIND_SIZE(kind, start),
10948                                     length);
10949}
10950
10951static PyObject *
10952do_strip(PyUnicodeObject *self, int striptype)
10953{
10954    int kind;
10955    void *data;
10956    Py_ssize_t len, i, j;
10957
10958    if (PyUnicode_READY(self) == -1)
10959        return NULL;
10960
10961    kind = PyUnicode_KIND(self);
10962    data = PyUnicode_DATA(self);
10963    len = PyUnicode_GET_LENGTH(self);
10964
10965    i = 0;
10966    if (striptype != RIGHTSTRIP) {
10967        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
10968            i++;
10969        }
10970    }
10971
10972    j = len;
10973    if (striptype != LEFTSTRIP) {
10974        do {
10975            j--;
10976        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
10977        j++;
10978    }
10979
10980    return PyUnicode_Substring((PyObject*)self, i, j);
10981}
10982
10983
10984static PyObject *
10985do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10986{
10987    PyObject *sep = NULL;
10988
10989    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10990        return NULL;
10991
10992    if (sep != NULL && sep != Py_None) {
10993        if (PyUnicode_Check(sep))
10994            return _PyUnicode_XStrip(self, striptype, sep);
10995        else {
10996            PyErr_Format(PyExc_TypeError,
10997                         "%s arg must be None or str",
10998                         STRIPNAME(striptype));
10999            return NULL;
11000        }
11001    }
11002
11003    return do_strip(self, striptype);
11004}
11005
11006
11007PyDoc_STRVAR(strip__doc__,
11008             "S.strip([chars]) -> str\n\
11009\n\
11010Return a copy of the string S with leading and trailing\n\
11011whitespace removed.\n\
11012If chars is given and not None, remove characters in chars instead.");
11013
11014static PyObject *
11015unicode_strip(PyUnicodeObject *self, PyObject *args)
11016{
11017    if (PyTuple_GET_SIZE(args) == 0)
11018        return do_strip(self, BOTHSTRIP); /* Common case */
11019    else
11020        return do_argstrip(self, BOTHSTRIP, args);
11021}
11022
11023
11024PyDoc_STRVAR(lstrip__doc__,
11025             "S.lstrip([chars]) -> str\n\
11026\n\
11027Return a copy of the string S with leading whitespace removed.\n\
11028If chars is given and not None, remove characters in chars instead.");
11029
11030static PyObject *
11031unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11032{
11033    if (PyTuple_GET_SIZE(args) == 0)
11034        return do_strip(self, LEFTSTRIP); /* Common case */
11035    else
11036        return do_argstrip(self, LEFTSTRIP, args);
11037}
11038
11039
11040PyDoc_STRVAR(rstrip__doc__,
11041             "S.rstrip([chars]) -> str\n\
11042\n\
11043Return a copy of the string S with trailing whitespace removed.\n\
11044If chars is given and not None, remove characters in chars instead.");
11045
11046static PyObject *
11047unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11048{
11049    if (PyTuple_GET_SIZE(args) == 0)
11050        return do_strip(self, RIGHTSTRIP); /* Common case */
11051    else
11052        return do_argstrip(self, RIGHTSTRIP, args);
11053}
11054
11055
11056static PyObject*
11057unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11058{
11059    PyUnicodeObject *u;
11060    Py_ssize_t nchars, n;
11061
11062    if (len < 1) {
11063        Py_INCREF(unicode_empty);
11064        return unicode_empty;
11065    }
11066
11067    if (len == 1 && PyUnicode_CheckExact(str)) {
11068        /* no repeat, return original string */
11069        Py_INCREF(str);
11070        return (PyObject*) str;
11071    }
11072
11073    if (PyUnicode_READY(str) == -1)
11074        return NULL;
11075
11076    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11077        PyErr_SetString(PyExc_OverflowError,
11078                        "repeated string is too long");
11079        return NULL;
11080    }
11081    nchars = len * PyUnicode_GET_LENGTH(str);
11082
11083    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11084    if (!u)
11085        return NULL;
11086    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11087
11088    if (PyUnicode_GET_LENGTH(str) == 1) {
11089        const int kind = PyUnicode_KIND(str);
11090        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11091        void *to = PyUnicode_DATA(u);
11092        if (kind == PyUnicode_1BYTE_KIND)
11093            memset(to, (unsigned char)fill_char, len);
11094        else {
11095            for (n = 0; n < len; ++n)
11096                PyUnicode_WRITE(kind, to, n, fill_char);
11097        }
11098    }
11099    else {
11100        /* number of characters copied this far */
11101        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11102        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11103        char *to = (char *) PyUnicode_DATA(u);
11104        Py_MEMCPY(to, PyUnicode_DATA(str),
11105                  PyUnicode_GET_LENGTH(str) * char_size);
11106        while (done < nchars) {
11107            n = (done <= nchars-done) ? done : nchars-done;
11108            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11109            done += n;
11110        }
11111    }
11112
11113    return (PyObject*) u;
11114}
11115
11116PyObject *
11117PyUnicode_Replace(PyObject *obj,
11118                  PyObject *subobj,
11119                  PyObject *replobj,
11120                  Py_ssize_t maxcount)
11121{
11122    PyObject *self;
11123    PyObject *str1;
11124    PyObject *str2;
11125    PyObject *result;
11126
11127    self = PyUnicode_FromObject(obj);
11128    if (self == NULL || PyUnicode_READY(self) == -1)
11129        return NULL;
11130    str1 = PyUnicode_FromObject(subobj);
11131    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11132        Py_DECREF(self);
11133        return NULL;
11134    }
11135    str2 = PyUnicode_FromObject(replobj);
11136    if (str2 == NULL || PyUnicode_READY(str2)) {
11137        Py_DECREF(self);
11138        Py_DECREF(str1);
11139        return NULL;
11140    }
11141    result = replace(self, str1, str2, maxcount);
11142    Py_DECREF(self);
11143    Py_DECREF(str1);
11144    Py_DECREF(str2);
11145    return result;
11146}
11147
11148PyDoc_STRVAR(replace__doc__,
11149             "S.replace(old, new[, count]) -> str\n\
11150\n\
11151Return a copy of S with all occurrences of substring\n\
11152old replaced by new.  If the optional argument count is\n\
11153given, only the first count occurrences are replaced.");
11154
11155static PyObject*
11156unicode_replace(PyObject *self, PyObject *args)
11157{
11158    PyObject *str1;
11159    PyObject *str2;
11160    Py_ssize_t maxcount = -1;
11161    PyObject *result;
11162
11163    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11164        return NULL;
11165    if (!PyUnicode_READY(self) == -1)
11166        return NULL;
11167    str1 = PyUnicode_FromObject(str1);
11168    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11169        return NULL;
11170    str2 = PyUnicode_FromObject(str2);
11171    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11172        Py_DECREF(str1);
11173        return NULL;
11174    }
11175
11176    result = replace(self, str1, str2, maxcount);
11177
11178    Py_DECREF(str1);
11179    Py_DECREF(str2);
11180    return result;
11181}
11182
11183static PyObject *
11184unicode_repr(PyObject *unicode)
11185{
11186    PyObject *repr;
11187    Py_ssize_t isize;
11188    Py_ssize_t osize, squote, dquote, i, o;
11189    Py_UCS4 max, quote;
11190    int ikind, okind;
11191    void *idata, *odata;
11192
11193    if (PyUnicode_READY(unicode) == -1)
11194        return NULL;
11195
11196    isize = PyUnicode_GET_LENGTH(unicode);
11197    idata = PyUnicode_DATA(unicode);
11198
11199    /* Compute length of output, quote characters, and
11200       maximum character */
11201    osize = 2; /* quotes */
11202    max = 127;
11203    squote = dquote = 0;
11204    ikind = PyUnicode_KIND(unicode);
11205    for (i = 0; i < isize; i++) {
11206        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11207        switch (ch) {
11208        case '\'': squote++; osize++; break;
11209        case '"':  dquote++; osize++; break;
11210        case '\\': case '\t': case '\r': case '\n':
11211            osize += 2; break;
11212        default:
11213            /* Fast-path ASCII */
11214            if (ch < ' ' || ch == 0x7f)
11215                osize += 4; /* \xHH */
11216            else if (ch < 0x7f)
11217                osize++;
11218            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11219                osize++;
11220                max = ch > max ? ch : max;
11221            }
11222            else if (ch < 0x100)
11223                osize += 4; /* \xHH */
11224            else if (ch < 0x10000)
11225                osize += 6; /* \uHHHH */
11226            else
11227                osize += 10; /* \uHHHHHHHH */
11228        }
11229    }
11230
11231    quote = '\'';
11232    if (squote) {
11233        if (dquote)
11234            /* Both squote and dquote present. Use squote,
11235               and escape them */
11236            osize += squote;
11237        else
11238            quote = '"';
11239    }
11240
11241    repr = PyUnicode_New(osize, max);
11242    if (repr == NULL)
11243        return NULL;
11244    okind = PyUnicode_KIND(repr);
11245    odata = PyUnicode_DATA(repr);
11246
11247    PyUnicode_WRITE(okind, odata, 0, quote);
11248    PyUnicode_WRITE(okind, odata, osize-1, quote);
11249
11250    for (i = 0, o = 1; i < isize; i++) {
11251        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11252
11253        /* Escape quotes and backslashes */
11254        if ((ch == quote) || (ch == '\\')) {
11255            PyUnicode_WRITE(okind, odata, o++, '\\');
11256            PyUnicode_WRITE(okind, odata, o++, ch);
11257            continue;
11258        }
11259
11260        /* Map special whitespace to '\t', \n', '\r' */
11261        if (ch == '\t') {
11262            PyUnicode_WRITE(okind, odata, o++, '\\');
11263            PyUnicode_WRITE(okind, odata, o++, 't');
11264        }
11265        else if (ch == '\n') {
11266            PyUnicode_WRITE(okind, odata, o++, '\\');
11267            PyUnicode_WRITE(okind, odata, o++, 'n');
11268        }
11269        else if (ch == '\r') {
11270            PyUnicode_WRITE(okind, odata, o++, '\\');
11271            PyUnicode_WRITE(okind, odata, o++, 'r');
11272        }
11273
11274        /* Map non-printable US ASCII to '\xhh' */
11275        else if (ch < ' ' || ch == 0x7F) {
11276            PyUnicode_WRITE(okind, odata, o++, '\\');
11277            PyUnicode_WRITE(okind, odata, o++, 'x');
11278            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11279            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11280        }
11281
11282        /* Copy ASCII characters as-is */
11283        else if (ch < 0x7F) {
11284            PyUnicode_WRITE(okind, odata, o++, ch);
11285        }
11286
11287        /* Non-ASCII characters */
11288        else {
11289            /* Map Unicode whitespace and control characters
11290               (categories Z* and C* except ASCII space)
11291            */
11292            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11293                /* Map 8-bit characters to '\xhh' */
11294                if (ch <= 0xff) {
11295                    PyUnicode_WRITE(okind, odata, o++, '\\');
11296                    PyUnicode_WRITE(okind, odata, o++, 'x');
11297                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11298                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11299                }
11300                /* Map 21-bit characters to '\U00xxxxxx' */
11301                else if (ch >= 0x10000) {
11302                    PyUnicode_WRITE(okind, odata, o++, '\\');
11303                    PyUnicode_WRITE(okind, odata, o++, 'U');
11304                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11305                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11306                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11307                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11308                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11309                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11310                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11311                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11312                }
11313                /* Map 16-bit characters to '\uxxxx' */
11314                else {
11315                    PyUnicode_WRITE(okind, odata, o++, '\\');
11316                    PyUnicode_WRITE(okind, odata, o++, 'u');
11317                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11318                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11319                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11320                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11321                }
11322            }
11323            /* Copy characters as-is */
11324            else {
11325                PyUnicode_WRITE(okind, odata, o++, ch);
11326            }
11327        }
11328    }
11329    /* Closing quote already added at the beginning */
11330    return repr;
11331}
11332
11333PyDoc_STRVAR(rfind__doc__,
11334             "S.rfind(sub[, start[, end]]) -> int\n\
11335\n\
11336Return the highest index in S where substring sub is found,\n\
11337such that sub is contained within S[start:end].  Optional\n\
11338arguments start and end are interpreted as in slice notation.\n\
11339\n\
11340Return -1 on failure.");
11341
11342static PyObject *
11343unicode_rfind(PyObject *self, PyObject *args)
11344{
11345    PyUnicodeObject *substring;
11346    Py_ssize_t start;
11347    Py_ssize_t end;
11348    Py_ssize_t result;
11349
11350    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11351                                            &start, &end))
11352        return NULL;
11353
11354    if (PyUnicode_READY(self) == -1)
11355        return NULL;
11356    if (PyUnicode_READY(substring) == -1)
11357        return NULL;
11358
11359    result = any_find_slice(
11360        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11361        self, (PyObject*)substring, start, end
11362        );
11363
11364    Py_DECREF(substring);
11365
11366    if (result == -2)
11367        return NULL;
11368
11369    return PyLong_FromSsize_t(result);
11370}
11371
11372PyDoc_STRVAR(rindex__doc__,
11373             "S.rindex(sub[, start[, end]]) -> int\n\
11374\n\
11375Like S.rfind() but raise ValueError when the substring is not found.");
11376
11377static PyObject *
11378unicode_rindex(PyObject *self, PyObject *args)
11379{
11380    PyUnicodeObject *substring;
11381    Py_ssize_t start;
11382    Py_ssize_t end;
11383    Py_ssize_t result;
11384
11385    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11386                                            &start, &end))
11387        return NULL;
11388
11389    if (PyUnicode_READY(self) == -1)
11390        return NULL;
11391    if (PyUnicode_READY(substring) == -1)
11392        return NULL;
11393
11394    result = any_find_slice(
11395        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11396        self, (PyObject*)substring, start, end
11397        );
11398
11399    Py_DECREF(substring);
11400
11401    if (result == -2)
11402        return NULL;
11403
11404    if (result < 0) {
11405        PyErr_SetString(PyExc_ValueError, "substring not found");
11406        return NULL;
11407    }
11408
11409    return PyLong_FromSsize_t(result);
11410}
11411
11412PyDoc_STRVAR(rjust__doc__,
11413             "S.rjust(width[, fillchar]) -> str\n\
11414\n\
11415Return S right-justified in a string of length width. Padding is\n\
11416done using the specified fill character (default is a space).");
11417
11418static PyObject *
11419unicode_rjust(PyUnicodeObject *self, PyObject *args)
11420{
11421    Py_ssize_t width;
11422    Py_UCS4 fillchar = ' ';
11423
11424    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11425        return NULL;
11426
11427    if (PyUnicode_READY(self) == -1)
11428        return NULL;
11429
11430    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11431        Py_INCREF(self);
11432        return (PyObject*) self;
11433    }
11434
11435    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11436}
11437
11438PyObject *
11439PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11440{
11441    PyObject *result;
11442
11443    s = PyUnicode_FromObject(s);
11444    if (s == NULL)
11445        return NULL;
11446    if (sep != NULL) {
11447        sep = PyUnicode_FromObject(sep);
11448        if (sep == NULL) {
11449            Py_DECREF(s);
11450            return NULL;
11451        }
11452    }
11453
11454    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11455
11456    Py_DECREF(s);
11457    Py_XDECREF(sep);
11458    return result;
11459}
11460
11461PyDoc_STRVAR(split__doc__,
11462             "S.split([sep[, maxsplit]]) -> list of strings\n\
11463\n\
11464Return a list of the words in S, using sep as the\n\
11465delimiter string.  If maxsplit is given, at most maxsplit\n\
11466splits are done. If sep is not specified or is None, any\n\
11467whitespace string is a separator and empty strings are\n\
11468removed from the result.");
11469
11470static PyObject*
11471unicode_split(PyUnicodeObject *self, PyObject *args)
11472{
11473    PyObject *substring = Py_None;
11474    Py_ssize_t maxcount = -1;
11475
11476    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11477        return NULL;
11478
11479    if (substring == Py_None)
11480        return split(self, NULL, maxcount);
11481    else if (PyUnicode_Check(substring))
11482        return split(self, (PyUnicodeObject *)substring, maxcount);
11483    else
11484        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11485}
11486
11487PyObject *
11488PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11489{
11490    PyObject* str_obj;
11491    PyObject* sep_obj;
11492    PyObject* out;
11493    int kind1, kind2, kind;
11494    void *buf1 = NULL, *buf2 = NULL;
11495    Py_ssize_t len1, len2;
11496
11497    str_obj = PyUnicode_FromObject(str_in);
11498    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11499        return NULL;
11500    sep_obj = PyUnicode_FromObject(sep_in);
11501    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11502        Py_DECREF(str_obj);
11503        return NULL;
11504    }
11505
11506    kind1 = PyUnicode_KIND(str_in);
11507    kind2 = PyUnicode_KIND(sep_obj);
11508    kind = kind1 > kind2 ? kind1 : kind2;
11509    buf1 = PyUnicode_DATA(str_in);
11510    if (kind1 != kind)
11511        buf1 = _PyUnicode_AsKind(str_in, kind);
11512    if (!buf1)
11513        goto onError;
11514    buf2 = PyUnicode_DATA(sep_obj);
11515    if (kind2 != kind)
11516        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11517    if (!buf2)
11518        goto onError;
11519    len1 = PyUnicode_GET_LENGTH(str_obj);
11520    len2 = PyUnicode_GET_LENGTH(sep_obj);
11521
11522    switch(PyUnicode_KIND(str_in)) {
11523    case PyUnicode_1BYTE_KIND:
11524        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11525        break;
11526    case PyUnicode_2BYTE_KIND:
11527        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11528        break;
11529    case PyUnicode_4BYTE_KIND:
11530        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11531        break;
11532    default:
11533        assert(0);
11534        out = 0;
11535    }
11536
11537    Py_DECREF(sep_obj);
11538    Py_DECREF(str_obj);
11539    if (kind1 != kind)
11540        PyMem_Free(buf1);
11541    if (kind2 != kind)
11542        PyMem_Free(buf2);
11543
11544    return out;
11545  onError:
11546    Py_DECREF(sep_obj);
11547    Py_DECREF(str_obj);
11548    if (kind1 != kind && buf1)
11549        PyMem_Free(buf1);
11550    if (kind2 != kind && buf2)
11551        PyMem_Free(buf2);
11552    return NULL;
11553}
11554
11555
11556PyObject *
11557PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11558{
11559    PyObject* str_obj;
11560    PyObject* sep_obj;
11561    PyObject* out;
11562    int kind1, kind2, kind;
11563    void *buf1 = NULL, *buf2 = NULL;
11564    Py_ssize_t len1, len2;
11565
11566    str_obj = PyUnicode_FromObject(str_in);
11567    if (!str_obj)
11568        return NULL;
11569    sep_obj = PyUnicode_FromObject(sep_in);
11570    if (!sep_obj) {
11571        Py_DECREF(str_obj);
11572        return NULL;
11573    }
11574
11575    kind1 = PyUnicode_KIND(str_in);
11576    kind2 = PyUnicode_KIND(sep_obj);
11577    kind = Py_MAX(kind1, kind2);
11578    buf1 = PyUnicode_DATA(str_in);
11579    if (kind1 != kind)
11580        buf1 = _PyUnicode_AsKind(str_in, kind);
11581    if (!buf1)
11582        goto onError;
11583    buf2 = PyUnicode_DATA(sep_obj);
11584    if (kind2 != kind)
11585        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11586    if (!buf2)
11587        goto onError;
11588    len1 = PyUnicode_GET_LENGTH(str_obj);
11589    len2 = PyUnicode_GET_LENGTH(sep_obj);
11590
11591    switch(PyUnicode_KIND(str_in)) {
11592    case PyUnicode_1BYTE_KIND:
11593        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11594        break;
11595    case PyUnicode_2BYTE_KIND:
11596        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11597        break;
11598    case PyUnicode_4BYTE_KIND:
11599        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11600        break;
11601    default:
11602        assert(0);
11603        out = 0;
11604    }
11605
11606    Py_DECREF(sep_obj);
11607    Py_DECREF(str_obj);
11608    if (kind1 != kind)
11609        PyMem_Free(buf1);
11610    if (kind2 != kind)
11611        PyMem_Free(buf2);
11612
11613    return out;
11614  onError:
11615    Py_DECREF(sep_obj);
11616    Py_DECREF(str_obj);
11617    if (kind1 != kind && buf1)
11618        PyMem_Free(buf1);
11619    if (kind2 != kind && buf2)
11620        PyMem_Free(buf2);
11621    return NULL;
11622}
11623
11624PyDoc_STRVAR(partition__doc__,
11625             "S.partition(sep) -> (head, sep, tail)\n\
11626\n\
11627Search for the separator sep in S, and return the part before it,\n\
11628the separator itself, and the part after it.  If the separator is not\n\
11629found, return S and two empty strings.");
11630
11631static PyObject*
11632unicode_partition(PyUnicodeObject *self, PyObject *separator)
11633{
11634    return PyUnicode_Partition((PyObject *)self, separator);
11635}
11636
11637PyDoc_STRVAR(rpartition__doc__,
11638             "S.rpartition(sep) -> (head, sep, tail)\n\
11639\n\
11640Search for the separator sep in S, starting at the end of S, and return\n\
11641the part before it, the separator itself, and the part after it.  If the\n\
11642separator is not found, return two empty strings and S.");
11643
11644static PyObject*
11645unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11646{
11647    return PyUnicode_RPartition((PyObject *)self, separator);
11648}
11649
11650PyObject *
11651PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11652{
11653    PyObject *result;
11654
11655    s = PyUnicode_FromObject(s);
11656    if (s == NULL)
11657        return NULL;
11658    if (sep != NULL) {
11659        sep = PyUnicode_FromObject(sep);
11660        if (sep == NULL) {
11661            Py_DECREF(s);
11662            return NULL;
11663        }
11664    }
11665
11666    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11667
11668    Py_DECREF(s);
11669    Py_XDECREF(sep);
11670    return result;
11671}
11672
11673PyDoc_STRVAR(rsplit__doc__,
11674             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11675\n\
11676Return a list of the words in S, using sep as the\n\
11677delimiter string, starting at the end of the string and\n\
11678working to the front.  If maxsplit is given, at most maxsplit\n\
11679splits are done. If sep is not specified, any whitespace string\n\
11680is a separator.");
11681
11682static PyObject*
11683unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11684{
11685    PyObject *substring = Py_None;
11686    Py_ssize_t maxcount = -1;
11687
11688    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11689        return NULL;
11690
11691    if (substring == Py_None)
11692        return rsplit(self, NULL, maxcount);
11693    else if (PyUnicode_Check(substring))
11694        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
11695    else
11696        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
11697}
11698
11699PyDoc_STRVAR(splitlines__doc__,
11700             "S.splitlines([keepends]) -> list of strings\n\
11701\n\
11702Return a list of the lines in S, breaking at line boundaries.\n\
11703Line breaks are not included in the resulting list unless keepends\n\
11704is given and true.");
11705
11706static PyObject*
11707unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11708{
11709    static char *kwlist[] = {"keepends", 0};
11710    int keepends = 0;
11711
11712    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11713                                     kwlist, &keepends))
11714        return NULL;
11715
11716    return PyUnicode_Splitlines((PyObject *)self, keepends);
11717}
11718
11719static
11720PyObject *unicode_str(PyObject *self)
11721{
11722    if (PyUnicode_CheckExact(self)) {
11723        Py_INCREF(self);
11724        return self;
11725    } else
11726        /* Subtype -- return genuine unicode string with the same value. */
11727        return PyUnicode_Copy(self);
11728}
11729
11730PyDoc_STRVAR(swapcase__doc__,
11731             "S.swapcase() -> str\n\
11732\n\
11733Return a copy of S with uppercase characters converted to lowercase\n\
11734and vice versa.");
11735
11736static PyObject*
11737unicode_swapcase(PyUnicodeObject *self)
11738{
11739    return fixup(self, fixswapcase);
11740}
11741
11742PyDoc_STRVAR(maketrans__doc__,
11743             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
11744\n\
11745Return a translation table usable for str.translate().\n\
11746If there is only one argument, it must be a dictionary mapping Unicode\n\
11747ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
11748Character keys will be then converted to ordinals.\n\
11749If there are two arguments, they must be strings of equal length, and\n\
11750in the resulting dictionary, each character in x will be mapped to the\n\
11751character at the same position in y. If there is a third argument, it\n\
11752must be a string, whose characters will be mapped to None in the result.");
11753
11754static PyObject*
11755unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11756{
11757    PyObject *x, *y = NULL, *z = NULL;
11758    PyObject *new = NULL, *key, *value;
11759    Py_ssize_t i = 0;
11760    int res;
11761
11762    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11763        return NULL;
11764    new = PyDict_New();
11765    if (!new)
11766        return NULL;
11767    if (y != NULL) {
11768        int x_kind, y_kind, z_kind;
11769        void *x_data, *y_data, *z_data;
11770
11771        /* x must be a string too, of equal length */
11772        if (!PyUnicode_Check(x)) {
11773            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11774                            "be a string if there is a second argument");
11775            goto err;
11776        }
11777        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
11778            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11779                            "arguments must have equal length");
11780            goto err;
11781        }
11782        /* create entries for translating chars in x to those in y */
11783        x_kind = PyUnicode_KIND(x);
11784        y_kind = PyUnicode_KIND(y);
11785        x_data = PyUnicode_DATA(x);
11786        y_data = PyUnicode_DATA(y);
11787        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11788            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11789            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
11790            if (!key || !value)
11791                goto err;
11792            res = PyDict_SetItem(new, key, value);
11793            Py_DECREF(key);
11794            Py_DECREF(value);
11795            if (res < 0)
11796                goto err;
11797        }
11798        /* create entries for deleting chars in z */
11799        if (z != NULL) {
11800            z_kind = PyUnicode_KIND(z);
11801            z_data = PyUnicode_DATA(z);
11802            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
11803                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
11804                if (!key)
11805                    goto err;
11806                res = PyDict_SetItem(new, key, Py_None);
11807                Py_DECREF(key);
11808                if (res < 0)
11809                    goto err;
11810            }
11811        }
11812    } else {
11813        int kind;
11814        void *data;
11815
11816        /* x must be a dict */
11817        if (!PyDict_CheckExact(x)) {
11818            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11819                            "to maketrans it must be a dict");
11820            goto err;
11821        }
11822        /* copy entries into the new dict, converting string keys to int keys */
11823        while (PyDict_Next(x, &i, &key, &value)) {
11824            if (PyUnicode_Check(key)) {
11825                /* convert string keys to integer keys */
11826                PyObject *newkey;
11827                if (PyUnicode_GET_SIZE(key) != 1) {
11828                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
11829                                    "table must be of length 1");
11830                    goto err;
11831                }
11832                kind = PyUnicode_KIND(key);
11833                data = PyUnicode_DATA(key);
11834                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
11835                if (!newkey)
11836                    goto err;
11837                res = PyDict_SetItem(new, newkey, value);
11838                Py_DECREF(newkey);
11839                if (res < 0)
11840                    goto err;
11841            } else if (PyLong_Check(key)) {
11842                /* just keep integer keys */
11843                if (PyDict_SetItem(new, key, value) < 0)
11844                    goto err;
11845            } else {
11846                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11847                                "be strings or integers");
11848                goto err;
11849            }
11850        }
11851    }
11852    return new;
11853  err:
11854    Py_DECREF(new);
11855    return NULL;
11856}
11857
11858PyDoc_STRVAR(translate__doc__,
11859             "S.translate(table) -> str\n\
11860\n\
11861Return a copy of the string S, where all characters have been mapped\n\
11862through the given translation table, which must be a mapping of\n\
11863Unicode ordinals to Unicode ordinals, strings, or None.\n\
11864Unmapped characters are left untouched. Characters mapped to None\n\
11865are deleted.");
11866
11867static PyObject*
11868unicode_translate(PyObject *self, PyObject *table)
11869{
11870    return _PyUnicode_TranslateCharmap(self, table, "ignore");
11871}
11872
PyDoc_STRVAR(upper__doc__,
             "S.upper() -> str\n\
\n\
Return a copy of S converted to uppercase.");

/* Implementation of str.upper().

   Delegates to the file-local fixup() driver with fixupper as the
   callback and returns its result unchanged. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
11883
11884PyDoc_STRVAR(zfill__doc__,
11885             "S.zfill(width) -> str\n\
11886\n\
11887Pad a numeric string S with zeros on the left, to fill a field\n\
11888of the specified width. The string S is never truncated.");
11889
11890static PyObject *
11891unicode_zfill(PyUnicodeObject *self, PyObject *args)
11892{
11893    Py_ssize_t fill;
11894    PyUnicodeObject *u;
11895    Py_ssize_t width;
11896    int kind;
11897    void *data;
11898    Py_UCS4 chr;
11899
11900    if (PyUnicode_READY(self) == -1)
11901        return NULL;
11902
11903    if (!PyArg_ParseTuple(args, "n:zfill", &width))
11904        return NULL;
11905
11906    if (PyUnicode_GET_LENGTH(self) >= width) {
11907        if (PyUnicode_CheckExact(self)) {
11908            Py_INCREF(self);
11909            return (PyObject*) self;
11910        }
11911        else
11912            return PyUnicode_Copy((PyObject*)self);
11913    }
11914
11915    fill = width - _PyUnicode_LENGTH(self);
11916
11917    u = pad(self, fill, 0, '0');
11918
11919    if (u == NULL)
11920        return NULL;
11921
11922    kind = PyUnicode_KIND(u);
11923    data = PyUnicode_DATA(u);
11924    chr = PyUnicode_READ(kind, data, fill);
11925
11926    if (chr == '+' || chr == '-') {
11927        /* move sign to beginning of string */
11928        PyUnicode_WRITE(kind, data, 0, chr);
11929        PyUnicode_WRITE(kind, data, fill, '0');
11930    }
11931
11932    return (PyObject*) u;
11933}
11934
#if 0
/* Compiled out by the surrounding #if 0.  When enabled, this simply
   forwards `self` to PyUnicode_TransformDecimalAndSpaceToASCII() and
   returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
11942
11943PyDoc_STRVAR(startswith__doc__,
11944             "S.startswith(prefix[, start[, end]]) -> bool\n\
11945\n\
11946Return True if S starts with the specified prefix, False otherwise.\n\
11947With optional start, test S beginning at that position.\n\
11948With optional end, stop comparing S at that position.\n\
11949prefix can also be a tuple of strings to try.");
11950
11951static PyObject *
11952unicode_startswith(PyUnicodeObject *self,
11953                   PyObject *args)
11954{
11955    PyObject *subobj;
11956    PyUnicodeObject *substring;
11957    Py_ssize_t start = 0;
11958    Py_ssize_t end = PY_SSIZE_T_MAX;
11959    int result;
11960
11961    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
11962        return NULL;
11963    if (PyTuple_Check(subobj)) {
11964        Py_ssize_t i;
11965        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11966            substring = (PyUnicodeObject *)PyUnicode_FromObject(
11967                PyTuple_GET_ITEM(subobj, i));
11968            if (substring == NULL)
11969                return NULL;
11970            result = tailmatch(self, substring, start, end, -1);
11971            Py_DECREF(substring);
11972            if (result) {
11973                Py_RETURN_TRUE;
11974            }
11975        }
11976        /* nothing matched */
11977        Py_RETURN_FALSE;
11978    }
11979    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
11980    if (substring == NULL) {
11981        if (PyErr_ExceptionMatches(PyExc_TypeError))
11982            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11983                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
11984        return NULL;
11985    }
11986    result = tailmatch(self, substring, start, end, -1);
11987    Py_DECREF(substring);
11988    return PyBool_FromLong(result);
11989}
11990
11991
11992PyDoc_STRVAR(endswith__doc__,
11993             "S.endswith(suffix[, start[, end]]) -> bool\n\
11994\n\
11995Return True if S ends with the specified suffix, False otherwise.\n\
11996With optional start, test S beginning at that position.\n\
11997With optional end, stop comparing S at that position.\n\
11998suffix can also be a tuple of strings to try.");
11999
12000static PyObject *
12001unicode_endswith(PyUnicodeObject *self,
12002                 PyObject *args)
12003{
12004    PyObject *subobj;
12005    PyUnicodeObject *substring;
12006    Py_ssize_t start = 0;
12007    Py_ssize_t end = PY_SSIZE_T_MAX;
12008    int result;
12009
12010    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12011        return NULL;
12012    if (PyTuple_Check(subobj)) {
12013        Py_ssize_t i;
12014        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12015            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12016                PyTuple_GET_ITEM(subobj, i));
12017            if (substring == NULL)
12018                return NULL;
12019            result = tailmatch(self, substring, start, end, +1);
12020            Py_DECREF(substring);
12021            if (result) {
12022                Py_RETURN_TRUE;
12023            }
12024        }
12025        Py_RETURN_FALSE;
12026    }
12027    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12028    if (substring == NULL) {
12029        if (PyErr_ExceptionMatches(PyExc_TypeError))
12030            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12031                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12032        return NULL;
12033    }
12034    result = tailmatch(self, substring, start, end, +1);
12035    Py_DECREF(substring);
12036    return PyBool_FromLong(result);
12037}
12038
12039#include "stringlib/unicode_format.h"
12040
/* Docstring attached to the "format" entry of unicode_methods (whose
   implementation, do_string_format, is not defined in this part of the
   file -- presumably it comes from the stringlib header included above). */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

/* Docstring attached to the "format_map" entry of unicode_methods
   (implemented by do_string_format_map). */
PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
12052
12053static PyObject *
12054unicode__format__(PyObject* self, PyObject* args)
12055{
12056    PyObject *format_spec;
12057
12058    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12059        return NULL;
12060
12061    return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12062                                     PyUnicode_GET_LENGTH(format_spec));
12063}
12064
12065PyDoc_STRVAR(p_format__doc__,
12066             "S.__format__(format_spec) -> str\n\
12067\n\
12068Return a formatted version of S as described by format_spec.");
12069
12070static PyObject *
12071unicode__sizeof__(PyUnicodeObject *v)
12072{
12073    Py_ssize_t size;
12074
12075    /* If it's a compact object, account for base structure +
12076       character data. */
12077    if (PyUnicode_IS_COMPACT_ASCII(v))
12078        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12079    else if (PyUnicode_IS_COMPACT(v))
12080        size = sizeof(PyCompactUnicodeObject) +
12081            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12082    else {
12083        /* If it is a two-block object, account for base object, and
12084           for character block if present. */
12085        size = sizeof(PyUnicodeObject);
12086        if (_PyUnicode_DATA_ANY(v))
12087            size += (PyUnicode_GET_LENGTH(v) + 1) *
12088                PyUnicode_CHARACTER_SIZE(v);
12089    }
12090    /* If the wstr pointer is present, account for it unless it is shared
12091       with the data pointer. Check if the data is not shared. */
12092    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12093        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12094    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12095        size += PyUnicode_UTF8_LENGTH(v) + 1;
12096
12097    return PyLong_FromSsize_t(size);
12098}
12099
12100PyDoc_STRVAR(sizeof__doc__,
12101             "S.__sizeof__() -> size of S in memory, in bytes");
12102
12103static PyObject *
12104unicode_getnewargs(PyObject *v)
12105{
12106    PyObject *copy = PyUnicode_Copy(v);
12107    if (!copy)
12108        return NULL;
12109    return Py_BuildValue("(N)", copy);
12110}
12111
/* Method table for the str type.

   Each row is {name, C implementation, calling-convention flags,
   docstring}.  The table is terminated by the {NULL, NULL} sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry registered with METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
    /* Disabled entry: capwords is compiled out. */
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring (fourth field omitted). */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}   /* sentinel */
};
12175
12176static PyObject *
12177unicode_mod(PyObject *v, PyObject *w)
12178{
12179    if (!PyUnicode_Check(v))
12180        Py_RETURN_NOTIMPLEMENTED;
12181    return PyUnicode_Format(v, w);
12182}
12183
/* Number-protocol table for str.  Only the nb_remainder slot is filled
   in (with unicode_mod); the three slots before it are explicitly 0 and
   every slot after it is left to the implicit zero-initialization of the
   remaining struct members. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
12190
/* Sequence-protocol table for str.  Length, concatenation, repetition,
   item access and containment are provided; the slice and assignment
   slots are 0.  Members after sq_contains are implicitly zero. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
12201
12202static PyObject*
12203unicode_subscript(PyUnicodeObject* self, PyObject* item)
12204{
12205    if (PyUnicode_READY(self) == -1)
12206        return NULL;
12207
12208    if (PyIndex_Check(item)) {
12209        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12210        if (i == -1 && PyErr_Occurred())
12211            return NULL;
12212        if (i < 0)
12213            i += PyUnicode_GET_LENGTH(self);
12214        return unicode_getitem((PyObject*)self, i);
12215    } else if (PySlice_Check(item)) {
12216        Py_ssize_t start, stop, step, slicelength, cur, i;
12217        const Py_UNICODE* source_buf;
12218        Py_UNICODE* result_buf;
12219        PyObject* result;
12220
12221        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12222                                 &start, &stop, &step, &slicelength) < 0) {
12223            return NULL;
12224        }
12225
12226        if (slicelength <= 0) {
12227            return PyUnicode_New(0, 0);
12228        } else if (start == 0 && step == 1 &&
12229                   slicelength == PyUnicode_GET_LENGTH(self) &&
12230                   PyUnicode_CheckExact(self)) {
12231            Py_INCREF(self);
12232            return (PyObject *)self;
12233        } else if (step == 1) {
12234            return PyUnicode_Substring((PyObject*)self,
12235                                       start, start + slicelength);
12236        } else {
12237            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
12238            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12239                                                       sizeof(Py_UNICODE));
12240
12241            if (result_buf == NULL)
12242                return PyErr_NoMemory();
12243
12244            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12245                result_buf[i] = source_buf[cur];
12246            }
12247
12248            result = PyUnicode_FromUnicode(result_buf, slicelength);
12249            PyObject_FREE(result_buf);
12250            return result;
12251        }
12252    } else {
12253        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12254        return NULL;
12255    }
12256}
12257
/* Mapping-protocol table for str: length and subscript lookup are
   provided, while the assignment slot is a null function pointer. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
12263
12264
12265/* Helpers for PyUnicode_Format() */
12266
12267static PyObject *
12268getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12269{
12270    Py_ssize_t argidx = *p_argidx;
12271    if (argidx < arglen) {
12272        (*p_argidx)++;
12273        if (arglen < 0)
12274            return args;
12275        else
12276            return PyTuple_GetItem(args, argidx);
12277    }
12278    PyErr_SetString(PyExc_TypeError,
12279                    "not enough arguments for format string");
12280    return NULL;
12281}
12282
12283/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12284
12285static PyObject *
12286formatfloat(PyObject *v, int flags, int prec, int type)
12287{
12288    char *p;
12289    PyObject *result;
12290    double x;
12291
12292    x = PyFloat_AsDouble(v);
12293    if (x == -1.0 && PyErr_Occurred())
12294        return NULL;
12295
12296    if (prec < 0)
12297        prec = 6;
12298
12299    p = PyOS_double_to_string(x, type, prec,
12300                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12301    if (p == NULL)
12302        return NULL;
12303    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12304    PyMem_Free(p);
12305    return result;
12306}
12307
12308static PyObject*
12309formatlong(PyObject *val, int flags, int prec, int type)
12310{
12311    char *buf;
12312    int len;
12313    PyObject *str; /* temporary string object. */
12314    PyObject *result;
12315
12316    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12317    if (!str)
12318        return NULL;
12319    result = PyUnicode_DecodeASCII(buf, len, NULL);
12320    Py_DECREF(str);
12321    return result;
12322}
12323
12324static int
12325formatchar(Py_UCS4 *buf,
12326           size_t buflen,
12327           PyObject *v)
12328{
12329    /* presume that the buffer is at least 3 characters long */
12330    if (PyUnicode_Check(v)) {
12331        if (PyUnicode_GET_LENGTH(v) == 1) {
12332            buf[0] = PyUnicode_READ_CHAR(v, 0);
12333            buf[1] = '\0';
12334            return 1;
12335        }
12336        goto onError;
12337    }
12338    else {
12339        /* Integer input truncated to a character */
12340        long x;
12341        x = PyLong_AsLong(v);
12342        if (x == -1 && PyErr_Occurred())
12343            goto onError;
12344
12345        if (x < 0 || x > 0x10ffff) {
12346            PyErr_SetString(PyExc_OverflowError,
12347                            "%c arg not in range(0x110000)");
12348            return -1;
12349        }
12350
12351        buf[0] = (Py_UCS4) x;
12352        buf[1] = '\0';
12353        return 1;
12354    }
12355
12356  onError:
12357    PyErr_SetString(PyExc_TypeError,
12358                    "%c requires int or char");
12359    return -1;
12360}
12361
12362/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
12363   FORMATBUFLEN is the length of the buffer in which chars are formatted.
12364*/
12365#define FORMATBUFLEN (size_t)10
12366
12367PyObject *
12368PyUnicode_Format(PyObject *format, PyObject *args)
12369{
12370    void *fmt;
12371    int fmtkind;
12372    PyObject *result;
12373    Py_UCS4 *res, *res0;
12374    Py_UCS4 max;
12375    int kind;
12376    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12377    int args_owned = 0;
12378    PyObject *dict = NULL;
12379    PyUnicodeObject *uformat;
12380
12381    if (format == NULL || args == NULL) {
12382        PyErr_BadInternalCall();
12383        return NULL;
12384    }
12385    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12386    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12387        return NULL;
12388    fmt = PyUnicode_DATA(uformat);
12389    fmtkind = PyUnicode_KIND(uformat);
12390    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12391    fmtpos = 0;
12392
12393    reslen = rescnt = fmtcnt + 100;
12394    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12395    if (res0 == NULL) {
12396        PyErr_NoMemory();
12397        goto onError;
12398    }
12399
12400    if (PyTuple_Check(args)) {
12401        arglen = PyTuple_Size(args);
12402        argidx = 0;
12403    }
12404    else {
12405        arglen = -1;
12406        argidx = -2;
12407    }
12408    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12409        !PyUnicode_Check(args))
12410        dict = args;
12411
12412    while (--fmtcnt >= 0) {
12413        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12414            if (--rescnt < 0) {
12415                rescnt = fmtcnt + 100;
12416                reslen += rescnt;
12417                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12418                if (res0 == NULL){
12419                    PyErr_NoMemory();
12420                    goto onError;
12421                }
12422                res = res0 + reslen - rescnt;
12423                --rescnt;
12424            }
12425            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12426        }
12427        else {
12428            /* Got a format specifier */
12429            int flags = 0;
12430            Py_ssize_t width = -1;
12431            int prec = -1;
12432            Py_UCS4 c = '\0';
12433            Py_UCS4 fill;
12434            int isnumok;
12435            PyObject *v = NULL;
12436            PyObject *temp = NULL;
12437            void *pbuf;
12438            Py_ssize_t pindex;
12439            Py_UNICODE sign;
12440            Py_ssize_t len, len1;
12441            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12442
12443            fmtpos++;
12444            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12445                Py_ssize_t keystart;
12446                Py_ssize_t keylen;
12447                PyObject *key;
12448                int pcount = 1;
12449
12450                if (dict == NULL) {
12451                    PyErr_SetString(PyExc_TypeError,
12452                                    "format requires a mapping");
12453                    goto onError;
12454                }
12455                ++fmtpos;
12456                --fmtcnt;
12457                keystart = fmtpos;
12458                /* Skip over balanced parentheses */
12459                while (pcount > 0 && --fmtcnt >= 0) {
12460                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12461                        --pcount;
12462                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12463                        ++pcount;
12464                    fmtpos++;
12465                }
12466                keylen = fmtpos - keystart - 1;
12467                if (fmtcnt < 0 || pcount > 0) {
12468                    PyErr_SetString(PyExc_ValueError,
12469                                    "incomplete format key");
12470                    goto onError;
12471                }
12472                key = PyUnicode_Substring((PyObject*)uformat,
12473                                          keystart, keystart + keylen);
12474                if (key == NULL)
12475                    goto onError;
12476                if (args_owned) {
12477                    Py_DECREF(args);
12478                    args_owned = 0;
12479                }
12480                args = PyObject_GetItem(dict, key);
12481                Py_DECREF(key);
12482                if (args == NULL) {
12483                    goto onError;
12484                }
12485                args_owned = 1;
12486                arglen = -1;
12487                argidx = -2;
12488            }
12489            while (--fmtcnt >= 0) {
12490                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12491                case '-': flags |= F_LJUST; continue;
12492                case '+': flags |= F_SIGN; continue;
12493                case ' ': flags |= F_BLANK; continue;
12494                case '#': flags |= F_ALT; continue;
12495                case '0': flags |= F_ZERO; continue;
12496                }
12497                break;
12498            }
12499            if (c == '*') {
12500                v = getnextarg(args, arglen, &argidx);
12501                if (v == NULL)
12502                    goto onError;
12503                if (!PyLong_Check(v)) {
12504                    PyErr_SetString(PyExc_TypeError,
12505                                    "* wants int");
12506                    goto onError;
12507                }
12508                width = PyLong_AsLong(v);
12509                if (width == -1 && PyErr_Occurred())
12510                    goto onError;
12511                if (width < 0) {
12512                    flags |= F_LJUST;
12513                    width = -width;
12514                }
12515                if (--fmtcnt >= 0)
12516                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12517            }
12518            else if (c >= '0' && c <= '9') {
12519                width = c - '0';
12520                while (--fmtcnt >= 0) {
12521                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12522                    if (c < '0' || c > '9')
12523                        break;
12524                    if ((width*10) / 10 != width) {
12525                        PyErr_SetString(PyExc_ValueError,
12526                                        "width too big");
12527                        goto onError;
12528                    }
12529                    width = width*10 + (c - '0');
12530                }
12531            }
12532            if (c == '.') {
12533                prec = 0;
12534                if (--fmtcnt >= 0)
12535                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12536                if (c == '*') {
12537                    v = getnextarg(args, arglen, &argidx);
12538                    if (v == NULL)
12539                        goto onError;
12540                    if (!PyLong_Check(v)) {
12541                        PyErr_SetString(PyExc_TypeError,
12542                                        "* wants int");
12543                        goto onError;
12544                    }
12545                    prec = PyLong_AsLong(v);
12546                    if (prec == -1 && PyErr_Occurred())
12547                        goto onError;
12548                    if (prec < 0)
12549                        prec = 0;
12550                    if (--fmtcnt >= 0)
12551                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12552                }
12553                else if (c >= '0' && c <= '9') {
12554                    prec = c - '0';
12555                    while (--fmtcnt >= 0) {
12556                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12557                        if (c < '0' || c > '9')
12558                            break;
12559                        if ((prec*10) / 10 != prec) {
12560                            PyErr_SetString(PyExc_ValueError,
12561                                            "prec too big");
12562                            goto onError;
12563                        }
12564                        prec = prec*10 + (c - '0');
12565                    }
12566                }
12567            } /* prec */
12568            if (fmtcnt >= 0) {
12569                if (c == 'h' || c == 'l' || c == 'L') {
12570                    if (--fmtcnt >= 0)
12571                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12572                }
12573            }
12574            if (fmtcnt < 0) {
12575                PyErr_SetString(PyExc_ValueError,
12576                                "incomplete format");
12577                goto onError;
12578            }
12579            if (c != '%') {
12580                v = getnextarg(args, arglen, &argidx);
12581                if (v == NULL)
12582                    goto onError;
12583            }
12584            sign = 0;
12585            fill = ' ';
12586            switch (c) {
12587
12588            case '%':
12589                pbuf = formatbuf;
12590                kind = PyUnicode_4BYTE_KIND;
12591                /* presume that buffer length is at least 1 */
12592                PyUnicode_WRITE(kind, pbuf, 0, '%');
12593                len = 1;
12594                break;
12595
12596            case 's':
12597            case 'r':
12598            case 'a':
12599                if (PyUnicode_CheckExact(v) && c == 's') {
12600                    temp = v;
12601                    Py_INCREF(temp);
12602                }
12603                else {
12604                    if (c == 's')
12605                        temp = PyObject_Str(v);
12606                    else if (c == 'r')
12607                        temp = PyObject_Repr(v);
12608                    else
12609                        temp = PyObject_ASCII(v);
12610                    if (temp == NULL)
12611                        goto onError;
12612                    if (PyUnicode_Check(temp))
12613                        /* nothing to do */;
12614                    else {
12615                        Py_DECREF(temp);
12616                        PyErr_SetString(PyExc_TypeError,
12617                                        "%s argument has non-string str()");
12618                        goto onError;
12619                    }
12620                }
12621                if (PyUnicode_READY(temp) == -1) {
12622                    Py_CLEAR(temp);
12623                    goto onError;
12624                }
12625                pbuf = PyUnicode_DATA(temp);
12626                kind = PyUnicode_KIND(temp);
12627                len = PyUnicode_GET_LENGTH(temp);
12628                if (prec >= 0 && len > prec)
12629                    len = prec;
12630                break;
12631
12632            case 'i':
12633            case 'd':
12634            case 'u':
12635            case 'o':
12636            case 'x':
12637            case 'X':
12638                isnumok = 0;
12639                if (PyNumber_Check(v)) {
12640                    PyObject *iobj=NULL;
12641
12642                    if (PyLong_Check(v)) {
12643                        iobj = v;
12644                        Py_INCREF(iobj);
12645                    }
12646                    else {
12647                        iobj = PyNumber_Long(v);
12648                    }
12649                    if (iobj!=NULL) {
12650                        if (PyLong_Check(iobj)) {
12651                            isnumok = 1;
12652                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12653                            Py_DECREF(iobj);
12654                            if (!temp)
12655                                goto onError;
12656                            if (PyUnicode_READY(temp) == -1) {
12657                                Py_CLEAR(temp);
12658                                goto onError;
12659                            }
12660                            pbuf = PyUnicode_DATA(temp);
12661                            kind = PyUnicode_KIND(temp);
12662                            len = PyUnicode_GET_LENGTH(temp);
12663                            sign = 1;
12664                        }
12665                        else {
12666                            Py_DECREF(iobj);
12667                        }
12668                    }
12669                }
12670                if (!isnumok) {
12671                    PyErr_Format(PyExc_TypeError,
12672                                 "%%%c format: a number is required, "
12673                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12674                    goto onError;
12675                }
12676                if (flags & F_ZERO)
12677                    fill = '0';
12678                break;
12679
12680            case 'e':
12681            case 'E':
12682            case 'f':
12683            case 'F':
12684            case 'g':
12685            case 'G':
12686                temp = formatfloat(v, flags, prec, c);
12687                if (!temp)
12688                    goto onError;
12689                if (PyUnicode_READY(temp) == -1) {
12690                    Py_CLEAR(temp);
12691                    goto onError;
12692                }
12693                pbuf = PyUnicode_DATA(temp);
12694                kind = PyUnicode_KIND(temp);
12695                len = PyUnicode_GET_LENGTH(temp);
12696                sign = 1;
12697                if (flags & F_ZERO)
12698                    fill = '0';
12699                break;
12700
12701            case 'c':
12702                pbuf = formatbuf;
12703                kind = PyUnicode_4BYTE_KIND;
12704                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12705                if (len < 0)
12706                    goto onError;
12707                break;
12708
12709            default:
12710                PyErr_Format(PyExc_ValueError,
12711                             "unsupported format character '%c' (0x%x) "
12712                             "at index %zd",
12713                             (31<=c && c<=126) ? (char)c : '?',
12714                             (int)c,
12715                             fmtpos - 1);
12716                goto onError;
12717            }
12718            /* pbuf is initialized here. */
12719            pindex = 0;
12720            if (sign) {
12721                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12722                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
12723                    sign = PyUnicode_READ(kind, pbuf, pindex++);
12724                    len--;
12725                }
12726                else if (flags & F_SIGN)
12727                    sign = '+';
12728                else if (flags & F_BLANK)
12729                    sign = ' ';
12730                else
12731                    sign = 0;
12732            }
12733            if (width < len)
12734                width = len;
12735            if (rescnt - (sign != 0) < width) {
12736                reslen -= rescnt;
12737                rescnt = width + fmtcnt + 100;
12738                reslen += rescnt;
12739                if (reslen < 0) {
12740                    Py_XDECREF(temp);
12741                    PyErr_NoMemory();
12742                    goto onError;
12743                }
12744                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12745                if (res0 == 0) {
12746                    PyErr_NoMemory();
12747                    Py_XDECREF(temp);
12748                    goto onError;
12749                }
12750                res = res0 + reslen - rescnt;
12751            }
12752            if (sign) {
12753                if (fill != ' ')
12754                    *res++ = sign;
12755                rescnt--;
12756                if (width > len)
12757                    width--;
12758            }
12759            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12760                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12761                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12762                if (fill != ' ') {
12763                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12764                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12765                }
12766                rescnt -= 2;
12767                width -= 2;
12768                if (width < 0)
12769                    width = 0;
12770                len -= 2;
12771            }
12772            if (width > len && !(flags & F_LJUST)) {
12773                do {
12774                    --rescnt;
12775                    *res++ = fill;
12776                } while (--width > len);
12777            }
12778            if (fill == ' ') {
12779                if (sign)
12780                    *res++ = sign;
12781                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12782                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12783                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12784                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12785                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12786                }
12787            }
12788            /* Copy all characters, preserving len */
12789            len1 = len;
12790            while (len1--) {
12791                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12792                rescnt--;
12793            }
12794            while (--width >= len) {
12795                --rescnt;
12796                *res++ = ' ';
12797            }
12798            if (dict && (argidx < arglen) && c != '%') {
12799                PyErr_SetString(PyExc_TypeError,
12800                                "not all arguments converted during string formatting");
12801                Py_XDECREF(temp);
12802                goto onError;
12803            }
12804            Py_XDECREF(temp);
12805        } /* '%' */
12806    } /* until end */
12807    if (argidx < arglen && !dict) {
12808        PyErr_SetString(PyExc_TypeError,
12809                        "not all arguments converted during string formatting");
12810        goto onError;
12811    }
12812
12813
12814    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12815        if (*res > max)
12816            max = *res;
12817    result = PyUnicode_New(reslen - rescnt, max);
12818    if (!result)
12819        goto onError;
12820    kind = PyUnicode_KIND(result);
12821    for (res = res0; res < res0+reslen-rescnt; res++)
12822        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12823    PyMem_Free(res0);
12824    if (args_owned) {
12825        Py_DECREF(args);
12826    }
12827    Py_DECREF(uformat);
12828    return (PyObject *)result;
12829
12830  onError:
12831    PyMem_Free(res0);
12832    Py_DECREF(uformat);
12833    if (args_owned) {
12834        Py_DECREF(args);
12835    }
12836    return NULL;
12837}
12838
12839static PyObject *
12840unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12841
12842static PyObject *
12843unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12844{
12845    PyObject *x = NULL;
12846    static char *kwlist[] = {"object", "encoding", "errors", 0};
12847    char *encoding = NULL;
12848    char *errors = NULL;
12849
12850    if (type != &PyUnicode_Type)
12851        return unicode_subtype_new(type, args, kwds);
12852    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
12853                                     kwlist, &x, &encoding, &errors))
12854        return NULL;
12855    if (x == NULL)
12856        return (PyObject *)PyUnicode_New(0, 0);
12857    if (encoding == NULL && errors == NULL)
12858        return PyObject_Str(x);
12859    else
12860        return PyUnicode_FromEncodedObject(x, encoding, errors);
12861}
12862
12863static PyObject *
12864unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12865{
12866    PyUnicodeObject *unicode, *self;
12867    Py_ssize_t length, char_size;
12868    int share_wstr, share_utf8;
12869    unsigned int kind;
12870    void *data;
12871
12872    assert(PyType_IsSubtype(type, &PyUnicode_Type));
12873
12874    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12875    if (unicode == NULL)
12876        return NULL;
12877    assert(_PyUnicode_CHECK(unicode));
12878    if (_PyUnicode_READY_REPLACE(&unicode))
12879        return NULL;
12880
12881    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12882    if (self == NULL) {
12883        Py_DECREF(unicode);
12884        return NULL;
12885    }
12886    kind = PyUnicode_KIND(unicode);
12887    length = PyUnicode_GET_LENGTH(unicode);
12888
12889    _PyUnicode_LENGTH(self) = length;
12890    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12891    _PyUnicode_STATE(self).interned = 0;
12892    _PyUnicode_STATE(self).kind = kind;
12893    _PyUnicode_STATE(self).compact = 0;
12894    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
12895    _PyUnicode_STATE(self).ready = 1;
12896    _PyUnicode_WSTR(self) = NULL;
12897    _PyUnicode_UTF8_LENGTH(self) = 0;
12898    _PyUnicode_UTF8(self) = NULL;
12899    _PyUnicode_WSTR_LENGTH(self) = 0;
12900    _PyUnicode_DATA_ANY(self) = NULL;
12901
12902    share_utf8 = 0;
12903    share_wstr = 0;
12904    if (kind == PyUnicode_1BYTE_KIND) {
12905        char_size = 1;
12906        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12907            share_utf8 = 1;
12908    }
12909    else if (kind == PyUnicode_2BYTE_KIND) {
12910        char_size = 2;
12911        if (sizeof(wchar_t) == 2)
12912            share_wstr = 1;
12913    }
12914    else {
12915        assert(kind == PyUnicode_4BYTE_KIND);
12916        char_size = 4;
12917        if (sizeof(wchar_t) == 4)
12918            share_wstr = 1;
12919    }
12920
12921    /* Ensure we won't overflow the length. */
12922    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12923        PyErr_NoMemory();
12924        goto onError;
12925    }
12926    data = PyObject_MALLOC((length + 1) * char_size);
12927    if (data == NULL) {
12928        PyErr_NoMemory();
12929        goto onError;
12930    }
12931
12932    _PyUnicode_DATA_ANY(self) = data;
12933    if (share_utf8) {
12934        _PyUnicode_UTF8_LENGTH(self) = length;
12935        _PyUnicode_UTF8(self) = data;
12936    }
12937    if (share_wstr) {
12938        _PyUnicode_WSTR_LENGTH(self) = length;
12939        _PyUnicode_WSTR(self) = (wchar_t *)data;
12940    }
12941
12942    Py_MEMCPY(data, PyUnicode_DATA(unicode),
12943              PyUnicode_KIND_SIZE(kind, length + 1));
12944    Py_DECREF(unicode);
12945    return (PyObject *)self;
12946
12947onError:
12948    Py_DECREF(unicode);
12949    Py_DECREF(self);
12950    return NULL;
12951}
12952
/* Docstring of the str type; referenced as tp_doc by PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
12959
12960static PyObject *unicode_iter(PyObject *seq);
12961
/* Type object for str.  It is subclassable (Py_TPFLAGS_BASETYPE), is
   flagged as a unicode subclass, derives from object, and is created
   through unicode_new. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13005
13006/* Initialize the Unicode implementation */
13007
13008void _PyUnicode_Init(void)
13009{
13010    int i;
13011
13012    /* XXX - move this array to unicodectype.c ? */
13013    Py_UCS2 linebreak[] = {
13014        0x000A, /* LINE FEED */
13015        0x000D, /* CARRIAGE RETURN */
13016        0x001C, /* FILE SEPARATOR */
13017        0x001D, /* GROUP SEPARATOR */
13018        0x001E, /* RECORD SEPARATOR */
13019        0x0085, /* NEXT LINE */
13020        0x2028, /* LINE SEPARATOR */
13021        0x2029, /* PARAGRAPH SEPARATOR */
13022    };
13023
13024    /* Init the implementation */
13025    unicode_empty = PyUnicode_New(0, 0);
13026    if (!unicode_empty)
13027        Py_FatalError("Can't create empty string");
13028
13029    for (i = 0; i < 256; i++)
13030        unicode_latin1[i] = NULL;
13031    if (PyType_Ready(&PyUnicode_Type) < 0)
13032        Py_FatalError("Can't initialize 'unicode'");
13033
13034    /* initialize the linebreak bloom filter */
13035    bloom_linebreak = make_bloom_mask(
13036        PyUnicode_2BYTE_KIND, linebreak,
13037        Py_ARRAY_LENGTH(linebreak));
13038
13039    PyType_Ready(&EncodingMapType);
13040}
13041
13042/* Finalize the Unicode implementation */
13043
/* Clear the Unicode free list.  This implementation releases nothing
   and always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
13049
13050void
13051_PyUnicode_Fini(void)
13052{
13053    int i;
13054
13055    Py_XDECREF(unicode_empty);
13056    unicode_empty = NULL;
13057
13058    for (i = 0; i < 256; i++) {
13059        if (unicode_latin1[i]) {
13060            Py_DECREF(unicode_latin1[i]);
13061            unicode_latin1[i] = NULL;
13062        }
13063    }
13064    (void)PyUnicode_ClearFreeList();
13065}
13066
/* Intern the string referenced by *p, in place.

   Only exact str objects that are not already interned are handled;
   NULL, non-str objects (in non-debug builds) and subclass instances are
   left untouched.  If the `interned` dict already holds an entry for the
   string, *p is replaced by that entry (the old reference is dropped and
   a new one to the entry is taken).  Otherwise the string is inserted as
   both key and value, its reference count is lowered by two so the dict
   does not keep it alive, and it is marked SSTATE_INTERNED_MORTAL.

   The function reports no errors: failures to create or update the dict
   are cleared with PyErr_Clear() and *p is left as it was. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    if (_PyUnicode_READY_REPLACE(p)) {
        assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
        return;
    }
    /* Reload s: the call above is given p and may have changed *p. */
    s = (PyUnicodeObject *)(*p);
    /* The interned dict is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
        t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

        /* Already interned: hand the existing entry back through *p. */
        if (t) {
            Py_INCREF(t);
            Py_DECREF(*p);
            *p = t;
            return;
        }

    /* recursion_critical is set only around the insertion and is reset
       on both the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
13123
13124void
13125PyUnicode_InternImmortal(PyObject **p)
13126{
13127    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13128
13129    PyUnicode_InternInPlace(p);
13130    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13131        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13132        Py_INCREF(*p);
13133    }
13134}
13135
13136PyObject *
13137PyUnicode_InternFromString(const char *cp)
13138{
13139    PyObject *s = PyUnicode_FromString(cp);
13140    if (s == NULL)
13141        return NULL;
13142    PyUnicode_InternInPlace(&s);
13143    return s;
13144}
13145
13146void
13147_Py_ReleaseInternedUnicodeStrings(void)
13148{
13149    PyObject *keys;
13150    PyUnicodeObject *s;
13151    Py_ssize_t i, n;
13152    Py_ssize_t immortal_size = 0, mortal_size = 0;
13153
13154    if (interned == NULL || !PyDict_Check(interned))
13155        return;
13156    keys = PyDict_Keys(interned);
13157    if (keys == NULL || !PyList_Check(keys)) {
13158        PyErr_Clear();
13159        return;
13160    }
13161
13162    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13163       detector, interned unicode strings are not forcibly deallocated;
13164       rather, we give them their stolen references back, and then clear
13165       and DECREF the interned dict. */
13166
13167    n = PyList_GET_SIZE(keys);
13168    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13169            n);
13170    for (i = 0; i < n; i++) {
13171        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13172        if (PyUnicode_READY(s) == -1)
13173            fprintf(stderr, "could not ready string\n");
13174        switch (PyUnicode_CHECK_INTERNED(s)) {
13175        case SSTATE_NOT_INTERNED:
13176            /* XXX Shouldn't happen */
13177            break;
13178        case SSTATE_INTERNED_IMMORTAL:
13179            Py_REFCNT(s) += 1;
13180            immortal_size += PyUnicode_GET_LENGTH(s);
13181            break;
13182        case SSTATE_INTERNED_MORTAL:
13183            Py_REFCNT(s) += 2;
13184            mortal_size += PyUnicode_GET_LENGTH(s);
13185            break;
13186        default:
13187            Py_FatalError("Inconsistent interned string state.");
13188        }
13189        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13190    }
13191    fprintf(stderr, "total size of all interned strings: "
13192            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13193            "mortal/immortal\n", mortal_size, immortal_size);
13194    Py_DECREF(keys);
13195    PyDict_Clear(interned);
13196    Py_DECREF(interned);
13197    interned = NULL;
13198}
13199
13200
13201/********************* Unicode Iterator **************************/
13202
/* State of a str iterator: the string being walked and the index of the
   next character to return. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to yield */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13208
/* Deallocator for str iterators: untrack the iterator from the GC,
   release the reference to the underlying string (if still held), then
   free the iterator object itself. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13216
/* GC traversal for str iterators: the only object reference held is the
   underlying string, it_seq. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13223
13224static PyObject *
13225unicodeiter_next(unicodeiterobject *it)
13226{
13227    PyUnicodeObject *seq;
13228    PyObject *item;
13229
13230    assert(it != NULL);
13231    seq = it->it_seq;
13232    if (seq == NULL)
13233        return NULL;
13234    assert(_PyUnicode_CHECK(seq));
13235
13236    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13237        int kind = PyUnicode_KIND(seq);
13238        void *data = PyUnicode_DATA(seq);
13239        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13240        item = PyUnicode_FromOrdinal(chr);
13241        if (item != NULL)
13242            ++it->it_index;
13243        return item;
13244    }
13245
13246    Py_DECREF(seq);
13247    it->it_seq = NULL;
13248    return NULL;
13249}
13250
13251static PyObject *
13252unicodeiter_len(unicodeiterobject *it)
13253{
13254    Py_ssize_t len = 0;
13255    if (it->it_seq)
13256        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13257    return PyLong_FromSsize_t(len);
13258}
13259
/* Docstring shared by the __length_hint__ entry below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of str iterators (tp_methods of PyUnicodeIter_Type); the
   only entry is __length_hint__, implemented by unicodeiter_len. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13267
/* Type object for str iterators ("str_iterator").  Instances are GC
   tracked, iterate over themselves (PyObject_SelfIter) and produce items
   through unicodeiter_next. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
13300
13301static PyObject *
13302unicode_iter(PyObject *seq)
13303{
13304    unicodeiterobject *it;
13305
13306    if (!PyUnicode_Check(seq)) {
13307        PyErr_BadInternalCall();
13308        return NULL;
13309    }
13310    if (PyUnicode_READY(seq) == -1)
13311        return NULL;
13312    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13313    if (it == NULL)
13314        return NULL;
13315    it->it_index = 0;
13316    Py_INCREF(seq);
13317    it->it_seq = (PyUnicodeObject *)seq;
13318    _PyObject_GC_TRACK(it);
13319    return (PyObject *)it;
13320}
13321
/* Include uniops.h twice, each time with a different name prefix and
   element type: first UNIOP(x) expands to Py_UNICODE_x with element type
   Py_UNICODE, then to Py_UCS4_x with element type Py_UCS4.  The macros
   are undefined after each inclusion so they do not leak further.
   NOTE(review): what uniops.h defines is not visible here -- presumably
   one set of helper functions per prefix; confirm in that header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
13332
13333Py_UNICODE*
13334PyUnicode_AsUnicodeCopy(PyObject *object)
13335{
13336    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13337    Py_UNICODE *copy;
13338    Py_ssize_t size;
13339
13340    if (!PyUnicode_Check(unicode)) {
13341        PyErr_BadArgument();
13342        return NULL;
13343    }
13344    /* Ensure we won't overflow the size. */
13345    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13346        PyErr_NoMemory();
13347        return NULL;
13348    }
13349    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13350    size *= sizeof(Py_UNICODE);
13351    copy = PyMem_Malloc(size);
13352    if (copy == NULL) {
13353        PyErr_NoMemory();
13354        return NULL;
13355    }
13356    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13357    return copy;
13358}
13359
13360/* A _string module, to export formatter_parser and formatter_field_name_split
13361   to the string.Formatter class implemented in Python. */
13362
/* Functions exported by the _string module.  Both take exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13370
/* Module definition for _string: a name, a short docstring and the
   method table above; the remaining optional members are left unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* NOTE(review): presumably m_size -- confirm */
    _string_methods,                    /* module-level functions */
    NULL,
    NULL,
    NULL,
    NULL
};
13382
/* Module initialization function for _string: creates the module from
   _string_module and returns whatever PyModule_Create() returns. */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
13388
13389
13390#ifdef __cplusplus
13391}
13392#endif
13393