unicodeobject.c revision 9e9d689d85e60193494603e65bdbac7717187058
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
49/* Limit for the Unicode object free list */
50
51#define PyUnicode_MAXFREELIST       1024
52
53/* Limit for the Unicode object free list stay alive optimization.
54
55   The implementation will keep allocated Unicode memory intact for
56   all objects on the free list having a size less than this
57   limit. This reduces malloc() overhead for small Unicode objects.
58
59   At worst this will result in PyUnicode_MAXFREELIST *
60   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
61   malloc()-overhead) bytes of unused garbage.
62
63   Setting the limit to 0 effectively turns the feature off.
64
65   Note: This is an experimental feature ! If you get core dumps when
66   using Unicode objects, turn this feature off.
67
68*/
69
70#define KEEPALIVE_SIZE_LIMIT       9
71
72/* Endianness switches; defaults to little endian */
73
74#ifdef WORDS_BIGENDIAN
75# define BYTEORDER_IS_BIG_ENDIAN
76#else
77# define BYTEORDER_IS_LITTLE_ENDIAN
78#endif
79
80/* --- Globals ------------------------------------------------------------
81
82   The globals are initialized by the _PyUnicode_Init() API and should
83   not be used before calling that API.
84
85*/
86
87
88#ifdef __cplusplus
89extern "C" {
90#endif
91
92#ifdef Py_DEBUG
93#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
94#else
95#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
96#endif
97
98#define _PyUnicode_UTF8(op)                             \
99    (((PyCompactUnicodeObject*)(op))->utf8)
100#define PyUnicode_UTF8(op)                              \
101    (assert(_PyUnicode_CHECK(op)),                      \
102     assert(PyUnicode_IS_READY(op)),                    \
103     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
104         ((char*)((PyASCIIObject*)(op) + 1)) :          \
105         _PyUnicode_UTF8(op))
106#define _PyUnicode_UTF8_LENGTH(op)                      \
107    (((PyCompactUnicodeObject*)(op))->utf8_length)
108#define PyUnicode_UTF8_LENGTH(op)                       \
109    (assert(_PyUnicode_CHECK(op)),                      \
110     assert(PyUnicode_IS_READY(op)),                    \
111     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
112         ((PyASCIIObject*)(op))->length :               \
113         _PyUnicode_UTF8_LENGTH(op))
114#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
115#define _PyUnicode_WSTR_LENGTH(op) (((PyCompactUnicodeObject*)(op))->wstr_length)
116#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
117#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
118#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
119#define _PyUnicode_KIND(op)                             \
120    (assert(_PyUnicode_CHECK(op)),                      \
121     ((PyASCIIObject *)(op))->state.kind)
122#define _PyUnicode_GET_LENGTH(op)                       \
123    (assert(_PyUnicode_CHECK(op)),                      \
124     ((PyASCIIObject *)(op))->length)
125#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)
126
127#undef PyUnicode_READY
128#define PyUnicode_READY(op)                             \
129    (assert(_PyUnicode_CHECK(op)),                      \
130     (PyUnicode_IS_READY(op) ?                          \
131      0 : _PyUnicode_Ready((PyObject *)(op))))
132
/* Ensure that the unicode object referenced through p_obj is ready.

   p_obj is a pointer to an object pointer.  The macro evaluates to 0 when
   *p_obj is already ready; otherwise it evaluates to the result of
   _PyUnicode_ReadyReplace(p_obj).  The argument is evaluated more than
   once, so it must not have side effects.  It is fully parenthesized so
   that expression arguments (e.g. "objs + i") dereference correctly. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
137
138#define _PyUnicode_SHARE_UTF8(op)                       \
139    (assert(_PyUnicode_CHECK(op)),                      \
140     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
141     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same pointer as its character data
   (PyUnicode_DATA), i.e. the wstr representation is shared with the data
   buffer.  Uses only the macro parameter, so it works regardless of how the
   caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
145
146/* true if the Unicode object has an allocated UTF-8 memory block
147   (not shared with other data) */
148#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
149    (assert(_PyUnicode_CHECK(op)),                      \
150     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
151      && _PyUnicode_UTF8(op)                            \
152      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))
153
154/* true if the Unicode object has an allocated wstr memory block
155   (not shared with other data) */
156#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
157    (assert(_PyUnicode_CHECK(op)),                      \
158     (_PyUnicode_WSTR(op) &&                            \
159      (!PyUnicode_IS_READY(op) ||                       \
160       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
161
162/* Generic helper macro to convert characters of different types.
163   from_type and to_type have to be valid type names, begin and end
164   are pointers to the source characters which should be of type
165   "from_type *".  to is a pointer of type "to_type *" and points to the
166   buffer where the result characters are written to. */
167#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
168    do {                                                \
169        const from_type *iter_; to_type *to_;           \
170        for (iter_ = (begin), to_ = (to_type *)(to);    \
171             iter_ < (end);                             \
172             ++iter_, ++to_) {                          \
173            *to_ = (to_type)*iter_;                     \
174        }                                               \
175    } while (0)
176
177/* The Unicode string has been modified: reset the hash */
178#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
179
180/* This dictionary holds all interned unicode strings.  Note that references
181   to strings in this dictionary are *not* counted in the string's ob_refcnt.
182   When the interned string reaches a refcnt of 0 the string deallocation
183   function will delete the reference from this dictionary.
184
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
187*/
188static PyObject *interned;
189
190/* The empty Unicode object is shared to improve performance. */
191static PyObject *unicode_empty;
192
193/* Single character Unicode strings in the Latin-1 range are being
194   shared as well. */
195static PyObject *unicode_latin1[256];
196
197/* Fast detection of the most frequent whitespace characters */
198const unsigned char _Py_ascii_whitespace[] = {
199    0, 0, 0, 0, 0, 0, 0, 0,
200/*     case 0x0009: * CHARACTER TABULATION */
201/*     case 0x000A: * LINE FEED */
202/*     case 0x000B: * LINE TABULATION */
203/*     case 0x000C: * FORM FEED */
204/*     case 0x000D: * CARRIAGE RETURN */
205    0, 1, 1, 1, 1, 1, 0, 0,
206    0, 0, 0, 0, 0, 0, 0, 0,
207/*     case 0x001C: * FILE SEPARATOR */
208/*     case 0x001D: * GROUP SEPARATOR */
209/*     case 0x001E: * RECORD SEPARATOR */
210/*     case 0x001F: * UNIT SEPARATOR */
211    0, 0, 0, 0, 1, 1, 1, 1,
212/*     case 0x0020: * SPACE */
213    1, 0, 0, 0, 0, 0, 0, 0,
214    0, 0, 0, 0, 0, 0, 0, 0,
215    0, 0, 0, 0, 0, 0, 0, 0,
216    0, 0, 0, 0, 0, 0, 0, 0,
217
218    0, 0, 0, 0, 0, 0, 0, 0,
219    0, 0, 0, 0, 0, 0, 0, 0,
220    0, 0, 0, 0, 0, 0, 0, 0,
221    0, 0, 0, 0, 0, 0, 0, 0,
222    0, 0, 0, 0, 0, 0, 0, 0,
223    0, 0, 0, 0, 0, 0, 0, 0,
224    0, 0, 0, 0, 0, 0, 0, 0,
225    0, 0, 0, 0, 0, 0, 0, 0
226};
227
228/* forward */
229static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
230static PyObject* get_latin1_char(unsigned char ch);
231
232static PyObject *
233unicode_encode_call_errorhandler(const char *errors,
234       PyObject **errorHandler,const char *encoding, const char *reason,
235       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
236       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
237
238static void
239raise_encode_exception(PyObject **exceptionObject,
240                       const char *encoding,
241                       const Py_UNICODE *unicode, Py_ssize_t size,
242                       Py_ssize_t startpos, Py_ssize_t endpos,
243                       const char *reason);
244
/* Fast detection of the most frequent linebreak characters (same scheme
   as _Py_ascii_whitespace above) */
246static unsigned char ascii_linebreak[] = {
247    0, 0, 0, 0, 0, 0, 0, 0,
248/*         0x000A, * LINE FEED */
249/*         0x000B, * LINE TABULATION */
250/*         0x000C, * FORM FEED */
251/*         0x000D, * CARRIAGE RETURN */
252    0, 0, 1, 1, 1, 1, 0, 0,
253    0, 0, 0, 0, 0, 0, 0, 0,
254/*         0x001C, * FILE SEPARATOR */
255/*         0x001D, * GROUP SEPARATOR */
256/*         0x001E, * RECORD SEPARATOR */
257    0, 0, 0, 0, 1, 1, 1, 0,
258    0, 0, 0, 0, 0, 0, 0, 0,
259    0, 0, 0, 0, 0, 0, 0, 0,
260    0, 0, 0, 0, 0, 0, 0, 0,
261    0, 0, 0, 0, 0, 0, 0, 0,
262
263    0, 0, 0, 0, 0, 0, 0, 0,
264    0, 0, 0, 0, 0, 0, 0, 0,
265    0, 0, 0, 0, 0, 0, 0, 0,
266    0, 0, 0, 0, 0, 0, 0, 0,
267    0, 0, 0, 0, 0, 0, 0, 0,
268    0, 0, 0, 0, 0, 0, 0, 0,
269    0, 0, 0, 0, 0, 0, 0, 0,
270    0, 0, 0, 0, 0, 0, 0, 0
271};
272
273/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
274   This function is kept for backward compatibility with the old API. */
275Py_UNICODE
276PyUnicode_GetMax(void)
277{
278#ifdef Py_UNICODE_WIDE
279    return 0x10FFFF;
280#else
281    /* This is actually an illegal character, so it should
282       not be passed to unichr. */
283    return 0xFFFF;
284#endif
285}
286
287#ifdef Py_DEBUG
288static int
289_PyUnicode_CheckConsistency(void *op)
290{
291    PyASCIIObject *ascii;
292    unsigned int kind;
293
294    assert(PyUnicode_Check(op));
295
296    ascii = (PyASCIIObject *)op;
297    kind = ascii->state.kind;
298
299    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
300        assert(kind == PyUnicode_1BYTE_KIND);
301        assert(ascii->state.ready == 1);
302    }
303    else if (ascii->state.compact == 1) {
304        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
305        void *data;
306        assert(kind == PyUnicode_1BYTE_KIND
307               || kind == PyUnicode_2BYTE_KIND
308               || kind == PyUnicode_4BYTE_KIND);
309        assert(ascii->state.ascii == 0);
310        assert(ascii->state.ready == 1);
311        data = compact + 1;
312        assert (compact->utf8 != data);
313        if (
314#if SIZEOF_WCHAR_T == 2
315            kind == PyUnicode_2BYTE_KIND
316#else
317            kind == PyUnicode_4BYTE_KIND
318#endif
319           )
320            assert(ascii->wstr == data);
321        else
322            assert(ascii->wstr != data);
323    } else {
324        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
325        PyUnicodeObject *unicode = (PyUnicodeObject *)op;
326
327        if (kind == PyUnicode_WCHAR_KIND) {
328            assert(ascii->state.compact == 0);
329            assert(ascii->state.ascii == 0);
330            assert(ascii->state.ready == 0);
331            assert(ascii->wstr != NULL);
332            assert(unicode->data.any == NULL);
333            assert(compact->utf8 == NULL);
334            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
335        }
336        else {
337            assert(kind == PyUnicode_1BYTE_KIND
338                   || kind == PyUnicode_2BYTE_KIND
339                   || kind == PyUnicode_4BYTE_KIND);
340            assert(ascii->state.compact == 0);
341            assert(ascii->state.ready == 1);
342            assert(unicode->data.any != NULL);
343            if (ascii->state.ascii)
344                assert (compact->utf8 == unicode->data.any);
345            else
346                assert (compact->utf8 != unicode->data.any);
347            if (
348#if SIZEOF_WCHAR_T == 2
349                kind == PyUnicode_2BYTE_KIND
350#else
351                kind == PyUnicode_4BYTE_KIND
352#endif
353               )
354                assert(ascii->wstr == unicode->data.any);
355            else
356                assert(ascii->wstr != unicode->data.any);
357        }
358    }
359    return 1;
360}
361#endif
362
363/* --- Bloom Filters ----------------------------------------------------- */
364
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask; the low bits of each
   Unicode character (ch & (BLOOM_WIDTH - 1)) are used as the bit index. */
368
369/* the linebreak mask is set up by Unicode_Init below */
370
371#if LONG_BIT >= 128
372#define BLOOM_WIDTH 128
373#elif LONG_BIT >= 64
374#define BLOOM_WIDTH 64
375#elif LONG_BIT >= 32
376#define BLOOM_WIDTH 32
377#else
378#error "LONG_BIT is smaller than 32"
379#endif
380
381#define BLOOM_MASK unsigned long
382
383static BLOOM_MASK bloom_linebreak;
384
385#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
386#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
387
388#define BLOOM_LINEBREAK(ch)                                             \
389    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
390     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
391
392Py_LOCAL_INLINE(BLOOM_MASK)
393make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
394{
395    /* calculate simple bloom-style bitmask for a given unicode string */
396
397    BLOOM_MASK mask;
398    Py_ssize_t i;
399
400    mask = 0;
401    for (i = 0; i < len; i++)
402        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
403
404    return mask;
405}
406
407#define BLOOM_MEMBER(mask, chr, str) \
408    (BLOOM(mask, chr) \
409     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
410
411/* --- Unicode Object ----------------------------------------------------- */
412
413static PyObject *
414fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
415
416Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
417                                 Py_ssize_t size, Py_UCS4 ch,
418                                 int direction)
419{
420    /* like wcschr, but doesn't stop at NULL characters */
421    Py_ssize_t i;
422    if (direction == 1) {
423        for(i = 0; i < size; i++)
424            if (PyUnicode_READ(kind, s, i) == ch)
425                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
426    }
427    else {
428        for(i = size-1; i >= 0; i--)
429            if (PyUnicode_READ(kind, s, i) == ch)
430                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
431    }
432    return NULL;
433}
434
435static PyObject*
436resize_compact(PyObject *unicode, Py_ssize_t length)
437{
438    Py_ssize_t char_size;
439    Py_ssize_t struct_size;
440    Py_ssize_t new_size;
441    int share_wstr;
442
443    assert(PyUnicode_IS_READY(unicode));
444    char_size = PyUnicode_CHARACTER_SIZE(unicode);
445    if (PyUnicode_IS_COMPACT_ASCII(unicode))
446        struct_size = sizeof(PyASCIIObject);
447    else
448        struct_size = sizeof(PyCompactUnicodeObject);
449    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
450
451    _Py_DEC_REFTOTAL;
452    _Py_ForgetReference(unicode);
453
454    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
455        PyErr_NoMemory();
456        return NULL;
457    }
458    new_size = (struct_size + (length + 1) * char_size);
459
460    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
461    if (unicode == NULL) {
462        PyObject_Del(unicode);
463        PyErr_NoMemory();
464        return NULL;
465    }
466    _Py_NewReference(unicode);
467    _PyUnicode_LENGTH(unicode) = length;
468    if (share_wstr) {
469        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
470        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
471            _PyUnicode_WSTR_LENGTH(unicode) = length;
472    }
473    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
474                    length, 0);
475    return unicode;
476}
477
478static int
479resize_inplace(register PyUnicodeObject *unicode, Py_ssize_t length)
480{
481    void *oldstr;
482
483    assert(!PyUnicode_IS_COMPACT(unicode));
484
485    assert(Py_REFCNT(unicode) == 1);
486    _PyUnicode_DIRTY(unicode);
487
488    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
489    {
490        PyObject_DEL(_PyUnicode_UTF8(unicode));
491        _PyUnicode_UTF8(unicode) = NULL;
492    }
493
494    if (PyUnicode_IS_READY(unicode)) {
495        Py_ssize_t char_size;
496        Py_ssize_t new_size;
497        int share_wstr, share_utf8;
498        void *data;
499
500        data = _PyUnicode_DATA_ANY(unicode);
501        assert(data != NULL);
502        char_size = PyUnicode_CHARACTER_SIZE(unicode);
503        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
504        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
505
506        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
507            PyErr_NoMemory();
508            return -1;
509        }
510        new_size = (length + 1) * char_size;
511
512        data = (PyObject *)PyObject_REALLOC(data, new_size);
513        if (data == NULL) {
514            PyErr_NoMemory();
515            return -1;
516        }
517        _PyUnicode_DATA_ANY(unicode) = data;
518        if (share_wstr) {
519            _PyUnicode_WSTR(unicode) = data;
520            _PyUnicode_WSTR_LENGTH(unicode) = length;
521        }
522        if (share_utf8) {
523            _PyUnicode_UTF8(unicode) = data;
524            _PyUnicode_UTF8_LENGTH(unicode) = length;
525        }
526        _PyUnicode_LENGTH(unicode) = length;
527        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
528        if (share_wstr)
529            return 0;
530    }
531    if (_PyUnicode_WSTR(unicode) != NULL) {
532        assert(_PyUnicode_WSTR(unicode) != NULL);
533
534        oldstr = _PyUnicode_WSTR(unicode);
535        _PyUnicode_WSTR(unicode) = PyObject_REALLOC(_PyUnicode_WSTR(unicode),
536                                         sizeof(Py_UNICODE) * (length + 1));
537        if (!_PyUnicode_WSTR(unicode)) {
538            _PyUnicode_WSTR(unicode) = (Py_UNICODE *)oldstr;
539            PyErr_NoMemory();
540            return -1;
541        }
542        _PyUnicode_WSTR(unicode)[length] = 0;
543        _PyUnicode_WSTR_LENGTH(unicode) = length;
544    }
545    return 0;
546}
547
548static PyObject*
549resize_copy(PyObject *unicode, Py_ssize_t length)
550{
551    Py_ssize_t copy_length;
552    if (PyUnicode_IS_COMPACT(unicode)) {
553        PyObject *copy;
554        assert(PyUnicode_IS_READY(unicode));
555
556        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
557        if (copy == NULL)
558            return NULL;
559
560        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
561        if (PyUnicode_CopyCharacters(copy, 0,
562                                     unicode, 0,
563                                     copy_length) < 0)
564        {
565            Py_DECREF(copy);
566            return NULL;
567        }
568        return copy;
569    }
570    else {
571        PyUnicodeObject *w;
572        assert(_PyUnicode_WSTR(unicode) != NULL);
573        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
574        w = _PyUnicode_New(length);
575        if (w == NULL)
576            return NULL;
577        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
578        copy_length = Py_MIN(copy_length, length);
579        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
580                        copy_length);
581        return (PyObject*)w;
582    }
583}
584
/* We allocate one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier)
   relies on that.
588
589   XXX This allocator could further be enhanced by assuring that the
590   free list never reduces its size below 1.
591
592*/
593
594#ifdef Py_DEBUG
595int unicode_old_new_calls = 0;
596#endif
597
598static PyUnicodeObject *
599_PyUnicode_New(Py_ssize_t length)
600{
601    register PyUnicodeObject *unicode;
602    size_t new_size;
603
604    /* Optimization for empty strings */
605    if (length == 0 && unicode_empty != NULL) {
606        Py_INCREF(unicode_empty);
607        return (PyUnicodeObject*)unicode_empty;
608    }
609
610    /* Ensure we won't overflow the size. */
611    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
612        return (PyUnicodeObject *)PyErr_NoMemory();
613    }
614    if (length < 0) {
615        PyErr_SetString(PyExc_SystemError,
616                        "Negative size passed to _PyUnicode_New");
617        return NULL;
618    }
619
620#ifdef Py_DEBUG
621    ++unicode_old_new_calls;
622#endif
623
624    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
625    if (unicode == NULL)
626        return NULL;
627    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
628    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
629    if (!_PyUnicode_WSTR(unicode)) {
630        PyErr_NoMemory();
631        goto onError;
632    }
633
634    /* Initialize the first element to guard against cases where
635     * the caller fails before initializing str -- unicode_resize()
636     * reads str[0], and the Keep-Alive optimization can keep memory
637     * allocated for str alive across a call to unicode_dealloc(unicode).
638     * We don't want unicode_resize to read uninitialized memory in
639     * that case.
640     */
641    _PyUnicode_WSTR(unicode)[0] = 0;
642    _PyUnicode_WSTR(unicode)[length] = 0;
643    _PyUnicode_WSTR_LENGTH(unicode) = length;
644    _PyUnicode_HASH(unicode) = -1;
645    _PyUnicode_STATE(unicode).interned = 0;
646    _PyUnicode_STATE(unicode).kind = 0;
647    _PyUnicode_STATE(unicode).compact = 0;
648    _PyUnicode_STATE(unicode).ready = 0;
649    _PyUnicode_STATE(unicode).ascii = 0;
650    _PyUnicode_DATA_ANY(unicode) = NULL;
651    _PyUnicode_LENGTH(unicode) = 0;
652    _PyUnicode_UTF8(unicode) = NULL;
653    _PyUnicode_UTF8_LENGTH(unicode) = 0;
654    return unicode;
655
656  onError:
657    /* XXX UNREF/NEWREF interface should be more symmetrical */
658    _Py_DEC_REFTOTAL;
659    _Py_ForgetReference((PyObject *)unicode);
660    PyObject_Del(unicode);
661    return NULL;
662}
663
664static const char*
665unicode_kind_name(PyObject *unicode)
666{
667    /* don't check consistency: unicode_kind_name() is called from
668       _PyUnicode_Dump() */
669    if (!PyUnicode_IS_COMPACT(unicode))
670    {
671        if (!PyUnicode_IS_READY(unicode))
672            return "wstr";
673        switch(PyUnicode_KIND(unicode))
674        {
675        case PyUnicode_1BYTE_KIND:
676            if (PyUnicode_IS_ASCII(unicode))
677                return "legacy ascii";
678            else
679                return "legacy latin1";
680        case PyUnicode_2BYTE_KIND:
681            return "legacy UCS2";
682        case PyUnicode_4BYTE_KIND:
683            return "legacy UCS4";
684        default:
685            return "<legacy invalid kind>";
686        }
687    }
688    assert(PyUnicode_IS_READY(unicode));
689    switch(PyUnicode_KIND(unicode))
690    {
691    case PyUnicode_1BYTE_KIND:
692        if (PyUnicode_IS_ASCII(unicode))
693            return "ascii";
694        else
695            return "latin1";
696    case PyUnicode_2BYTE_KIND:
697        return "UCS2";
698    case PyUnicode_4BYTE_KIND:
699        return "UCS4";
700    default:
701        return "<invalid compact kind>";
702    }
703}
704
705#ifdef Py_DEBUG
706int unicode_new_new_calls = 0;
707
708/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
712
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
716void *_PyUnicode_data(void *unicode){
717    printf("obj %p\n", unicode);
718    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
719    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
720    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
721    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
722    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
723    return PyUnicode_DATA(unicode);
724}
725
726void
727_PyUnicode_Dump(PyObject *op)
728{
729    PyASCIIObject *ascii = (PyASCIIObject *)op;
730    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
731    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
732    void *data;
733    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
734    if (ascii->state.compact)
735        data = (compact + 1);
736    else
737        data = unicode->data.any;
738    if (ascii->wstr == data)
739        printf("shared ");
740    printf("wstr=%p", ascii->wstr);
741    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
742        printf(" (%zu), ", compact->wstr_length);
743        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
744            printf("shared ");
745        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
746    }
747    printf(", data=%p\n", data);
748}
749#endif
750
751PyObject *
752PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
753{
754    PyObject *obj;
755    PyCompactUnicodeObject *unicode;
756    void *data;
757    int kind_state;
758    int is_sharing, is_ascii;
759    Py_ssize_t char_size;
760    Py_ssize_t struct_size;
761
762    /* Optimization for empty strings */
763    if (size == 0 && unicode_empty != NULL) {
764        Py_INCREF(unicode_empty);
765        return unicode_empty;
766    }
767
768#ifdef Py_DEBUG
769    ++unicode_new_new_calls;
770#endif
771
772    is_ascii = 0;
773    is_sharing = 0;
774    struct_size = sizeof(PyCompactUnicodeObject);
775    if (maxchar < 128) {
776        kind_state = PyUnicode_1BYTE_KIND;
777        char_size = 1;
778        is_ascii = 1;
779        struct_size = sizeof(PyASCIIObject);
780    }
781    else if (maxchar < 256) {
782        kind_state = PyUnicode_1BYTE_KIND;
783        char_size = 1;
784    }
785    else if (maxchar < 65536) {
786        kind_state = PyUnicode_2BYTE_KIND;
787        char_size = 2;
788        if (sizeof(wchar_t) == 2)
789            is_sharing = 1;
790    }
791    else {
792        kind_state = PyUnicode_4BYTE_KIND;
793        char_size = 4;
794        if (sizeof(wchar_t) == 4)
795            is_sharing = 1;
796    }
797
798    /* Ensure we won't overflow the size. */
799    if (size < 0) {
800        PyErr_SetString(PyExc_SystemError,
801                        "Negative size passed to PyUnicode_New");
802        return NULL;
803    }
804    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
805        return PyErr_NoMemory();
806
807    /* Duplicated allocation code from _PyObject_New() instead of a call to
808     * PyObject_New() so we are able to allocate space for the object and
809     * it's data buffer.
810     */
811    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
812    if (obj == NULL)
813        return PyErr_NoMemory();
814    obj = PyObject_INIT(obj, &PyUnicode_Type);
815    if (obj == NULL)
816        return NULL;
817
818    unicode = (PyCompactUnicodeObject *)obj;
819    if (is_ascii)
820        data = ((PyASCIIObject*)obj) + 1;
821    else
822        data = unicode + 1;
823    _PyUnicode_LENGTH(unicode) = size;
824    _PyUnicode_HASH(unicode) = -1;
825    _PyUnicode_STATE(unicode).interned = 0;
826    _PyUnicode_STATE(unicode).kind = kind_state;
827    _PyUnicode_STATE(unicode).compact = 1;
828    _PyUnicode_STATE(unicode).ready = 1;
829    _PyUnicode_STATE(unicode).ascii = is_ascii;
830    if (is_ascii) {
831        ((char*)data)[size] = 0;
832        _PyUnicode_WSTR(unicode) = NULL;
833    }
834    else if (kind_state == PyUnicode_1BYTE_KIND) {
835        ((char*)data)[size] = 0;
836        _PyUnicode_WSTR(unicode) = NULL;
837        _PyUnicode_WSTR_LENGTH(unicode) = 0;
838        unicode->utf8 = NULL;
839        unicode->utf8_length = 0;
840        }
841    else {
842        unicode->utf8 = NULL;
843        unicode->utf8_length = 0;
844        if (kind_state == PyUnicode_2BYTE_KIND)
845            ((Py_UCS2*)data)[size] = 0;
846        else /* kind_state == PyUnicode_4BYTE_KIND */
847            ((Py_UCS4*)data)[size] = 0;
848        if (is_sharing) {
849            _PyUnicode_WSTR_LENGTH(unicode) = size;
850            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
851        }
852        else {
853            _PyUnicode_WSTR_LENGTH(unicode) = 0;
854            _PyUnicode_WSTR(unicode) = NULL;
855        }
856    }
857    return obj;
858}
859
860#if SIZEOF_WCHAR_T == 2
861/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
862   will decode surrogate pairs, the other conversions are implemented as macros
863   for efficency.
864
865   This function assumes that unicode can hold one more code point than wstr
866   characters for a terminating null character. */
867static void
868unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
869                              PyUnicodeObject *unicode)
870{
871    const wchar_t *iter;
872    Py_UCS4 *ucs4_out;
873
874    assert(unicode != NULL);
875    assert(_PyUnicode_CHECK(unicode));
876    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
877    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
878
879    for (iter = begin; iter < end; ) {
880        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
881                           _PyUnicode_GET_LENGTH(unicode)));
882        if (*iter >= 0xD800 && *iter <= 0xDBFF
883            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
884        {
885            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
886            iter += 2;
887        }
888        else {
889            *ucs4_out++ = *iter;
890            iter++;
891        }
892    }
893    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
894                        _PyUnicode_GET_LENGTH(unicode)));
895
896}
897#endif
898
899static int
900_PyUnicode_Dirty(PyObject *unicode)
901{
902    assert(_PyUnicode_CHECK(unicode));
903    if (Py_REFCNT(unicode) != 1) {
904        PyErr_SetString(PyExc_SystemError,
905                        "Cannot modify a string having more than 1 reference");
906        return -1;
907    }
908    _PyUnicode_DIRTY(unicode);
909    return 0;
910}
911
912Py_ssize_t
913PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
914                         PyObject *from, Py_ssize_t from_start,
915                         Py_ssize_t how_many)
916{
917    unsigned int from_kind, to_kind;
918    void *from_data, *to_data;
919
920    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
921        PyErr_BadInternalCall();
922        return -1;
923    }
924
925    if (PyUnicode_READY(from))
926        return -1;
927    if (PyUnicode_READY(to))
928        return -1;
929
930    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
931    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
932        PyErr_Format(PyExc_SystemError,
933                     "Cannot write %zi characters at %zi "
934                     "in a string of %zi characters",
935                     how_many, to_start, PyUnicode_GET_LENGTH(to));
936        return -1;
937    }
938    if (how_many == 0)
939        return 0;
940
941    if (_PyUnicode_Dirty(to))
942        return -1;
943
944    from_kind = PyUnicode_KIND(from);
945    from_data = PyUnicode_DATA(from);
946    to_kind = PyUnicode_KIND(to);
947    to_data = PyUnicode_DATA(to);
948
949    if (from_kind == to_kind
950        /* deny latin1 => ascii */
951        && PyUnicode_MAX_CHAR_VALUE(to) >= PyUnicode_MAX_CHAR_VALUE(from))
952    {
953        Py_MEMCPY((char*)to_data
954                      + PyUnicode_KIND_SIZE(to_kind, to_start),
955                  (char*)from_data
956                      + PyUnicode_KIND_SIZE(from_kind, from_start),
957                  PyUnicode_KIND_SIZE(to_kind, how_many));
958    }
959    else if (from_kind == PyUnicode_1BYTE_KIND
960             && to_kind == PyUnicode_2BYTE_KIND)
961    {
962        _PyUnicode_CONVERT_BYTES(
963            Py_UCS1, Py_UCS2,
964            PyUnicode_1BYTE_DATA(from) + from_start,
965            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
966            PyUnicode_2BYTE_DATA(to) + to_start
967            );
968    }
969    else if (from_kind == PyUnicode_1BYTE_KIND
970             && to_kind == PyUnicode_4BYTE_KIND)
971    {
972        _PyUnicode_CONVERT_BYTES(
973            Py_UCS1, Py_UCS4,
974            PyUnicode_1BYTE_DATA(from) + from_start,
975            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
976            PyUnicode_4BYTE_DATA(to) + to_start
977            );
978    }
979    else if (from_kind == PyUnicode_2BYTE_KIND
980             && to_kind == PyUnicode_4BYTE_KIND)
981    {
982        _PyUnicode_CONVERT_BYTES(
983            Py_UCS2, Py_UCS4,
984            PyUnicode_2BYTE_DATA(from) + from_start,
985            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
986            PyUnicode_4BYTE_DATA(to) + to_start
987            );
988    }
989    else {
990        int invalid_kinds;
991
992        /* check if max_char(from substring) <= max_char(to) */
993        if (from_kind > to_kind
994                /* latin1 => ascii */
995            || (PyUnicode_IS_ASCII(to)
996                && to_kind == PyUnicode_1BYTE_KIND
997                && !PyUnicode_IS_ASCII(from)))
998        {
999            /* slow path to check for character overflow */
1000            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1001            Py_UCS4 ch, maxchar;
1002            Py_ssize_t i;
1003
1004            maxchar = 0;
1005            invalid_kinds = 0;
1006            for (i=0; i < how_many; i++) {
1007                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1008                if (ch > maxchar) {
1009                    maxchar = ch;
1010                    if (maxchar > to_maxchar) {
1011                        invalid_kinds = 1;
1012                        break;
1013                    }
1014                }
1015                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1016            }
1017        }
1018        else
1019            invalid_kinds = 1;
1020        if (invalid_kinds) {
1021            PyErr_Format(PyExc_SystemError,
1022                         "Cannot copy %s characters "
1023                         "into a string of %s characters",
1024                         unicode_kind_name(from),
1025                         unicode_kind_name(to));
1026            return -1;
1027        }
1028    }
1029    return how_many;
1030}
1031
1032/* Find the maximum code point and count the number of surrogate pairs so a
1033   correct string length can be computed before converting a string to UCS4.
1034   This function counts single surrogates as a character and not as a pair.
1035
1036   Return 0 on success, or -1 on error. */
1037static int
1038find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1039                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1040{
1041    const wchar_t *iter;
1042
1043    assert(num_surrogates != NULL && maxchar != NULL);
1044    if (num_surrogates == NULL || maxchar == NULL) {
1045        PyErr_SetString(PyExc_SystemError,
1046                        "unexpected NULL arguments to "
1047                        "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1048        return -1;
1049    }
1050
1051    *num_surrogates = 0;
1052    *maxchar = 0;
1053
1054    for (iter = begin; iter < end; ) {
1055        if (*iter > *maxchar)
1056            *maxchar = *iter;
1057#if SIZEOF_WCHAR_T == 2
1058        if (*iter >= 0xD800 && *iter <= 0xDBFF
1059            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1060        {
1061            Py_UCS4 surrogate_val;
1062            surrogate_val = (((iter[0] & 0x3FF)<<10)
1063                             | (iter[1] & 0x3FF)) + 0x10000;
1064            ++(*num_surrogates);
1065            if (surrogate_val > *maxchar)
1066                *maxchar = surrogate_val;
1067            iter += 2;
1068        }
1069        else
1070            iter++;
1071#else
1072        iter++;
1073#endif
1074    }
1075    return 0;
1076}
1077
1078#ifdef Py_DEBUG
1079int unicode_ready_calls = 0;
1080#endif
1081
1082static int
1083unicode_ready(PyObject **p_obj, int replace)
1084{
1085    PyUnicodeObject *unicode;
1086    wchar_t *end;
1087    Py_UCS4 maxchar = 0;
1088    Py_ssize_t num_surrogates;
1089#if SIZEOF_WCHAR_T == 2
1090    Py_ssize_t length_wo_surrogates;
1091#endif
1092
1093    assert(p_obj != NULL);
1094    unicode = (PyUnicodeObject *)*p_obj;
1095
1096    /* _PyUnicode_Ready() is only intented for old-style API usage where
1097       strings were created using _PyObject_New() and where no canonical
1098       representation (the str field) has been set yet aka strings
1099       which are not yet ready. */
1100    assert(_PyUnicode_CHECK(unicode));
1101    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1102    assert(_PyUnicode_WSTR(unicode) != NULL);
1103    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1104    assert(_PyUnicode_UTF8(unicode) == NULL);
1105    /* Actually, it should neither be interned nor be anything else: */
1106    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1107
1108#ifdef Py_DEBUG
1109    ++unicode_ready_calls;
1110#endif
1111
1112#ifdef Py_DEBUG
1113    assert(!replace || Py_REFCNT(unicode) == 1);
1114#else
1115    if (replace && Py_REFCNT(unicode) != 1)
1116        replace = 0;
1117#endif
1118    if (replace) {
1119        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
1120        wchar_t *wstr = _PyUnicode_WSTR(unicode);
1121        /* Optimization for empty strings */
1122        if (len == 0) {
1123            Py_INCREF(unicode_empty);
1124            Py_DECREF(*p_obj);
1125            *p_obj = unicode_empty;
1126            return 0;
1127        }
1128        if (len == 1 && wstr[0] < 256) {
1129            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
1130            if (latin1_char == NULL)
1131                return -1;
1132            Py_DECREF(*p_obj);
1133            *p_obj = latin1_char;
1134            return 0;
1135        }
1136    }
1137
1138    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1139    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1140                                &maxchar, &num_surrogates) == -1)
1141        return -1;
1142
1143    if (maxchar < 256) {
1144        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1145        if (!_PyUnicode_DATA_ANY(unicode)) {
1146            PyErr_NoMemory();
1147            return -1;
1148        }
1149        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1150                                _PyUnicode_WSTR(unicode), end,
1151                                PyUnicode_1BYTE_DATA(unicode));
1152        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1153        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1154        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1155        if (maxchar < 128) {
1156            _PyUnicode_STATE(unicode).ascii = 1;
1157            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1158            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1159        }
1160        else {
1161            _PyUnicode_STATE(unicode).ascii = 0;
1162            _PyUnicode_UTF8(unicode) = NULL;
1163            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1164        }
1165        PyObject_FREE(_PyUnicode_WSTR(unicode));
1166        _PyUnicode_WSTR(unicode) = NULL;
1167        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1168    }
1169    /* In this case we might have to convert down from 4-byte native
1170       wchar_t to 2-byte unicode. */
1171    else if (maxchar < 65536) {
1172        assert(num_surrogates == 0 &&
1173               "FindMaxCharAndNumSurrogatePairs() messed up");
1174
1175#if SIZEOF_WCHAR_T == 2
1176        /* We can share representations and are done. */
1177        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1178        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1179        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1180        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1181        _PyUnicode_UTF8(unicode) = NULL;
1182        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1183#else
1184        /* sizeof(wchar_t) == 4 */
1185        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1186            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1187        if (!_PyUnicode_DATA_ANY(unicode)) {
1188            PyErr_NoMemory();
1189            return -1;
1190        }
1191        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1192                                _PyUnicode_WSTR(unicode), end,
1193                                PyUnicode_2BYTE_DATA(unicode));
1194        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1195        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1196        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1197        _PyUnicode_UTF8(unicode) = NULL;
1198        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1199        PyObject_FREE(_PyUnicode_WSTR(unicode));
1200        _PyUnicode_WSTR(unicode) = NULL;
1201        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1202#endif
1203    }
1204    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1205    else {
1206#if SIZEOF_WCHAR_T == 2
1207        /* in case the native representation is 2-bytes, we need to allocate a
1208           new normalized 4-byte version. */
1209        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1210        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1211        if (!_PyUnicode_DATA_ANY(unicode)) {
1212            PyErr_NoMemory();
1213            return -1;
1214        }
1215        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1216        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1217        _PyUnicode_UTF8(unicode) = NULL;
1218        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1219        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1220        _PyUnicode_STATE(unicode).ready = 1;
1221        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1222        PyObject_FREE(_PyUnicode_WSTR(unicode));
1223        _PyUnicode_WSTR(unicode) = NULL;
1224        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1225#else
1226        assert(num_surrogates == 0);
1227
1228        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1229        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1230        _PyUnicode_UTF8(unicode) = NULL;
1231        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1232        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1233#endif
1234        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1235    }
1236    _PyUnicode_STATE(unicode).ready = 1;
1237    return 0;
1238}
1239
1240int
1241_PyUnicode_ReadyReplace(PyObject **op)
1242{
1243    return unicode_ready(op, 1);
1244}
1245
1246int
1247_PyUnicode_Ready(PyObject *op)
1248{
1249    return unicode_ready(&op, 0);
1250}
1251
1252static void
1253unicode_dealloc(register PyUnicodeObject *unicode)
1254{
1255    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1256    case SSTATE_NOT_INTERNED:
1257        break;
1258
1259    case SSTATE_INTERNED_MORTAL:
1260        /* revive dead object temporarily for DelItem */
1261        Py_REFCNT(unicode) = 3;
1262        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1263            Py_FatalError(
1264                "deletion of interned string failed");
1265        break;
1266
1267    case SSTATE_INTERNED_IMMORTAL:
1268        Py_FatalError("Immortal interned string died.");
1269
1270    default:
1271        Py_FatalError("Inconsistent interned string state.");
1272    }
1273
1274    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1275        PyObject_DEL(_PyUnicode_WSTR(unicode));
1276    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1277        PyObject_DEL(_PyUnicode_UTF8(unicode));
1278
1279    if (PyUnicode_IS_COMPACT(unicode)) {
1280        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1281    }
1282    else {
1283        if (_PyUnicode_DATA_ANY(unicode))
1284            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1285        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1286    }
1287}
1288
1289static int
1290unicode_resizable(PyObject *unicode)
1291{
1292    if (Py_REFCNT(unicode) != 1)
1293        return 0;
1294    if (PyUnicode_CHECK_INTERNED(unicode))
1295        return 0;
1296    assert (unicode != unicode_empty);
1297#ifdef Py_DEBUG
1298    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1299        && PyUnicode_GET_LENGTH(unicode) == 1)
1300    {
1301        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1302        if (ch < 256 && unicode_latin1[ch] == unicode)
1303            return 0;
1304    }
1305#endif
1306    return 1;
1307}
1308
1309static int
1310unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1311{
1312    PyObject *unicode;
1313    Py_ssize_t old_length;
1314
1315    assert(p_unicode != NULL);
1316    unicode = *p_unicode;
1317
1318    assert(unicode != NULL);
1319    assert(PyUnicode_Check(unicode));
1320    assert(0 <= length);
1321
1322    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1323        old_length = PyUnicode_WSTR_LENGTH(unicode);
1324    else
1325        old_length = PyUnicode_GET_LENGTH(unicode);
1326    if (old_length == length)
1327        return 0;
1328
1329    if (!unicode_resizable(unicode)) {
1330        PyObject *copy = resize_copy(unicode, length);
1331        if (copy == NULL)
1332            return -1;
1333        Py_DECREF(*p_unicode);
1334        *p_unicode = copy;
1335        return 0;
1336    }
1337
1338    if (PyUnicode_IS_COMPACT(unicode)) {
1339        *p_unicode = resize_compact(unicode, length);
1340        if (*p_unicode == NULL)
1341            return -1;
1342        return 0;
1343    } else
1344        return resize_inplace((PyUnicodeObject*)unicode, length);
1345}
1346
1347int
1348PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1349{
1350    PyObject *unicode;
1351    if (p_unicode == NULL) {
1352        PyErr_BadInternalCall();
1353        return -1;
1354    }
1355    unicode = *p_unicode;
1356    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1357        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1358    {
1359        PyErr_BadInternalCall();
1360        return -1;
1361    }
1362    return unicode_resize(p_unicode, length);
1363}
1364
1365static PyObject*
1366get_latin1_char(unsigned char ch)
1367{
1368    PyObject *unicode = unicode_latin1[ch];
1369    if (!unicode) {
1370        unicode = PyUnicode_New(1, ch);
1371        if (!unicode)
1372            return NULL;
1373        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1374        unicode_latin1[ch] = unicode;
1375    }
1376    Py_INCREF(unicode);
1377    return unicode;
1378}
1379
1380PyObject *
1381PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1382{
1383    PyUnicodeObject *unicode;
1384    Py_UCS4 maxchar = 0;
1385    Py_ssize_t num_surrogates;
1386
1387    if (u == NULL)
1388        return (PyObject*)_PyUnicode_New(size);
1389
1390    /* If the Unicode data is known at construction time, we can apply
1391       some optimizations which share commonly used objects. */
1392
1393    /* Optimization for empty strings */
1394    if (size == 0 && unicode_empty != NULL) {
1395        Py_INCREF(unicode_empty);
1396        return unicode_empty;
1397    }
1398
1399    /* Single character Unicode objects in the Latin-1 range are
1400       shared when using this constructor */
1401    if (size == 1 && *u < 256)
1402        return get_latin1_char((unsigned char)*u);
1403
1404    /* If not empty and not single character, copy the Unicode data
1405       into the new object */
1406    if (find_maxchar_surrogates(u, u + size,
1407                                &maxchar, &num_surrogates) == -1)
1408        return NULL;
1409
1410    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1411                                                maxchar);
1412    if (!unicode)
1413        return NULL;
1414
1415    switch (PyUnicode_KIND(unicode)) {
1416    case PyUnicode_1BYTE_KIND:
1417        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1418                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1419        break;
1420    case PyUnicode_2BYTE_KIND:
1421#if Py_UNICODE_SIZE == 2
1422        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1423#else
1424        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1425                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1426#endif
1427        break;
1428    case PyUnicode_4BYTE_KIND:
1429#if SIZEOF_WCHAR_T == 2
1430        /* This is the only case which has to process surrogates, thus
1431           a simple copy loop is not enough and we need a function. */
1432        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1433#else
1434        assert(num_surrogates == 0);
1435        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1436#endif
1437        break;
1438    default:
1439        assert(0 && "Impossible state");
1440    }
1441
1442    return (PyObject *)unicode;
1443}
1444
1445PyObject *
1446PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1447{
1448    PyUnicodeObject *unicode;
1449
1450    if (size < 0) {
1451        PyErr_SetString(PyExc_SystemError,
1452                        "Negative size passed to PyUnicode_FromStringAndSize");
1453        return NULL;
1454    }
1455
1456    /* If the Unicode data is known at construction time, we can apply
1457       some optimizations which share commonly used objects.
1458       Also, this means the input must be UTF-8, so fall back to the
1459       UTF-8 decoder at the end. */
1460    if (u != NULL) {
1461
1462        /* Optimization for empty strings */
1463        if (size == 0 && unicode_empty != NULL) {
1464            Py_INCREF(unicode_empty);
1465            return unicode_empty;
1466        }
1467
1468        /* Single characters are shared when using this constructor.
1469           Restrict to ASCII, since the input must be UTF-8. */
1470        if (size == 1 && Py_CHARMASK(*u) < 128)
1471            return get_latin1_char(Py_CHARMASK(*u));
1472
1473        return PyUnicode_DecodeUTF8(u, size, NULL);
1474    }
1475
1476    unicode = _PyUnicode_New(size);
1477    if (!unicode)
1478        return NULL;
1479
1480    return (PyObject *)unicode;
1481}
1482
1483PyObject *
1484PyUnicode_FromString(const char *u)
1485{
1486    size_t size = strlen(u);
1487    if (size > PY_SSIZE_T_MAX) {
1488        PyErr_SetString(PyExc_OverflowError, "input too long");
1489        return NULL;
1490    }
1491
1492    return PyUnicode_FromStringAndSize(u, size);
1493}
1494
1495static PyObject*
1496_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1497{
1498    PyObject *res;
1499    unsigned char max = 127;
1500    Py_ssize_t i;
1501    for (i = 0; i < size; i++) {
1502        if (u[i] & 0x80) {
1503            max = 255;
1504            break;
1505        }
1506    }
1507    res = PyUnicode_New(size, max);
1508    if (!res)
1509        return NULL;
1510    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1511    return res;
1512}
1513
1514static PyObject*
1515_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1516{
1517    PyObject *res;
1518    Py_UCS2 max = 0;
1519    Py_ssize_t i;
1520    for (i = 0; i < size; i++)
1521        if (u[i] > max)
1522            max = u[i];
1523    res = PyUnicode_New(size, max);
1524    if (!res)
1525        return NULL;
1526    if (max >= 256)
1527        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1528    else
1529        for (i = 0; i < size; i++)
1530            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1531    return res;
1532}
1533
1534static PyObject*
1535_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1536{
1537    PyObject *res;
1538    Py_UCS4 max = 0;
1539    Py_ssize_t i;
1540    for (i = 0; i < size; i++)
1541        if (u[i] > max)
1542            max = u[i];
1543    res = PyUnicode_New(size, max);
1544    if (!res)
1545        return NULL;
1546    if (max >= 0x10000)
1547        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1548    else {
1549        int kind = PyUnicode_KIND(res);
1550        void *data = PyUnicode_DATA(res);
1551        for (i = 0; i < size; i++)
1552            PyUnicode_WRITE(kind, data, i, u[i]);
1553    }
1554    return res;
1555}
1556
1557PyObject*
1558PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1559{
1560    switch(kind) {
1561    case PyUnicode_1BYTE_KIND:
1562        return _PyUnicode_FromUCS1(buffer, size);
1563    case PyUnicode_2BYTE_KIND:
1564        return _PyUnicode_FromUCS2(buffer, size);
1565    case PyUnicode_4BYTE_KIND:
1566        return _PyUnicode_FromUCS4(buffer, size);
1567    }
1568    PyErr_SetString(PyExc_SystemError, "invalid kind");
1569    return NULL;
1570}
1571
1572PyObject*
1573PyUnicode_Copy(PyObject *unicode)
1574{
1575    Py_ssize_t size;
1576    PyObject *copy;
1577    void *data;
1578
1579    if (!PyUnicode_Check(unicode)) {
1580        PyErr_BadInternalCall();
1581        return NULL;
1582    }
1583    if (PyUnicode_READY(unicode))
1584        return NULL;
1585
1586    size = PyUnicode_GET_LENGTH(unicode);
1587    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1588    if (!copy)
1589        return NULL;
1590    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1591
1592    data = PyUnicode_DATA(unicode);
1593    switch (PyUnicode_KIND(unicode))
1594    {
1595    case PyUnicode_1BYTE_KIND:
1596        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1597        break;
1598    case PyUnicode_2BYTE_KIND:
1599        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1600        break;
1601    case PyUnicode_4BYTE_KIND:
1602        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1603        break;
1604    default:
1605        assert(0);
1606        break;
1607    }
1608    return copy;
1609}
1610
1611
1612/* Widen Unicode objects to larger buffers. Don't write terminating null
1613   character. Return NULL on error. */
1614
1615void*
1616_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1617{
1618    Py_ssize_t len;
1619    void *result;
1620    unsigned int skind;
1621
1622    if (PyUnicode_READY(s))
1623        return NULL;
1624
1625    len = PyUnicode_GET_LENGTH(s);
1626    skind = PyUnicode_KIND(s);
1627    if (skind >= kind) {
1628        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1629        return NULL;
1630    }
1631    switch(kind) {
1632    case PyUnicode_2BYTE_KIND:
1633        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1634        if (!result)
1635            return PyErr_NoMemory();
1636        assert(skind == PyUnicode_1BYTE_KIND);
1637        _PyUnicode_CONVERT_BYTES(
1638            Py_UCS1, Py_UCS2,
1639            PyUnicode_1BYTE_DATA(s),
1640            PyUnicode_1BYTE_DATA(s) + len,
1641            result);
1642        return result;
1643    case PyUnicode_4BYTE_KIND:
1644        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1645        if (!result)
1646            return PyErr_NoMemory();
1647        if (skind == PyUnicode_2BYTE_KIND) {
1648            _PyUnicode_CONVERT_BYTES(
1649                Py_UCS2, Py_UCS4,
1650                PyUnicode_2BYTE_DATA(s),
1651                PyUnicode_2BYTE_DATA(s) + len,
1652                result);
1653        }
1654        else {
1655            assert(skind == PyUnicode_1BYTE_KIND);
1656            _PyUnicode_CONVERT_BYTES(
1657                Py_UCS1, Py_UCS4,
1658                PyUnicode_1BYTE_DATA(s),
1659                PyUnicode_1BYTE_DATA(s) + len,
1660                result);
1661        }
1662        return result;
1663    default:
1664        break;
1665    }
1666    PyErr_SetString(PyExc_SystemError, "invalid kind");
1667    return NULL;
1668}
1669
1670static Py_UCS4*
1671as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1672        int copy_null)
1673{
1674    int kind;
1675    void *data;
1676    Py_ssize_t len, targetlen;
1677    if (PyUnicode_READY(string) == -1)
1678        return NULL;
1679    kind = PyUnicode_KIND(string);
1680    data = PyUnicode_DATA(string);
1681    len = PyUnicode_GET_LENGTH(string);
1682    targetlen = len;
1683    if (copy_null)
1684        targetlen++;
1685    if (!target) {
1686        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1687            PyErr_NoMemory();
1688            return NULL;
1689        }
1690        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1691        if (!target) {
1692            PyErr_NoMemory();
1693            return NULL;
1694        }
1695    }
1696    else {
1697        if (targetsize < targetlen) {
1698            PyErr_Format(PyExc_SystemError,
1699                         "string is longer than the buffer");
1700            if (copy_null && 0 < targetsize)
1701                target[0] = 0;
1702            return NULL;
1703        }
1704    }
1705    if (kind != PyUnicode_4BYTE_KIND) {
1706        Py_ssize_t i;
1707        for (i = 0; i < len; i++)
1708            target[i] = PyUnicode_READ(kind, data, i);
1709    }
1710    else
1711        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1712    if (copy_null)
1713        target[len] = 0;
1714    return target;
1715}
1716
1717Py_UCS4*
1718PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1719                 int copy_null)
1720{
1721    if (target == NULL || targetsize < 1) {
1722        PyErr_BadInternalCall();
1723        return NULL;
1724    }
1725    return as_ucs4(string, target, targetsize, copy_null);
1726}
1727
1728Py_UCS4*
1729PyUnicode_AsUCS4Copy(PyObject *string)
1730{
1731    return as_ucs4(string, NULL, 0, 1);
1732}
1733
1734#ifdef HAVE_WCHAR_H
1735
1736PyObject *
1737PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1738{
1739    if (w == NULL) {
1740        if (size == 0)
1741            return PyUnicode_New(0, 0);
1742        PyErr_BadInternalCall();
1743        return NULL;
1744    }
1745
1746    if (size == -1) {
1747        size = wcslen(w);
1748    }
1749
1750    return PyUnicode_FromUnicode(w, size);
1751}
1752
1753#endif /* HAVE_WCHAR_H */
1754
1755static void
1756makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1757        int zeropad, int width, int precision, char c)
1758{
1759    *fmt++ = '%';
1760    if (width) {
1761        if (zeropad)
1762            *fmt++ = '0';
1763        fmt += sprintf(fmt, "%d", width);
1764    }
1765    if (precision)
1766        fmt += sprintf(fmt, ".%d", precision);
1767    if (longflag)
1768        *fmt++ = 'l';
1769    else if (longlongflag) {
1770        /* longlongflag should only ever be nonzero on machines with
1771           HAVE_LONG_LONG defined */
1772#ifdef HAVE_LONG_LONG
1773        char *f = PY_FORMAT_LONG_LONG;
1774        while (*f)
1775            *fmt++ = *f++;
1776#else
1777        /* we shouldn't ever get here */
1778        assert(0);
1779        *fmt++ = 'l';
1780#endif
1781    }
1782    else if (size_tflag) {
1783        char *f = PY_FORMAT_SIZE_T;
1784        while (*f)
1785            *fmt++ = *f++;
1786    }
1787    *fmt++ = c;
1788    *fmt = '\0';
1789}
1790
/* Helper for PyUnicode_FromFormatV(): parse the part of a format
   specification that follows the '%' pointed to by `f`.

   The optional width, the optional ".precision" and the optional length
   modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z" -- each only
   recognized when followed by 'd', 'u' or 'i') are consumed.  Each result
   is stored through the corresponding pointer unless that pointer is NULL;
   width and precision are 0 when absent.

   Return a pointer to the character where parsing stopped, normally the
   conversion character.  For a truncated format the pointer is moved back
   by one so that it does not rest on the terminating NUL. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* skip the '%' */
    f++;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;

    switch (*f) {
    case 'l':
        /* Handle %ld, %lu, %lld and %llu. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
        break;
    case 'z':
        /* handle the size_t flag. */
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1855
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In integer arithmetic (SIZEOF_LONG_LONG*53-1) / 22 equals
   ceil(SIZEOF_LONG_LONG*53/22) - 1, which is why the constant term
   below is 2 rather than 1. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1863
1864PyObject *
1865PyUnicode_FromFormatV(const char *format, va_list vargs)
1866{
1867    va_list count;
1868    Py_ssize_t callcount = 0;
1869    PyObject **callresults = NULL;
1870    PyObject **callresult = NULL;
1871    Py_ssize_t n = 0;
1872    int width = 0;
1873    int precision = 0;
1874    int zeropad;
1875    const char* f;
1876    PyUnicodeObject *string;
1877    /* used by sprintf */
1878    char fmt[61]; /* should be enough for %0width.precisionlld */
1879    Py_UCS4 maxchar = 127; /* result is ASCII by default */
1880    Py_UCS4 argmaxchar;
1881    Py_ssize_t numbersize = 0;
1882    char *numberresults = NULL;
1883    char *numberresult = NULL;
1884    Py_ssize_t i;
1885    int kind;
1886    void *data;
1887
1888    Py_VA_COPY(count, vargs);
1889    /* step 1: count the number of %S/%R/%A/%s format specifications
1890     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1891     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
1892     * result in an array)
1893     * also esimate a upper bound for all the number formats in the string,
1894     * numbers will be formated in step 3 and be keept in a '\0'-separated
1895     * buffer before putting everything together. */
1896    for (f = format; *f; f++) {
1897        if (*f == '%') {
1898            int longlongflag;
1899            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1900            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1901            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1902                ++callcount;
1903
1904            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
1905#ifdef HAVE_LONG_LONG
1906                if (longlongflag) {
1907                    if (width < MAX_LONG_LONG_CHARS)
1908                        width = MAX_LONG_LONG_CHARS;
1909                }
1910                else
1911#endif
1912                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1913                       including sign.  Decimal takes the most space.  This
1914                       isn't enough for octal.  If a width is specified we
1915                       need more (which we allocate later). */
1916                    if (width < MAX_LONG_CHARS)
1917                        width = MAX_LONG_CHARS;
1918
1919                /* account for the size + '\0' to separate numbers
1920                   inside of the numberresults buffer */
1921                numbersize += (width + 1);
1922            }
1923        }
1924        else if ((unsigned char)*f > 127) {
1925            PyErr_Format(PyExc_ValueError,
1926                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1927                "string, got a non-ASCII byte: 0x%02x",
1928                (unsigned char)*f);
1929            return NULL;
1930        }
1931    }
1932    /* step 2: allocate memory for the results of
1933     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1934    if (callcount) {
1935        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1936        if (!callresults) {
1937            PyErr_NoMemory();
1938            return NULL;
1939        }
1940        callresult = callresults;
1941    }
1942    /* step 2.5: allocate memory for the results of formating numbers */
1943    if (numbersize) {
1944        numberresults = PyObject_Malloc(numbersize);
1945        if (!numberresults) {
1946            PyErr_NoMemory();
1947            goto fail;
1948        }
1949        numberresult = numberresults;
1950    }
1951
1952    /* step 3: format numbers and figure out how large a buffer we need */
1953    for (f = format; *f; f++) {
1954        if (*f == '%') {
1955            const char* p;
1956            int longflag;
1957            int longlongflag;
1958            int size_tflag;
1959            int numprinted;
1960
1961            p = f;
1962            zeropad = (f[1] == '0');
1963            f = parse_format_flags(f, &width, &precision,
1964                                   &longflag, &longlongflag, &size_tflag);
1965            switch (*f) {
1966            case 'c':
1967            {
1968                Py_UCS4 ordinal = va_arg(count, int);
1969                maxchar = Py_MAX(maxchar, ordinal);
1970                n++;
1971                break;
1972            }
1973            case '%':
1974                n++;
1975                break;
1976            case 'i':
1977            case 'd':
1978                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1979                        width, precision, *f);
1980                if (longflag)
1981                    numprinted = sprintf(numberresult, fmt,
1982                                         va_arg(count, long));
1983#ifdef HAVE_LONG_LONG
1984                else if (longlongflag)
1985                    numprinted = sprintf(numberresult, fmt,
1986                                         va_arg(count, PY_LONG_LONG));
1987#endif
1988                else if (size_tflag)
1989                    numprinted = sprintf(numberresult, fmt,
1990                                         va_arg(count, Py_ssize_t));
1991                else
1992                    numprinted = sprintf(numberresult, fmt,
1993                                         va_arg(count, int));
1994                n += numprinted;
1995                /* advance by +1 to skip over the '\0' */
1996                numberresult += (numprinted + 1);
1997                assert(*(numberresult - 1) == '\0');
1998                assert(*(numberresult - 2) != '\0');
1999                assert(numprinted >= 0);
2000                assert(numberresult <= numberresults + numbersize);
2001                break;
2002            case 'u':
2003                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2004                        width, precision, 'u');
2005                if (longflag)
2006                    numprinted = sprintf(numberresult, fmt,
2007                                         va_arg(count, unsigned long));
2008#ifdef HAVE_LONG_LONG
2009                else if (longlongflag)
2010                    numprinted = sprintf(numberresult, fmt,
2011                                         va_arg(count, unsigned PY_LONG_LONG));
2012#endif
2013                else if (size_tflag)
2014                    numprinted = sprintf(numberresult, fmt,
2015                                         va_arg(count, size_t));
2016                else
2017                    numprinted = sprintf(numberresult, fmt,
2018                                         va_arg(count, unsigned int));
2019                n += numprinted;
2020                numberresult += (numprinted + 1);
2021                assert(*(numberresult - 1) == '\0');
2022                assert(*(numberresult - 2) != '\0');
2023                assert(numprinted >= 0);
2024                assert(numberresult <= numberresults + numbersize);
2025                break;
2026            case 'x':
2027                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2028                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2029                n += numprinted;
2030                numberresult += (numprinted + 1);
2031                assert(*(numberresult - 1) == '\0');
2032                assert(*(numberresult - 2) != '\0');
2033                assert(numprinted >= 0);
2034                assert(numberresult <= numberresults + numbersize);
2035                break;
2036            case 'p':
2037                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2038                /* %p is ill-defined:  ensure leading 0x. */
2039                if (numberresult[1] == 'X')
2040                    numberresult[1] = 'x';
2041                else if (numberresult[1] != 'x') {
2042                    memmove(numberresult + 2, numberresult,
2043                            strlen(numberresult) + 1);
2044                    numberresult[0] = '0';
2045                    numberresult[1] = 'x';
2046                    numprinted += 2;
2047                }
2048                n += numprinted;
2049                numberresult += (numprinted + 1);
2050                assert(*(numberresult - 1) == '\0');
2051                assert(*(numberresult - 2) != '\0');
2052                assert(numprinted >= 0);
2053                assert(numberresult <= numberresults + numbersize);
2054                break;
2055            case 's':
2056            {
2057                /* UTF-8 */
2058                const char *s = va_arg(count, const char*);
2059                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2060                if (!str)
2061                    goto fail;
2062                /* since PyUnicode_DecodeUTF8 returns already flexible
2063                   unicode objects, there is no need to call ready on them */
2064                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2065                maxchar = Py_MAX(maxchar, argmaxchar);
2066                n += PyUnicode_GET_LENGTH(str);
2067                /* Remember the str and switch to the next slot */
2068                *callresult++ = str;
2069                break;
2070            }
2071            case 'U':
2072            {
2073                PyObject *obj = va_arg(count, PyObject *);
2074                assert(obj && _PyUnicode_CHECK(obj));
2075                if (PyUnicode_READY(obj) == -1)
2076                    goto fail;
2077                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2078                maxchar = Py_MAX(maxchar, argmaxchar);
2079                n += PyUnicode_GET_LENGTH(obj);
2080                break;
2081            }
2082            case 'V':
2083            {
2084                PyObject *obj = va_arg(count, PyObject *);
2085                const char *str = va_arg(count, const char *);
2086                PyObject *str_obj;
2087                assert(obj || str);
2088                assert(!obj || _PyUnicode_CHECK(obj));
2089                if (obj) {
2090                    if (PyUnicode_READY(obj) == -1)
2091                        goto fail;
2092                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2093                    maxchar = Py_MAX(maxchar, argmaxchar);
2094                    n += PyUnicode_GET_LENGTH(obj);
2095                    *callresult++ = NULL;
2096                }
2097                else {
2098                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2099                    if (!str_obj)
2100                        goto fail;
2101                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2102                    maxchar = Py_MAX(maxchar, argmaxchar);
2103                    n += PyUnicode_GET_LENGTH(str_obj);
2104                    *callresult++ = str_obj;
2105                }
2106                break;
2107            }
2108            case 'S':
2109            {
2110                PyObject *obj = va_arg(count, PyObject *);
2111                PyObject *str;
2112                assert(obj);
2113                str = PyObject_Str(obj);
2114                if (!str || PyUnicode_READY(str) == -1)
2115                    goto fail;
2116                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2117                maxchar = Py_MAX(maxchar, argmaxchar);
2118                n += PyUnicode_GET_LENGTH(str);
2119                /* Remember the str and switch to the next slot */
2120                *callresult++ = str;
2121                break;
2122            }
2123            case 'R':
2124            {
2125                PyObject *obj = va_arg(count, PyObject *);
2126                PyObject *repr;
2127                assert(obj);
2128                repr = PyObject_Repr(obj);
2129                if (!repr || PyUnicode_READY(repr) == -1)
2130                    goto fail;
2131                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2132                maxchar = Py_MAX(maxchar, argmaxchar);
2133                n += PyUnicode_GET_LENGTH(repr);
2134                /* Remember the repr and switch to the next slot */
2135                *callresult++ = repr;
2136                break;
2137            }
2138            case 'A':
2139            {
2140                PyObject *obj = va_arg(count, PyObject *);
2141                PyObject *ascii;
2142                assert(obj);
2143                ascii = PyObject_ASCII(obj);
2144                if (!ascii || PyUnicode_READY(ascii) == -1)
2145                    goto fail;
2146                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2147                maxchar = Py_MAX(maxchar, argmaxchar);
2148                n += PyUnicode_GET_LENGTH(ascii);
2149                /* Remember the repr and switch to the next slot */
2150                *callresult++ = ascii;
2151                break;
2152            }
2153            default:
2154                /* if we stumble upon an unknown
2155                   formatting code, copy the rest of
2156                   the format string to the output
2157                   string. (we cannot just skip the
2158                   code, since there's no way to know
2159                   what's in the argument list) */
2160                n += strlen(p);
2161                goto expand;
2162            }
2163        } else
2164            n++;
2165    }
2166  expand:
2167    /* step 4: fill the buffer */
2168    /* Since we've analyzed how much space we need,
2169       we don't have to resize the string.
2170       There can be no errors beyond this point. */
2171    string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
2172    if (!string)
2173        goto fail;
2174    kind = PyUnicode_KIND(string);
2175    data = PyUnicode_DATA(string);
2176    callresult = callresults;
2177    numberresult = numberresults;
2178
2179    for (i = 0, f = format; *f; f++) {
2180        if (*f == '%') {
2181            const char* p;
2182
2183            p = f;
2184            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2185            /* checking for == because the last argument could be a empty
2186               string, which causes i to point to end, the assert at the end of
2187               the loop */
2188            assert(i <= PyUnicode_GET_LENGTH(string));
2189
2190            switch (*f) {
2191            case 'c':
2192            {
2193                const int ordinal = va_arg(vargs, int);
2194                PyUnicode_WRITE(kind, data, i++, ordinal);
2195                break;
2196            }
2197            case 'i':
2198            case 'd':
2199            case 'u':
2200            case 'x':
2201            case 'p':
2202                /* unused, since we already have the result */
2203                if (*f == 'p')
2204                    (void) va_arg(vargs, void *);
2205                else
2206                    (void) va_arg(vargs, int);
2207                /* extract the result from numberresults and append. */
2208                for (; *numberresult; ++i, ++numberresult)
2209                    PyUnicode_WRITE(kind, data, i, *numberresult);
2210                /* skip over the separating '\0' */
2211                assert(*numberresult == '\0');
2212                numberresult++;
2213                assert(numberresult <= numberresults + numbersize);
2214                break;
2215            case 's':
2216            {
2217                /* unused, since we already have the result */
2218                Py_ssize_t size;
2219                (void) va_arg(vargs, char *);
2220                size = PyUnicode_GET_LENGTH(*callresult);
2221                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2222                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2223                                             *callresult, 0,
2224                                             size) < 0)
2225                    goto fail;
2226                i += size;
2227                /* We're done with the unicode()/repr() => forget it */
2228                Py_DECREF(*callresult);
2229                /* switch to next unicode()/repr() result */
2230                ++callresult;
2231                break;
2232            }
2233            case 'U':
2234            {
2235                PyObject *obj = va_arg(vargs, PyObject *);
2236                Py_ssize_t size;
2237                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2238                size = PyUnicode_GET_LENGTH(obj);
2239                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2240                                             obj, 0,
2241                                             size) < 0)
2242                    goto fail;
2243                i += size;
2244                break;
2245            }
2246            case 'V':
2247            {
2248                Py_ssize_t size;
2249                PyObject *obj = va_arg(vargs, PyObject *);
2250                va_arg(vargs, const char *);
2251                if (obj) {
2252                    size = PyUnicode_GET_LENGTH(obj);
2253                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2254                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2255                                                 obj, 0,
2256                                                 size) < 0)
2257                        goto fail;
2258                    i += size;
2259                } else {
2260                    size = PyUnicode_GET_LENGTH(*callresult);
2261                    assert(PyUnicode_KIND(*callresult) <=
2262                           PyUnicode_KIND(string));
2263                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2264                                                 *callresult,
2265                                                 0, size) < 0)
2266                        goto fail;
2267                    i += size;
2268                    Py_DECREF(*callresult);
2269                }
2270                ++callresult;
2271                break;
2272            }
2273            case 'S':
2274            case 'R':
2275            case 'A':
2276            {
2277                /* unused, since we already have the result */
2278                (void) va_arg(vargs, PyObject *);
2279                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2280                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2281                                             *callresult, 0,
2282                                             PyUnicode_GET_LENGTH(*callresult)) < 0)
2283                    goto fail;
2284                i += PyUnicode_GET_LENGTH(*callresult);
2285                /* We're done with the unicode()/repr() => forget it */
2286                Py_DECREF(*callresult);
2287                /* switch to next unicode()/repr() result */
2288                ++callresult;
2289                break;
2290            }
2291            case '%':
2292                PyUnicode_WRITE(kind, data, i++, '%');
2293                break;
2294            default:
2295                for (; *p; ++p, ++i)
2296                    PyUnicode_WRITE(kind, data, i, *p);
2297                assert(i == PyUnicode_GET_LENGTH(string));
2298                goto end;
2299            }
2300        }
2301        else {
2302            assert(i < PyUnicode_GET_LENGTH(string));
2303            PyUnicode_WRITE(kind, data, i++, *f);
2304        }
2305    }
2306    assert(i == PyUnicode_GET_LENGTH(string));
2307
2308  end:
2309    if (callresults)
2310        PyObject_Free(callresults);
2311    if (numberresults)
2312        PyObject_Free(numberresults);
2313    return (PyObject *)string;
2314  fail:
2315    if (callresults) {
2316        PyObject **callresult2 = callresults;
2317        while (callresult2 < callresult) {
2318            Py_XDECREF(*callresult2);
2319            ++callresult2;
2320        }
2321        PyObject_Free(callresults);
2322    }
2323    if (numberresults)
2324        PyObject_Free(numberresults);
2325    return NULL;
2326}
2327
2328PyObject *
2329PyUnicode_FromFormat(const char *format, ...)
2330{
2331    PyObject* ret;
2332    va_list vargs;
2333
2334#ifdef HAVE_STDARG_PROTOTYPES
2335    va_start(vargs, format);
2336#else
2337    va_start(vargs);
2338#endif
2339    ret = PyUnicode_FromFormatV(format, vargs);
2340    va_end(vargs);
2341    return ret;
2342}
2343
2344#ifdef HAVE_WCHAR_H
2345
2346/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2347   convert a Unicode object to a wide character string.
2348
2349   - If w is NULL: return the number of wide characters (including the null
2350     character) required to convert the unicode object. Ignore size argument.
2351
2352   - Otherwise: return the number of wide characters (excluding the null
2353     character) written into w. Write at most size wide characters (including
2354     the null character). */
2355static Py_ssize_t
2356unicode_aswidechar(PyUnicodeObject *unicode,
2357                   wchar_t *w,
2358                   Py_ssize_t size)
2359{
2360    Py_ssize_t res;
2361    const wchar_t *wstr;
2362
2363    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2364    if (wstr == NULL)
2365        return -1;
2366
2367    if (w != NULL) {
2368        if (size > res)
2369            size = res + 1;
2370        else
2371            res = size;
2372        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2373        return res;
2374    }
2375    else
2376        return res + 1;
2377}
2378
2379Py_ssize_t
2380PyUnicode_AsWideChar(PyObject *unicode,
2381                     wchar_t *w,
2382                     Py_ssize_t size)
2383{
2384    if (unicode == NULL) {
2385        PyErr_BadInternalCall();
2386        return -1;
2387    }
2388    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2389}
2390
2391wchar_t*
2392PyUnicode_AsWideCharString(PyObject *unicode,
2393                           Py_ssize_t *size)
2394{
2395    wchar_t* buffer;
2396    Py_ssize_t buflen;
2397
2398    if (unicode == NULL) {
2399        PyErr_BadInternalCall();
2400        return NULL;
2401    }
2402
2403    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2404    if (buflen == -1)
2405        return NULL;
2406    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2407        PyErr_NoMemory();
2408        return NULL;
2409    }
2410
2411    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2412    if (buffer == NULL) {
2413        PyErr_NoMemory();
2414        return NULL;
2415    }
2416    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2417    if (buflen == -1)
2418        return NULL;
2419    if (size != NULL)
2420        *size = buflen;
2421    return buffer;
2422}
2423
2424#endif /* HAVE_WCHAR_H */
2425
2426PyObject *
2427PyUnicode_FromOrdinal(int ordinal)
2428{
2429    PyObject *v;
2430    if (ordinal < 0 || ordinal > 0x10ffff) {
2431        PyErr_SetString(PyExc_ValueError,
2432                        "chr() arg not in range(0x110000)");
2433        return NULL;
2434    }
2435
2436    if (ordinal < 256)
2437        return get_latin1_char(ordinal);
2438
2439    v = PyUnicode_New(1, ordinal);
2440    if (v == NULL)
2441        return NULL;
2442    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2443    return v;
2444}
2445
2446PyObject *
2447PyUnicode_FromObject(register PyObject *obj)
2448{
2449    /* XXX Perhaps we should make this API an alias of
2450       PyObject_Str() instead ?! */
2451    if (PyUnicode_CheckExact(obj)) {
2452        if (PyUnicode_READY(obj))
2453            return NULL;
2454        Py_INCREF(obj);
2455        return obj;
2456    }
2457    if (PyUnicode_Check(obj)) {
2458        /* For a Unicode subtype that's not a Unicode object,
2459           return a true Unicode object with the same data. */
2460        return PyUnicode_Copy(obj);
2461    }
2462    PyErr_Format(PyExc_TypeError,
2463                 "Can't convert '%.100s' object to str implicitly",
2464                 Py_TYPE(obj)->tp_name);
2465    return NULL;
2466}
2467
2468PyObject *
2469PyUnicode_FromEncodedObject(register PyObject *obj,
2470                            const char *encoding,
2471                            const char *errors)
2472{
2473    Py_buffer buffer;
2474    PyObject *v;
2475
2476    if (obj == NULL) {
2477        PyErr_BadInternalCall();
2478        return NULL;
2479    }
2480
2481    /* Decoding bytes objects is the most common case and should be fast */
2482    if (PyBytes_Check(obj)) {
2483        if (PyBytes_GET_SIZE(obj) == 0) {
2484            Py_INCREF(unicode_empty);
2485            v = unicode_empty;
2486        }
2487        else {
2488            v = PyUnicode_Decode(
2489                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2490                    encoding, errors);
2491        }
2492        return v;
2493    }
2494
2495    if (PyUnicode_Check(obj)) {
2496        PyErr_SetString(PyExc_TypeError,
2497                        "decoding str is not supported");
2498        return NULL;
2499    }
2500
2501    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2502    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2503        PyErr_Format(PyExc_TypeError,
2504                     "coercing to str: need bytes, bytearray "
2505                     "or buffer-like object, %.80s found",
2506                     Py_TYPE(obj)->tp_name);
2507        return NULL;
2508    }
2509
2510    if (buffer.len == 0) {
2511        Py_INCREF(unicode_empty);
2512        v = unicode_empty;
2513    }
2514    else
2515        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2516
2517    PyBuffer_Release(&buffer);
2518    return v;
2519}
2520
/* Write a normalized copy of encoding into lower: upper case letters are
   lowered and every '_' becomes '-', so that e.g. "UTF_8" is recognized.
   lower_len is the capacity of lower, counting the terminating '\0'.
   Return 1 on success, 0 if encoding does not fit (it is longer than
   lower_len-1 characters); in that case lower is left unterminated. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src = encoding;
    char *dst = lower;
    /* last usable slot: reserved for the terminating '\0' */
    char *const limit = lower + (lower_len - 1);

    for (; *src != '\0'; src++) {
        char c;

        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            c = Py_TOLOWER(*src);
        else if (*src == '_')
            c = '-';
        else
            c = *src;
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2553
2554PyObject *
2555PyUnicode_Decode(const char *s,
2556                 Py_ssize_t size,
2557                 const char *encoding,
2558                 const char *errors)
2559{
2560    PyObject *buffer = NULL, *unicode;
2561    Py_buffer info;
2562    char lower[11];  /* Enough for any encoding shortcut */
2563
2564    if (encoding == NULL)
2565        return PyUnicode_DecodeUTF8(s, size, errors);
2566
2567    /* Shortcuts for common default encodings */
2568    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2569        if ((strcmp(lower, "utf-8") == 0) ||
2570            (strcmp(lower, "utf8") == 0))
2571            return PyUnicode_DecodeUTF8(s, size, errors);
2572        else if ((strcmp(lower, "latin-1") == 0) ||
2573                 (strcmp(lower, "latin1") == 0) ||
2574                 (strcmp(lower, "iso-8859-1") == 0))
2575            return PyUnicode_DecodeLatin1(s, size, errors);
2576#ifdef HAVE_MBCS
2577        else if (strcmp(lower, "mbcs") == 0)
2578            return PyUnicode_DecodeMBCS(s, size, errors);
2579#endif
2580        else if (strcmp(lower, "ascii") == 0)
2581            return PyUnicode_DecodeASCII(s, size, errors);
2582        else if (strcmp(lower, "utf-16") == 0)
2583            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2584        else if (strcmp(lower, "utf-32") == 0)
2585            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2586    }
2587
2588    /* Decode via the codec registry */
2589    buffer = NULL;
2590    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2591        goto onError;
2592    buffer = PyMemoryView_FromBuffer(&info);
2593    if (buffer == NULL)
2594        goto onError;
2595    unicode = PyCodec_Decode(buffer, encoding, errors);
2596    if (unicode == NULL)
2597        goto onError;
2598    if (!PyUnicode_Check(unicode)) {
2599        PyErr_Format(PyExc_TypeError,
2600                     "decoder did not return a str object (type=%.400s)",
2601                     Py_TYPE(unicode)->tp_name);
2602        Py_DECREF(unicode);
2603        goto onError;
2604    }
2605    Py_DECREF(buffer);
2606    if (_PyUnicode_READY_REPLACE(&unicode)) {
2607        Py_DECREF(unicode);
2608        return NULL;
2609    }
2610    return unicode;
2611
2612  onError:
2613    Py_XDECREF(buffer);
2614    return NULL;
2615}
2616
2617PyObject *
2618PyUnicode_AsDecodedObject(PyObject *unicode,
2619                          const char *encoding,
2620                          const char *errors)
2621{
2622    PyObject *v;
2623
2624    if (!PyUnicode_Check(unicode)) {
2625        PyErr_BadArgument();
2626        goto onError;
2627    }
2628
2629    if (encoding == NULL)
2630        encoding = PyUnicode_GetDefaultEncoding();
2631
2632    /* Decode via the codec registry */
2633    v = PyCodec_Decode(unicode, encoding, errors);
2634    if (v == NULL)
2635        goto onError;
2636    return v;
2637
2638  onError:
2639    return NULL;
2640}
2641
2642PyObject *
2643PyUnicode_AsDecodedUnicode(PyObject *unicode,
2644                           const char *encoding,
2645                           const char *errors)
2646{
2647    PyObject *v;
2648
2649    if (!PyUnicode_Check(unicode)) {
2650        PyErr_BadArgument();
2651        goto onError;
2652    }
2653
2654    if (encoding == NULL)
2655        encoding = PyUnicode_GetDefaultEncoding();
2656
2657    /* Decode via the codec registry */
2658    v = PyCodec_Decode(unicode, encoding, errors);
2659    if (v == NULL)
2660        goto onError;
2661    if (!PyUnicode_Check(v)) {
2662        PyErr_Format(PyExc_TypeError,
2663                     "decoder did not return a str object (type=%.400s)",
2664                     Py_TYPE(v)->tp_name);
2665        Py_DECREF(v);
2666        goto onError;
2667    }
2668    return v;
2669
2670  onError:
2671    return NULL;
2672}
2673
2674PyObject *
2675PyUnicode_Encode(const Py_UNICODE *s,
2676                 Py_ssize_t size,
2677                 const char *encoding,
2678                 const char *errors)
2679{
2680    PyObject *v, *unicode;
2681
2682    unicode = PyUnicode_FromUnicode(s, size);
2683    if (unicode == NULL)
2684        return NULL;
2685    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2686    Py_DECREF(unicode);
2687    return v;
2688}
2689
2690PyObject *
2691PyUnicode_AsEncodedObject(PyObject *unicode,
2692                          const char *encoding,
2693                          const char *errors)
2694{
2695    PyObject *v;
2696
2697    if (!PyUnicode_Check(unicode)) {
2698        PyErr_BadArgument();
2699        goto onError;
2700    }
2701
2702    if (encoding == NULL)
2703        encoding = PyUnicode_GetDefaultEncoding();
2704
2705    /* Encode via the codec registry */
2706    v = PyCodec_Encode(unicode, encoding, errors);
2707    if (v == NULL)
2708        goto onError;
2709    return v;
2710
2711  onError:
2712    return NULL;
2713}
2714
2715PyObject *
2716PyUnicode_EncodeFSDefault(PyObject *unicode)
2717{
2718#ifdef HAVE_MBCS
2719    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2720                                PyUnicode_GET_SIZE(unicode),
2721                                NULL);
2722#elif defined(__APPLE__)
2723    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2724#else
2725    PyInterpreterState *interp = PyThreadState_GET()->interp;
2726    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2727       cannot use it to encode and decode filenames before it is loaded. Load
2728       the Python codec requires to encode at least its own filename. Use the C
2729       version of the locale codec until the codec registry is initialized and
2730       the Python codec is loaded.
2731
2732       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2733       cannot only rely on it: check also interp->fscodec_initialized for
2734       subinterpreters. */
2735    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2736        return PyUnicode_AsEncodedString(unicode,
2737                                         Py_FileSystemDefaultEncoding,
2738                                         "surrogateescape");
2739    }
2740    else {
2741        /* locale encoding with surrogateescape */
2742        wchar_t *wchar;
2743        char *bytes;
2744        PyObject *bytes_obj;
2745        size_t error_pos;
2746
2747        wchar = PyUnicode_AsWideCharString(unicode, NULL);
2748        if (wchar == NULL)
2749            return NULL;
2750        bytes = _Py_wchar2char(wchar, &error_pos);
2751        if (bytes == NULL) {
2752            if (error_pos != (size_t)-1) {
2753                char *errmsg = strerror(errno);
2754                PyObject *exc = NULL;
2755                if (errmsg == NULL)
2756                    errmsg = "Py_wchar2char() failed";
2757                raise_encode_exception(&exc,
2758                    "filesystemencoding",
2759                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2760                    error_pos, error_pos+1,
2761                    errmsg);
2762                Py_XDECREF(exc);
2763            }
2764            else
2765                PyErr_NoMemory();
2766            PyMem_Free(wchar);
2767            return NULL;
2768        }
2769        PyMem_Free(wchar);
2770
2771        bytes_obj = PyBytes_FromString(bytes);
2772        PyMem_Free(bytes);
2773        return bytes_obj;
2774    }
2775#endif
2776}
2777
2778PyObject *
2779PyUnicode_AsEncodedString(PyObject *unicode,
2780                          const char *encoding,
2781                          const char *errors)
2782{
2783    PyObject *v;
2784    char lower[11];  /* Enough for any encoding shortcut */
2785
2786    if (!PyUnicode_Check(unicode)) {
2787        PyErr_BadArgument();
2788        return NULL;
2789    }
2790
2791    if (encoding == NULL) {
2792        if (errors == NULL || strcmp(errors, "strict") == 0)
2793            return _PyUnicode_AsUTF8String(unicode, NULL);
2794        else
2795            return _PyUnicode_AsUTF8String(unicode, errors);
2796    }
2797
2798    /* Shortcuts for common default encodings */
2799    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2800        if ((strcmp(lower, "utf-8") == 0) ||
2801            (strcmp(lower, "utf8") == 0))
2802        {
2803            if (errors == NULL || strcmp(errors, "strict") == 0)
2804                return _PyUnicode_AsUTF8String(unicode, NULL);
2805            else
2806                return _PyUnicode_AsUTF8String(unicode, errors);
2807        }
2808        else if ((strcmp(lower, "latin-1") == 0) ||
2809                 (strcmp(lower, "latin1") == 0) ||
2810                 (strcmp(lower, "iso-8859-1") == 0))
2811            return _PyUnicode_AsLatin1String(unicode, errors);
2812#ifdef HAVE_MBCS
2813        else if (strcmp(lower, "mbcs") == 0)
2814            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2815                                        PyUnicode_GET_SIZE(unicode),
2816                                        errors);
2817#endif
2818        else if (strcmp(lower, "ascii") == 0)
2819            return _PyUnicode_AsASCIIString(unicode, errors);
2820    }
2821
2822    /* Encode via the codec registry */
2823    v = PyCodec_Encode(unicode, encoding, errors);
2824    if (v == NULL)
2825        return NULL;
2826
2827    /* The normal path */
2828    if (PyBytes_Check(v))
2829        return v;
2830
2831    /* If the codec returns a buffer, raise a warning and convert to bytes */
2832    if (PyByteArray_Check(v)) {
2833        int error;
2834        PyObject *b;
2835
2836        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2837            "encoder %s returned bytearray instead of bytes",
2838            encoding);
2839        if (error) {
2840            Py_DECREF(v);
2841            return NULL;
2842        }
2843
2844        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2845        Py_DECREF(v);
2846        return b;
2847    }
2848
2849    PyErr_Format(PyExc_TypeError,
2850                 "encoder did not return a bytes object (type=%.400s)",
2851                 Py_TYPE(v)->tp_name);
2852    Py_DECREF(v);
2853    return NULL;
2854}
2855
2856PyObject *
2857PyUnicode_AsEncodedUnicode(PyObject *unicode,
2858                           const char *encoding,
2859                           const char *errors)
2860{
2861    PyObject *v;
2862
2863    if (!PyUnicode_Check(unicode)) {
2864        PyErr_BadArgument();
2865        goto onError;
2866    }
2867
2868    if (encoding == NULL)
2869        encoding = PyUnicode_GetDefaultEncoding();
2870
2871    /* Encode via the codec registry */
2872    v = PyCodec_Encode(unicode, encoding, errors);
2873    if (v == NULL)
2874        goto onError;
2875    if (!PyUnicode_Check(v)) {
2876        PyErr_Format(PyExc_TypeError,
2877                     "encoder did not return an str object (type=%.400s)",
2878                     Py_TYPE(v)->tp_name);
2879        Py_DECREF(v);
2880        goto onError;
2881    }
2882    return v;
2883
2884  onError:
2885    return NULL;
2886}
2887
2888PyObject*
2889PyUnicode_DecodeFSDefault(const char *s) {
2890    Py_ssize_t size = (Py_ssize_t)strlen(s);
2891    return PyUnicode_DecodeFSDefaultAndSize(s, size);
2892}
2893
2894PyObject*
2895PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2896{
2897#ifdef HAVE_MBCS
2898    return PyUnicode_DecodeMBCS(s, size, NULL);
2899#elif defined(__APPLE__)
2900    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2901#else
2902    PyInterpreterState *interp = PyThreadState_GET()->interp;
2903    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2904       cannot use it to encode and decode filenames before it is loaded. Load
2905       the Python codec requires to encode at least its own filename. Use the C
2906       version of the locale codec until the codec registry is initialized and
2907       the Python codec is loaded.
2908
2909       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2910       cannot only rely on it: check also interp->fscodec_initialized for
2911       subinterpreters. */
2912    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2913        return PyUnicode_Decode(s, size,
2914                                Py_FileSystemDefaultEncoding,
2915                                "surrogateescape");
2916    }
2917    else {
2918        /* locale encoding with surrogateescape */
2919        wchar_t *wchar;
2920        PyObject *unicode;
2921        size_t len;
2922
2923        if (s[size] != '\0' || size != strlen(s)) {
2924            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2925            return NULL;
2926        }
2927
2928        wchar = _Py_char2wchar(s, &len);
2929        if (wchar == NULL)
2930            return PyErr_NoMemory();
2931
2932        unicode = PyUnicode_FromWideChar(wchar, len);
2933        PyMem_Free(wchar);
2934        return unicode;
2935    }
2936#endif
2937}
2938
2939
2940int
2941PyUnicode_FSConverter(PyObject* arg, void* addr)
2942{
2943    PyObject *output = NULL;
2944    Py_ssize_t size;
2945    void *data;
2946    if (arg == NULL) {
2947        Py_DECREF(*(PyObject**)addr);
2948        return 1;
2949    }
2950    if (PyBytes_Check(arg)) {
2951        output = arg;
2952        Py_INCREF(output);
2953    }
2954    else {
2955        arg = PyUnicode_FromObject(arg);
2956        if (!arg)
2957            return 0;
2958        output = PyUnicode_EncodeFSDefault(arg);
2959        Py_DECREF(arg);
2960        if (!output)
2961            return 0;
2962        if (!PyBytes_Check(output)) {
2963            Py_DECREF(output);
2964            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
2965            return 0;
2966        }
2967    }
2968    size = PyBytes_GET_SIZE(output);
2969    data = PyBytes_AS_STRING(output);
2970    if (size != strlen(data)) {
2971        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2972        Py_DECREF(output);
2973        return 0;
2974    }
2975    *(PyObject**)addr = output;
2976    return Py_CLEANUP_SUPPORTED;
2977}
2978
2979
2980int
2981PyUnicode_FSDecoder(PyObject* arg, void* addr)
2982{
2983    PyObject *output = NULL;
2984    if (arg == NULL) {
2985        Py_DECREF(*(PyObject**)addr);
2986        return 1;
2987    }
2988    if (PyUnicode_Check(arg)) {
2989        if (PyUnicode_READY(arg))
2990            return 0;
2991        output = arg;
2992        Py_INCREF(output);
2993    }
2994    else {
2995        arg = PyBytes_FromObject(arg);
2996        if (!arg)
2997            return 0;
2998        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
2999                                                  PyBytes_GET_SIZE(arg));
3000        Py_DECREF(arg);
3001        if (!output)
3002            return 0;
3003        if (!PyUnicode_Check(output)) {
3004            Py_DECREF(output);
3005            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3006            return 0;
3007        }
3008    }
3009    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3010                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3011        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3012        Py_DECREF(output);
3013        return 0;
3014    }
3015    *(PyObject**)addr = output;
3016    return Py_CLEANUP_SUPPORTED;
3017}
3018
3019
3020char*
3021PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3022{
3023    PyObject *bytes;
3024    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3025
3026    if (!PyUnicode_Check(unicode)) {
3027        PyErr_BadArgument();
3028        return NULL;
3029    }
3030    if (PyUnicode_READY(u) == -1)
3031        return NULL;
3032
3033    if (PyUnicode_UTF8(unicode) == NULL) {
3034        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3035        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3036        if (bytes == NULL)
3037            return NULL;
3038        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3039        if (_PyUnicode_UTF8(u) == NULL) {
3040            Py_DECREF(bytes);
3041            return NULL;
3042        }
3043        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3044        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3045        Py_DECREF(bytes);
3046    }
3047
3048    if (psize)
3049        *psize = PyUnicode_UTF8_LENGTH(unicode);
3050    return PyUnicode_UTF8(unicode);
3051}
3052
3053char*
3054PyUnicode_AsUTF8(PyObject *unicode)
3055{
3056    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3057}
3058
3059#ifdef Py_DEBUG
3060int unicode_as_unicode_calls = 0;
3061#endif
3062
3063
3064Py_UNICODE *
3065PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3066{
3067    PyUnicodeObject *u;
3068    const unsigned char *one_byte;
3069#if SIZEOF_WCHAR_T == 4
3070    const Py_UCS2 *two_bytes;
3071#else
3072    const Py_UCS4 *four_bytes;
3073    const Py_UCS4 *ucs4_end;
3074    Py_ssize_t num_surrogates;
3075#endif
3076    wchar_t *w;
3077    wchar_t *wchar_end;
3078
3079    if (!PyUnicode_Check(unicode)) {
3080        PyErr_BadArgument();
3081        return NULL;
3082    }
3083    u = (PyUnicodeObject*)unicode;
3084    if (_PyUnicode_WSTR(u) == NULL) {
3085        /* Non-ASCII compact unicode object */
3086        assert(_PyUnicode_KIND(u) != 0);
3087        assert(PyUnicode_IS_READY(u));
3088
3089#ifdef Py_DEBUG
3090        ++unicode_as_unicode_calls;
3091#endif
3092
3093        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3094#if SIZEOF_WCHAR_T == 2
3095            four_bytes = PyUnicode_4BYTE_DATA(u);
3096            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3097            num_surrogates = 0;
3098
3099            for (; four_bytes < ucs4_end; ++four_bytes) {
3100                if (*four_bytes > 0xFFFF)
3101                    ++num_surrogates;
3102            }
3103
3104            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3105                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3106            if (!_PyUnicode_WSTR(u)) {
3107                PyErr_NoMemory();
3108                return NULL;
3109            }
3110            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3111
3112            w = _PyUnicode_WSTR(u);
3113            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3114            four_bytes = PyUnicode_4BYTE_DATA(u);
3115            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3116                if (*four_bytes > 0xFFFF) {
3117                    /* encode surrogate pair in this case */
3118                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3119                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3120                }
3121                else
3122                    *w = *four_bytes;
3123
3124                if (w > wchar_end) {
3125                    assert(0 && "Miscalculated string end");
3126                }
3127            }
3128            *w = 0;
3129#else
3130            /* sizeof(wchar_t) == 4 */
3131            Py_FatalError("Impossible unicode object state, wstr and str "
3132                          "should share memory already.");
3133            return NULL;
3134#endif
3135        }
3136        else {
3137            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3138                                                  (_PyUnicode_LENGTH(u) + 1));
3139            if (!_PyUnicode_WSTR(u)) {
3140                PyErr_NoMemory();
3141                return NULL;
3142            }
3143            if (!PyUnicode_IS_COMPACT_ASCII(u))
3144                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3145            w = _PyUnicode_WSTR(u);
3146            wchar_end = w + _PyUnicode_LENGTH(u);
3147
3148            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3149                one_byte = PyUnicode_1BYTE_DATA(u);
3150                for (; w < wchar_end; ++one_byte, ++w)
3151                    *w = *one_byte;
3152                /* null-terminate the wstr */
3153                *w = 0;
3154            }
3155            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3156#if SIZEOF_WCHAR_T == 4
3157                two_bytes = PyUnicode_2BYTE_DATA(u);
3158                for (; w < wchar_end; ++two_bytes, ++w)
3159                    *w = *two_bytes;
3160                /* null-terminate the wstr */
3161                *w = 0;
3162#else
3163                /* sizeof(wchar_t) == 2 */
3164                PyObject_FREE(_PyUnicode_WSTR(u));
3165                _PyUnicode_WSTR(u) = NULL;
3166                Py_FatalError("Impossible unicode object state, wstr "
3167                              "and str should share memory already.");
3168                return NULL;
3169#endif
3170            }
3171            else {
3172                assert(0 && "This should never happen.");
3173            }
3174        }
3175    }
3176    if (size != NULL)
3177        *size = PyUnicode_WSTR_LENGTH(u);
3178    return _PyUnicode_WSTR(u);
3179}
3180
3181Py_UNICODE *
3182PyUnicode_AsUnicode(PyObject *unicode)
3183{
3184    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3185}
3186
3187
3188Py_ssize_t
3189PyUnicode_GetSize(PyObject *unicode)
3190{
3191    if (!PyUnicode_Check(unicode)) {
3192        PyErr_BadArgument();
3193        goto onError;
3194    }
3195    return PyUnicode_GET_SIZE(unicode);
3196
3197  onError:
3198    return -1;
3199}
3200
3201Py_ssize_t
3202PyUnicode_GetLength(PyObject *unicode)
3203{
3204    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3205        PyErr_BadArgument();
3206        return -1;
3207    }
3208
3209    return PyUnicode_GET_LENGTH(unicode);
3210}
3211
3212Py_UCS4
3213PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3214{
3215    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3216        PyErr_BadArgument();
3217        return (Py_UCS4)-1;
3218    }
3219    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3220        PyErr_SetString(PyExc_IndexError, "string index out of range");
3221        return (Py_UCS4)-1;
3222    }
3223    return PyUnicode_READ_CHAR(unicode, index);
3224}
3225
3226int
3227PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3228{
3229    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3230        PyErr_BadArgument();
3231        return -1;
3232    }
3233    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3234        PyErr_SetString(PyExc_IndexError, "string index out of range");
3235        return -1;
3236    }
3237    if (_PyUnicode_Dirty(unicode))
3238        return -1;
3239    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3240                    index, ch);
3241    return 0;
3242}
3243
/* Return the name of the default encoding; this is always the constant
   string "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3249
3250/* create or adjust a UnicodeDecodeError */
3251static void
3252make_decode_exception(PyObject **exceptionObject,
3253                      const char *encoding,
3254                      const char *input, Py_ssize_t length,
3255                      Py_ssize_t startpos, Py_ssize_t endpos,
3256                      const char *reason)
3257{
3258    if (*exceptionObject == NULL) {
3259        *exceptionObject = PyUnicodeDecodeError_Create(
3260            encoding, input, length, startpos, endpos, reason);
3261    }
3262    else {
3263        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3264            goto onError;
3265        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3266            goto onError;
3267        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3268            goto onError;
3269    }
3270    return;
3271
3272onError:
3273    Py_DECREF(*exceptionObject);
3274    *exceptionObject = NULL;
3275}
3276
3277/* error handling callback helper:
3278   build arguments, call the callback and check the arguments,
3279   if no exception occurred, copy the replacement to the output
3280   and adjust various state variables.
3281   return 0 on success, -1 on error
3282*/
3283
3284static int
3285unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3286                                 const char *encoding, const char *reason,
3287                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3288                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3289                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3290{
3291    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3292
3293    PyObject *restuple = NULL;
3294    PyObject *repunicode = NULL;
3295    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3296    Py_ssize_t insize;
3297    Py_ssize_t requiredsize;
3298    Py_ssize_t newpos;
3299    const Py_UNICODE *repptr;
3300    PyObject *inputobj = NULL;
3301    Py_ssize_t repsize;
3302    int res = -1;
3303
3304    if (*errorHandler == NULL) {
3305        *errorHandler = PyCodec_LookupError(errors);
3306        if (*errorHandler == NULL)
3307            goto onError;
3308    }
3309
3310    make_decode_exception(exceptionObject,
3311        encoding,
3312        *input, *inend - *input,
3313        *startinpos, *endinpos,
3314        reason);
3315    if (*exceptionObject == NULL)
3316        goto onError;
3317
3318    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3319    if (restuple == NULL)
3320        goto onError;
3321    if (!PyTuple_Check(restuple)) {
3322        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3323        goto onError;
3324    }
3325    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3326        goto onError;
3327
3328    /* Copy back the bytes variables, which might have been modified by the
3329       callback */
3330    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3331    if (!inputobj)
3332        goto onError;
3333    if (!PyBytes_Check(inputobj)) {
3334        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3335    }
3336    *input = PyBytes_AS_STRING(inputobj);
3337    insize = PyBytes_GET_SIZE(inputobj);
3338    *inend = *input + insize;
3339    /* we can DECREF safely, as the exception has another reference,
3340       so the object won't go away. */
3341    Py_DECREF(inputobj);
3342
3343    if (newpos<0)
3344        newpos = insize+newpos;
3345    if (newpos<0 || newpos>insize) {
3346        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3347        goto onError;
3348    }
3349
3350    /* need more space? (at least enough for what we
3351       have+the replacement+the rest of the string (starting
3352       at the new input position), so we won't have to check space
3353       when there are no errors in the rest of the string) */
3354    repptr = PyUnicode_AS_UNICODE(repunicode);
3355    repsize = PyUnicode_GET_SIZE(repunicode);
3356    requiredsize = *outpos + repsize + insize-newpos;
3357    if (requiredsize > outsize) {
3358        if (requiredsize<2*outsize)
3359            requiredsize = 2*outsize;
3360        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3361            goto onError;
3362        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3363    }
3364    *endinpos = newpos;
3365    *inptr = *input + newpos;
3366    Py_UNICODE_COPY(*outptr, repptr, repsize);
3367    *outptr += repsize;
3368    *outpos += repsize;
3369
3370    /* we made it! */
3371    res = 0;
3372
3373  onError:
3374    Py_XDECREF(restuple);
3375    return res;
3376}
3377
3378/* --- UTF-7 Codec -------------------------------------------------------- */
3379
3380/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3381
3382/* Three simple macros defining base-64. */
3383
/* Is c a base-64 character, i.e. one of A-Z, a-z, 0-9, '+' or '/'?
   The argument is evaluated more than once. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))
3391
/* Given that c is a base-64 character, what is its base-64 value?
   A-Z map to 0-25, a-z to 26-51, 0-9 to 52-61, '+' to 62; everything
   else yields 63, so the result is only meaningful when IS_BASE64(c)
   holds.  The argument is evaluated more than once. */

#define FROM_BASE64(c)                                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) == '+') ? 62 : 63)
3399
/* What is the base-64 character of the bottom 6 bits of n?
   Higher bits of n are masked off before the alphabet lookup. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])
3404
/* DECODE_DIRECT: should this byte, encountered in a UTF-7 string outside
 * a base64 section, be decoded as itself?  Decoding is permissive: every
 * value up to 127 qualifies except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
3412
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array, indexed by the ASCII code,
 * identifies these different sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: should this character be encoded as itself?  The
 * answer depends on whether we are encoding set O as itself (directO),
 * and also on whether we are encoding whitespace as itself (directWS).
 * RFC2152 makes it clear that the answers to these questions vary
 * between applications, so this code needs to be flexible.
 *
 * Only characters in the range 1..127 can qualify; the range test comes
 * first so that utf7_category is never indexed out of bounds.  The flag
 * arguments are parenthesized so that an expression such as `a || b` is
 * taken as a whole.  c is evaluated more than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
3458
3459PyObject *
3460PyUnicode_DecodeUTF7(const char *s,
3461                     Py_ssize_t size,
3462                     const char *errors)
3463{
3464    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3465}
3466
3467/* The decoder.  The only state we preserve is our read position,
3468 * i.e. how many characters we have consumed.  So if we end in the
3469 * middle of a shift sequence we have to back off the read position
3470 * and the output to the beginning of the sequence, otherwise we lose
3471 * all the shift state (seen bits, number of bits seen, high
3472 * surrogate). */
3473
3474PyObject *
3475PyUnicode_DecodeUTF7Stateful(const char *s,
3476                             Py_ssize_t size,
3477                             const char *errors,
3478                             Py_ssize_t *consumed)
3479{
3480    const char *starts = s;
3481    Py_ssize_t startinpos;
3482    Py_ssize_t endinpos;
3483    Py_ssize_t outpos;
3484    const char *e;
3485    PyUnicodeObject *unicode;
3486    Py_UNICODE *p;
3487    const char *errmsg = "";
3488    int inShift = 0;
3489    Py_UNICODE *shiftOutStart;
3490    unsigned int base64bits = 0;
3491    unsigned long base64buffer = 0;
3492    Py_UNICODE surrogate = 0;
3493    PyObject *errorHandler = NULL;
3494    PyObject *exc = NULL;
3495
3496    unicode = _PyUnicode_New(size);
3497    if (!unicode)
3498        return NULL;
3499    if (size == 0) {
3500        if (consumed)
3501            *consumed = 0;
3502        return (PyObject *)unicode;
3503    }
3504
3505    p = PyUnicode_AS_UNICODE(unicode);
3506    shiftOutStart = p;
3507    e = s + size;
3508
3509    while (s < e) {
3510        Py_UNICODE ch;
3511      restart:
3512        ch = (unsigned char) *s;
3513
3514        if (inShift) { /* in a base-64 section */
3515            if (IS_BASE64(ch)) { /* consume a base-64 character */
3516                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3517                base64bits += 6;
3518                s++;
3519                if (base64bits >= 16) {
3520                    /* we have enough bits for a UTF-16 value */
3521                    Py_UNICODE outCh = (Py_UNICODE)
3522                                       (base64buffer >> (base64bits-16));
3523                    base64bits -= 16;
3524                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3525                    if (surrogate) {
3526                        /* expecting a second surrogate */
3527                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3528#ifdef Py_UNICODE_WIDE
3529                            *p++ = (((surrogate & 0x3FF)<<10)
3530                                    | (outCh & 0x3FF)) + 0x10000;
3531#else
3532                            *p++ = surrogate;
3533                            *p++ = outCh;
3534#endif
3535                            surrogate = 0;
3536                        }
3537                        else {
3538                            surrogate = 0;
3539                            errmsg = "second surrogate missing";
3540                            goto utf7Error;
3541                        }
3542                    }
3543                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3544                        /* first surrogate */
3545                        surrogate = outCh;
3546                    }
3547                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3548                        errmsg = "unexpected second surrogate";
3549                        goto utf7Error;
3550                    }
3551                    else {
3552                        *p++ = outCh;
3553                    }
3554                }
3555            }
3556            else { /* now leaving a base-64 section */
3557                inShift = 0;
3558                s++;
3559                if (surrogate) {
3560                    errmsg = "second surrogate missing at end of shift sequence";
3561                    goto utf7Error;
3562                }
3563                if (base64bits > 0) { /* left-over bits */
3564                    if (base64bits >= 6) {
3565                        /* We've seen at least one base-64 character */
3566                        errmsg = "partial character in shift sequence";
3567                        goto utf7Error;
3568                    }
3569                    else {
3570                        /* Some bits remain; they should be zero */
3571                        if (base64buffer != 0) {
3572                            errmsg = "non-zero padding bits in shift sequence";
3573                            goto utf7Error;
3574                        }
3575                    }
3576                }
3577                if (ch != '-') {
3578                    /* '-' is absorbed; other terminating
3579                       characters are preserved */
3580                    *p++ = ch;
3581                }
3582            }
3583        }
3584        else if ( ch == '+' ) {
3585            startinpos = s-starts;
3586            s++; /* consume '+' */
3587            if (s < e && *s == '-') { /* '+-' encodes '+' */
3588                s++;
3589                *p++ = '+';
3590            }
3591            else { /* begin base64-encoded section */
3592                inShift = 1;
3593                shiftOutStart = p;
3594                base64bits = 0;
3595            }
3596        }
3597        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3598            *p++ = ch;
3599            s++;
3600        }
3601        else {
3602            startinpos = s-starts;
3603            s++;
3604            errmsg = "unexpected special character";
3605            goto utf7Error;
3606        }
3607        continue;
3608utf7Error:
3609        outpos = p-PyUnicode_AS_UNICODE(unicode);
3610        endinpos = s-starts;
3611        if (unicode_decode_call_errorhandler(
3612                errors, &errorHandler,
3613                "utf7", errmsg,
3614                &starts, &e, &startinpos, &endinpos, &exc, &s,
3615                &unicode, &outpos, &p))
3616            goto onError;
3617    }
3618
3619    /* end of string */
3620
3621    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3622        /* if we're in an inconsistent state, that's an error */
3623        if (surrogate ||
3624                (base64bits >= 6) ||
3625                (base64bits > 0 && base64buffer != 0)) {
3626            outpos = p-PyUnicode_AS_UNICODE(unicode);
3627            endinpos = size;
3628            if (unicode_decode_call_errorhandler(
3629                    errors, &errorHandler,
3630                    "utf7", "unterminated shift sequence",
3631                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3632                    &unicode, &outpos, &p))
3633                goto onError;
3634            if (s < e)
3635                goto restart;
3636        }
3637    }
3638
3639    /* return state */
3640    if (consumed) {
3641        if (inShift) {
3642            p = shiftOutStart; /* back off output */
3643            *consumed = startinpos;
3644        }
3645        else {
3646            *consumed = s-starts;
3647        }
3648    }
3649
3650    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3651        goto onError;
3652
3653    Py_XDECREF(errorHandler);
3654    Py_XDECREF(exc);
3655    if (_PyUnicode_READY_REPLACE(&unicode)) {
3656        Py_DECREF(unicode);
3657        return NULL;
3658    }
3659    return (PyObject *)unicode;
3660
3661  onError:
3662    Py_XDECREF(errorHandler);
3663    Py_XDECREF(exc);
3664    Py_DECREF(unicode);
3665    return NULL;
3666}
3667
3668
3669PyObject *
3670PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3671                     Py_ssize_t size,
3672                     int base64SetO,
3673                     int base64WhiteSpace,
3674                     const char *errors)
3675{
3676    PyObject *v;
3677    /* It might be possible to tighten this worst case */
3678    Py_ssize_t allocated = 8 * size;
3679    int inShift = 0;
3680    Py_ssize_t i = 0;
3681    unsigned int base64bits = 0;
3682    unsigned long base64buffer = 0;
3683    char * out;
3684    char * start;
3685
3686    if (size == 0)
3687        return PyBytes_FromStringAndSize(NULL, 0);
3688
3689    if (allocated / 8 != size)
3690        return PyErr_NoMemory();
3691
3692    v = PyBytes_FromStringAndSize(NULL, allocated);
3693    if (v == NULL)
3694        return NULL;
3695
3696    start = out = PyBytes_AS_STRING(v);
3697    for (;i < size; ++i) {
3698        Py_UNICODE ch = s[i];
3699
3700        if (inShift) {
3701            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3702                /* shifting out */
3703                if (base64bits) { /* output remaining bits */
3704                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3705                    base64buffer = 0;
3706                    base64bits = 0;
3707                }
3708                inShift = 0;
3709                /* Characters not in the BASE64 set implicitly unshift the sequence
3710                   so no '-' is required, except if the character is itself a '-' */
3711                if (IS_BASE64(ch) || ch == '-') {
3712                    *out++ = '-';
3713                }
3714                *out++ = (char) ch;
3715            }
3716            else {
3717                goto encode_char;
3718            }
3719        }
3720        else { /* not in a shift sequence */
3721            if (ch == '+') {
3722                *out++ = '+';
3723                        *out++ = '-';
3724            }
3725            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3726                *out++ = (char) ch;
3727            }
3728            else {
3729                *out++ = '+';
3730                inShift = 1;
3731                goto encode_char;
3732            }
3733        }
3734        continue;
3735encode_char:
3736#ifdef Py_UNICODE_WIDE
3737        if (ch >= 0x10000) {
3738            /* code first surrogate */
3739            base64bits += 16;
3740            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3741            while (base64bits >= 6) {
3742                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3743                base64bits -= 6;
3744            }
3745            /* prepare second surrogate */
3746            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3747        }
3748#endif
3749        base64bits += 16;
3750        base64buffer = (base64buffer << 16) | ch;
3751        while (base64bits >= 6) {
3752            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3753            base64bits -= 6;
3754        }
3755    }
3756    if (base64bits)
3757        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3758    if (inShift)
3759        *out++ = '-';
3760    if (_PyBytes_Resize(&v, out - start) < 0)
3761        return NULL;
3762    return v;
3763}
3764
3765#undef IS_BASE64
3766#undef FROM_BASE64
3767#undef TO_BASE64
3768#undef DECODE_DIRECT
3769#undef ENCODE_DIRECT
3770
3771/* --- UTF-8 Codec -------------------------------------------------------- */
3772
/* Read-only lookup table: maps the first byte of a UTF-8 sequence to the
   total length of that sequence in bytes.  Zero marks a byte that can
   never start a sequence (continuation bytes 80-BF, the overlong
   prefixes C0-C1, and F5-FF).  See RFC 3629 for details. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3794
3795PyObject *
3796PyUnicode_DecodeUTF8(const char *s,
3797                     Py_ssize_t size,
3798                     const char *errors)
3799{
3800    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3801}
3802
3803/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3804#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3805
3806/* Mask to quickly check whether a C 'long' contains a
3807   non-ASCII, UTF8-encoded char. */
3808#if (SIZEOF_LONG == 8)
3809# define ASCII_CHAR_MASK 0x8080808080808080L
3810#elif (SIZEOF_LONG == 4)
3811# define ASCII_CHAR_MASK 0x80808080L
3812#else
3813# error C 'long' size should be either 4 or 8!
3814#endif
3815
3816/* Scans a UTF-8 string and returns the maximum character to be expected,
3817   the size of the decoded unicode string and if any major errors were
3818   encountered.
3819
3820   This function does check basic UTF-8 sanity, it does however NOT CHECK
3821   if the string contains surrogates, and if all continuation bytes are
3822   within the correct ranges, these checks are performed in
3823   PyUnicode_DecodeUTF8Stateful.
3824
3825   If it sets has_errors to 1, it means the value of unicode_size and max_char
3826   will be bogus and you should not rely on useful information in them.
3827   */
3828static Py_UCS4
3829utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3830                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3831                                  int *has_errors)
3832{
3833    Py_ssize_t n;
3834    Py_ssize_t char_count = 0;
3835    Py_UCS4 max_char = 127, new_max;
3836    Py_UCS4 upper_bound;
3837    const unsigned char *p = (const unsigned char *)s;
3838    const unsigned char *end = p + string_size;
3839    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3840    int err = 0;
3841
3842    for (; p < end && !err; ++p, ++char_count) {
3843        /* Only check value if it's not a ASCII char... */
3844        if (*p < 0x80) {
3845            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3846               an explanation. */
3847            if (!((size_t) p & LONG_PTR_MASK)) {
3848                /* Help register allocation */
3849                register const unsigned char *_p = p;
3850                while (_p < aligned_end) {
3851                    unsigned long value = *(unsigned long *) _p;
3852                    if (value & ASCII_CHAR_MASK)
3853                        break;
3854                    _p += SIZEOF_LONG;
3855                    char_count += SIZEOF_LONG;
3856                }
3857                p = _p;
3858                if (p == end)
3859                    break;
3860            }
3861        }
3862        if (*p >= 0x80) {
3863            n = utf8_code_length[*p];
3864            new_max = max_char;
3865            switch (n) {
3866            /* invalid start byte */
3867            case 0:
3868                err = 1;
3869                break;
3870            case 2:
3871                /* Code points between 0x00FF and 0x07FF inclusive.
3872                   Approximate the upper bound of the code point,
3873                   if this flips over 255 we can be sure it will be more
3874                   than 255 and the string will need 2 bytes per code coint,
3875                   if it stays under or equal to 255, we can be sure 1 byte
3876                   is enough.
3877                   ((*p & 0b00011111) << 6) | 0b00111111 */
3878                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3879                if (max_char < upper_bound)
3880                    new_max = upper_bound;
3881                /* Ensure we track at least that we left ASCII space. */
3882                if (new_max < 128)
3883                    new_max = 128;
3884                break;
3885            case 3:
3886                /* Between 0x0FFF and 0xFFFF inclusive, so values are
3887                   always > 255 and <= 65535 and will always need 2 bytes. */
3888                if (max_char < 65535)
3889                    new_max = 65535;
3890                break;
3891            case 4:
3892                /* Code point will be above 0xFFFF for sure in this case. */
3893                new_max = 65537;
3894                break;
3895            /* Internal error, this should be caught by the first if */
3896            case 1:
3897            default:
3898                assert(0 && "Impossible case in utf8_max_char_and_size");
3899                err = 1;
3900            }
3901            /* Instead of number of overall bytes for this code point,
3902               n containts the number of following bytes: */
3903            --n;
3904            /* Check if the follow up chars are all valid continuation bytes */
3905            if (n >= 1) {
3906                const unsigned char *cont;
3907                if ((p + n) >= end) {
3908                    if (consumed == 0)
3909                        /* incomplete data, non-incremental decoding */
3910                        err = 1;
3911                    break;
3912                }
3913                for (cont = p + 1; cont < (p + n); ++cont) {
3914                    if ((*cont & 0xc0) != 0x80) {
3915                        err = 1;
3916                        break;
3917                    }
3918                }
3919                p += n;
3920            }
3921            else
3922                err = 1;
3923            max_char = new_max;
3924        }
3925    }
3926
3927    if (unicode_size)
3928        *unicode_size = char_count;
3929    if (has_errors)
3930        *has_errors = err;
3931    return max_char;
3932}
3933
/* Similar to PyUnicode_WRITE but can also write into wstr field
   of the legacy unicode representation.

   Stores `value` at position `index` of the buffer `buf`, casting both the
   buffer pointer and the value to the element type selected by `kind`:
   Py_UNICODE for PyUnicode_WCHAR_KIND, unsigned char for
   PyUnicode_1BYTE_KIND, Py_UCS2 for PyUnicode_2BYTE_KIND and Py_UCS4 for
   any other kind.

   `kind` is evaluated once (cached in k_).  `buf`, `index` and `value`
   appear in every branch, but only one branch executes, so an argument
   with side effects such as `i++` takes effect exactly once.  The
   do { } while (0) wrapper makes the expansion a single statement. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        if (k_ == PyUnicode_WCHAR_KIND) \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
        else if (k_ == PyUnicode_1BYTE_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else if (k_ == PyUnicode_2BYTE_KIND) \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
        else \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
    } while (0)
3948
3949PyObject *
3950PyUnicode_DecodeUTF8Stateful(const char *s,
3951                             Py_ssize_t size,
3952                             const char *errors,
3953                             Py_ssize_t *consumed)
3954{
3955    const char *starts = s;
3956    int n;
3957    int k;
3958    Py_ssize_t startinpos;
3959    Py_ssize_t endinpos;
3960    const char *e, *aligned_end;
3961    PyUnicodeObject *unicode;
3962    const char *errmsg = "";
3963    PyObject *errorHandler = NULL;
3964    PyObject *exc = NULL;
3965    Py_UCS4 maxchar = 0;
3966    Py_ssize_t unicode_size;
3967    Py_ssize_t i;
3968    int kind;
3969    void *data;
3970    int has_errors;
3971    Py_UNICODE *error_outptr;
3972#if SIZEOF_WCHAR_T == 2
3973    Py_ssize_t wchar_offset = 0;
3974#endif
3975
3976    if (size == 0) {
3977        if (consumed)
3978            *consumed = 0;
3979        return (PyObject *)PyUnicode_New(0, 0);
3980    }
3981    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
3982                                                consumed, &has_errors);
3983    if (has_errors) {
3984        unicode = _PyUnicode_New(size);
3985        if (!unicode)
3986            return NULL;
3987        kind = PyUnicode_WCHAR_KIND;
3988        data = PyUnicode_AS_UNICODE(unicode);
3989        assert(data != NULL);
3990    }
3991    else {
3992        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
3993        if (!unicode)
3994            return NULL;
3995        /* When the string is ASCII only, just use memcpy and return.
3996           unicode_size may be != size if there is an incomplete UTF-8
3997           sequence at the end of the ASCII block.  */
3998        if (maxchar < 128 && size == unicode_size) {
3999            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4000            return (PyObject *)unicode;
4001        }
4002        kind = PyUnicode_KIND(unicode);
4003        data = PyUnicode_DATA(unicode);
4004    }
4005    /* Unpack UTF-8 encoded data */
4006    i = 0;
4007    e = s + size;
4008    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4009
4010    while (s < e) {
4011        Py_UCS4 ch = (unsigned char)*s;
4012
4013        if (ch < 0x80) {
4014            /* Fast path for runs of ASCII characters. Given that common UTF-8
4015               input will consist of an overwhelming majority of ASCII
4016               characters, we try to optimize for this case by checking
4017               as many characters as a C 'long' can contain.
4018               First, check if we can do an aligned read, as most CPUs have
4019               a penalty for unaligned reads.
4020            */
4021            if (!((size_t) s & LONG_PTR_MASK)) {
4022                /* Help register allocation */
4023                register const char *_s = s;
4024                register Py_ssize_t _i = i;
4025                while (_s < aligned_end) {
4026                    /* Read a whole long at a time (either 4 or 8 bytes),
4027                       and do a fast unrolled copy if it only contains ASCII
4028                       characters. */
4029                    unsigned long value = *(unsigned long *) _s;
4030                    if (value & ASCII_CHAR_MASK)
4031                        break;
4032                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4033                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4034                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4035                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4036#if (SIZEOF_LONG == 8)
4037                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4038                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4039                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4040                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4041#endif
4042                    _s += SIZEOF_LONG;
4043                    _i += SIZEOF_LONG;
4044                }
4045                s = _s;
4046                i = _i;
4047                if (s == e)
4048                    break;
4049                ch = (unsigned char)*s;
4050            }
4051        }
4052
4053        if (ch < 0x80) {
4054            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4055            s++;
4056            continue;
4057        }
4058
4059        n = utf8_code_length[ch];
4060
4061        if (s + n > e) {
4062            if (consumed)
4063                break;
4064            else {
4065                errmsg = "unexpected end of data";
4066                startinpos = s-starts;
4067                endinpos = startinpos+1;
4068                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4069                    endinpos++;
4070                goto utf8Error;
4071            }
4072        }
4073
4074        switch (n) {
4075
4076        case 0:
4077            errmsg = "invalid start byte";
4078            startinpos = s-starts;
4079            endinpos = startinpos+1;
4080            goto utf8Error;
4081
4082        case 1:
4083            errmsg = "internal error";
4084            startinpos = s-starts;
4085            endinpos = startinpos+1;
4086            goto utf8Error;
4087
4088        case 2:
4089            if ((s[1] & 0xc0) != 0x80) {
4090                errmsg = "invalid continuation byte";
4091                startinpos = s-starts;
4092                endinpos = startinpos + 1;
4093                goto utf8Error;
4094            }
4095            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4096            assert ((ch > 0x007F) && (ch <= 0x07FF));
4097            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4098            break;
4099
4100        case 3:
4101            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4102               will result in surrogates in range d800-dfff. Surrogates are
4103               not valid UTF-8 so they are rejected.
4104               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4105               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4106            if ((s[1] & 0xc0) != 0x80 ||
4107                (s[2] & 0xc0) != 0x80 ||
4108                ((unsigned char)s[0] == 0xE0 &&
4109                 (unsigned char)s[1] < 0xA0) ||
4110                ((unsigned char)s[0] == 0xED &&
4111                 (unsigned char)s[1] > 0x9F)) {
4112                errmsg = "invalid continuation byte";
4113                startinpos = s-starts;
4114                endinpos = startinpos + 1;
4115
4116                /* if s[1] first two bits are 1 and 0, then the invalid
4117                   continuation byte is s[2], so increment endinpos by 1,
4118                   if not, s[1] is invalid and endinpos doesn't need to
4119                   be incremented. */
4120                if ((s[1] & 0xC0) == 0x80)
4121                    endinpos++;
4122                goto utf8Error;
4123            }
4124            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4125            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4126            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4127            break;
4128
4129        case 4:
4130            if ((s[1] & 0xc0) != 0x80 ||
4131                (s[2] & 0xc0) != 0x80 ||
4132                (s[3] & 0xc0) != 0x80 ||
4133                ((unsigned char)s[0] == 0xF0 &&
4134                 (unsigned char)s[1] < 0x90) ||
4135                ((unsigned char)s[0] == 0xF4 &&
4136                 (unsigned char)s[1] > 0x8F)) {
4137                errmsg = "invalid continuation byte";
4138                startinpos = s-starts;
4139                endinpos = startinpos + 1;
4140                if ((s[1] & 0xC0) == 0x80) {
4141                    endinpos++;
4142                    if ((s[2] & 0xC0) == 0x80)
4143                        endinpos++;
4144                }
4145                goto utf8Error;
4146            }
4147            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4148                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4149            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4150
4151            /* If the string is flexible or we have native UCS-4, write
4152               directly.. */
4153            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4154                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4155
4156            else {
4157                /* compute and append the two surrogates: */
4158
4159                /* translate from 10000..10FFFF to 0..FFFF */
4160                ch -= 0x10000;
4161
4162                /* high surrogate = top 10 bits added to D800 */
4163                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4164                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4165
4166                /* low surrogate = bottom 10 bits added to DC00 */
4167                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4168                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4169            }
4170#if SIZEOF_WCHAR_T == 2
4171            wchar_offset++;
4172#endif
4173            break;
4174        }
4175        s += n;
4176        continue;
4177
4178      utf8Error:
4179        /* If this is not yet a resizable string, make it one.. */
4180        if (kind != PyUnicode_WCHAR_KIND) {
4181            const Py_UNICODE *u;
4182            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4183            if (!new_unicode)
4184                goto onError;
4185            u = PyUnicode_AsUnicode((PyObject *)unicode);
4186            if (!u)
4187                goto onError;
4188#if SIZEOF_WCHAR_T == 2
4189            i += wchar_offset;
4190#endif
4191            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4192            Py_DECREF(unicode);
4193            unicode = new_unicode;
4194            kind = 0;
4195            data = PyUnicode_AS_UNICODE(new_unicode);
4196            assert(data != NULL);
4197        }
4198        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4199        if (unicode_decode_call_errorhandler(
4200                errors, &errorHandler,
4201                "utf8", errmsg,
4202                &starts, &e, &startinpos, &endinpos, &exc, &s,
4203                &unicode, &i, &error_outptr))
4204            goto onError;
4205        /* Update data because unicode_decode_call_errorhandler might have
4206           re-created or resized the unicode object. */
4207        data = PyUnicode_AS_UNICODE(unicode);
4208        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4209    }
4210    /* Ensure the unicode_size calculation above was correct: */
4211    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4212
4213    if (consumed)
4214        *consumed = s-starts;
4215
4216    /* Adjust length and ready string when it contained errors and
4217       is of the old resizable kind. */
4218    if (kind == PyUnicode_WCHAR_KIND) {
4219        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4220            goto onError;
4221    }
4222
4223    Py_XDECREF(errorHandler);
4224    Py_XDECREF(exc);
4225    if (_PyUnicode_READY_REPLACE(&unicode)) {
4226        Py_DECREF(unicode);
4227        return NULL;
4228    }
4229    return (PyObject *)unicode;
4230
4231  onError:
4232    Py_XDECREF(errorHandler);
4233    Py_XDECREF(exc);
4234    Py_DECREF(unicode);
4235    return NULL;
4236}
4237
4238#undef WRITE_FLEXIBLE_OR_WSTR
4239
4240#ifdef __APPLE__
4241
4242/* Simplified UTF-8 decoder using surrogateescape error handler,
4243   used to decode the command line arguments on Mac OS X. */
4244
4245wchar_t*
4246_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4247{
4248    int n;
4249    const char *e;
4250    wchar_t *unicode, *p;
4251
4252    /* Note: size will always be longer than the resulting Unicode
4253       character count */
4254    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4255        PyErr_NoMemory();
4256        return NULL;
4257    }
4258    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4259    if (!unicode)
4260        return NULL;
4261
4262    /* Unpack UTF-8 encoded data */
4263    p = unicode;
4264    e = s + size;
4265    while (s < e) {
4266        Py_UCS4 ch = (unsigned char)*s;
4267
4268        if (ch < 0x80) {
4269            *p++ = (wchar_t)ch;
4270            s++;
4271            continue;
4272        }
4273
4274        n = utf8_code_length[ch];
4275        if (s + n > e) {
4276            goto surrogateescape;
4277        }
4278
4279        switch (n) {
4280        case 0:
4281        case 1:
4282            goto surrogateescape;
4283
4284        case 2:
4285            if ((s[1] & 0xc0) != 0x80)
4286                goto surrogateescape;
4287            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4288            assert ((ch > 0x007F) && (ch <= 0x07FF));
4289            *p++ = (wchar_t)ch;
4290            break;
4291
4292        case 3:
4293            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4294               will result in surrogates in range d800-dfff. Surrogates are
4295               not valid UTF-8 so they are rejected.
4296               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4297               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4298            if ((s[1] & 0xc0) != 0x80 ||
4299                (s[2] & 0xc0) != 0x80 ||
4300                ((unsigned char)s[0] == 0xE0 &&
4301                 (unsigned char)s[1] < 0xA0) ||
4302                ((unsigned char)s[0] == 0xED &&
4303                 (unsigned char)s[1] > 0x9F)) {
4304
4305                goto surrogateescape;
4306            }
4307            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4308            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4309            *p++ = (wchar_t)ch;
4310            break;
4311
4312        case 4:
4313            if ((s[1] & 0xc0) != 0x80 ||
4314                (s[2] & 0xc0) != 0x80 ||
4315                (s[3] & 0xc0) != 0x80 ||
4316                ((unsigned char)s[0] == 0xF0 &&
4317                 (unsigned char)s[1] < 0x90) ||
4318                ((unsigned char)s[0] == 0xF4 &&
4319                 (unsigned char)s[1] > 0x8F)) {
4320                goto surrogateescape;
4321            }
4322            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4323                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4324            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4325
4326#if SIZEOF_WCHAR_T == 4
4327            *p++ = (wchar_t)ch;
4328#else
4329            /*  compute and append the two surrogates: */
4330
4331            /*  translate from 10000..10FFFF to 0..FFFF */
4332            ch -= 0x10000;
4333
4334            /*  high surrogate = top 10 bits added to D800 */
4335            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4336
4337            /*  low surrogate = bottom 10 bits added to DC00 */
4338            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4339#endif
4340            break;
4341        }
4342        s += n;
4343        continue;
4344
4345      surrogateescape:
4346        *p++ = 0xDC00 + ch;
4347        s++;
4348    }
4349    *p = L'\0';
4350    return unicode;
4351}
4352
4353#endif /* __APPLE__ */
4354
4355/* Primary internal function which creates utf8 encoded bytes objects.
4356
4357   Allocation strategy:  if the string is short, convert into a stack buffer
4358   and allocate exactly as much space needed at the end.  Else allocate the
4359   maximum possible needed (4 result bytes per Unicode character), and return
4360   the excess memory at the end.
4361*/
4362PyObject *
4363_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4364{
4365#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4366
4367    Py_ssize_t i;                /* index into s of next input byte */
4368    PyObject *result;            /* result string object */
4369    char *p;                     /* next free byte in output buffer */
4370    Py_ssize_t nallocated;      /* number of result bytes allocated */
4371    Py_ssize_t nneeded;            /* number of result bytes needed */
4372    char stackbuf[MAX_SHORT_UNICHARS * 4];
4373    PyObject *errorHandler = NULL;
4374    PyObject *exc = NULL;
4375    int kind;
4376    void *data;
4377    Py_ssize_t size;
4378    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4379#if SIZEOF_WCHAR_T == 2
4380    Py_ssize_t wchar_offset = 0;
4381#endif
4382
4383    if (!PyUnicode_Check(unicode)) {
4384        PyErr_BadArgument();
4385        return NULL;
4386    }
4387
4388    if (PyUnicode_READY(unicode) == -1)
4389        return NULL;
4390
4391    if (PyUnicode_UTF8(unicode))
4392        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4393                                         PyUnicode_UTF8_LENGTH(unicode));
4394
4395    kind = PyUnicode_KIND(unicode);
4396    data = PyUnicode_DATA(unicode);
4397    size = PyUnicode_GET_LENGTH(unicode);
4398
4399    assert(size >= 0);
4400
4401    if (size <= MAX_SHORT_UNICHARS) {
4402        /* Write into the stack buffer; nallocated can't overflow.
4403         * At the end, we'll allocate exactly as much heap space as it
4404         * turns out we need.
4405         */
4406        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4407        result = NULL;   /* will allocate after we're done */
4408        p = stackbuf;
4409    }
4410    else {
4411        /* Overallocate on the heap, and give the excess back at the end. */
4412        nallocated = size * 4;
4413        if (nallocated / 4 != size)  /* overflow! */
4414            return PyErr_NoMemory();
4415        result = PyBytes_FromStringAndSize(NULL, nallocated);
4416        if (result == NULL)
4417            return NULL;
4418        p = PyBytes_AS_STRING(result);
4419    }
4420
4421    for (i = 0; i < size;) {
4422        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4423
4424        if (ch < 0x80)
4425            /* Encode ASCII */
4426            *p++ = (char) ch;
4427
4428        else if (ch < 0x0800) {
4429            /* Encode Latin-1 */
4430            *p++ = (char)(0xc0 | (ch >> 6));
4431            *p++ = (char)(0x80 | (ch & 0x3f));
4432        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4433            Py_ssize_t newpos;
4434            PyObject *rep;
4435            Py_ssize_t repsize, k, startpos;
4436            startpos = i-1;
4437#if SIZEOF_WCHAR_T == 2
4438            startpos += wchar_offset;
4439#endif
4440            rep = unicode_encode_call_errorhandler(
4441                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4442                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4443                  &exc, startpos, startpos+1, &newpos);
4444            if (!rep)
4445                goto error;
4446
4447            if (PyBytes_Check(rep))
4448                repsize = PyBytes_GET_SIZE(rep);
4449            else
4450                repsize = PyUnicode_GET_SIZE(rep);
4451
4452            if (repsize > 4) {
4453                Py_ssize_t offset;
4454
4455                if (result == NULL)
4456                    offset = p - stackbuf;
4457                else
4458                    offset = p - PyBytes_AS_STRING(result);
4459
4460                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4461                    /* integer overflow */
4462                    PyErr_NoMemory();
4463                    goto error;
4464                }
4465                nallocated += repsize - 4;
4466                if (result != NULL) {
4467                    if (_PyBytes_Resize(&result, nallocated) < 0)
4468                        goto error;
4469                } else {
4470                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4471                    if (result == NULL)
4472                        goto error;
4473                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4474                }
4475                p = PyBytes_AS_STRING(result) + offset;
4476            }
4477
4478            if (PyBytes_Check(rep)) {
4479                char *prep = PyBytes_AS_STRING(rep);
4480                for(k = repsize; k > 0; k--)
4481                    *p++ = *prep++;
4482            } else /* rep is unicode */ {
4483                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4484                Py_UNICODE c;
4485
4486                for(k=0; k<repsize; k++) {
4487                    c = prep[k];
4488                    if (0x80 <= c) {
4489                        raise_encode_exception(&exc, "utf-8",
4490                                               PyUnicode_AS_UNICODE(unicode),
4491                                               size, i-1, i,
4492                                               "surrogates not allowed");
4493                        goto error;
4494                    }
4495                    *p++ = (char)prep[k];
4496                }
4497            }
4498            Py_DECREF(rep);
4499        } else if (ch < 0x10000) {
4500            *p++ = (char)(0xe0 | (ch >> 12));
4501            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4502            *p++ = (char)(0x80 | (ch & 0x3f));
4503        } else /* ch >= 0x10000 */ {
4504            /* Encode UCS4 Unicode ordinals */
4505            *p++ = (char)(0xf0 | (ch >> 18));
4506            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4507            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4508            *p++ = (char)(0x80 | (ch & 0x3f));
4509#if SIZEOF_WCHAR_T == 2
4510            wchar_offset++;
4511#endif
4512        }
4513    }
4514
4515    if (result == NULL) {
4516        /* This was stack allocated. */
4517        nneeded = p - stackbuf;
4518        assert(nneeded <= nallocated);
4519        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4520    }
4521    else {
4522        /* Cut back to size actually needed. */
4523        nneeded = p - PyBytes_AS_STRING(result);
4524        assert(nneeded <= nallocated);
4525        _PyBytes_Resize(&result, nneeded);
4526    }
4527
4528    Py_XDECREF(errorHandler);
4529    Py_XDECREF(exc);
4530    return result;
4531 error:
4532    Py_XDECREF(errorHandler);
4533    Py_XDECREF(exc);
4534    Py_XDECREF(result);
4535    return NULL;
4536
4537#undef MAX_SHORT_UNICHARS
4538}
4539
4540PyObject *
4541PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4542                     Py_ssize_t size,
4543                     const char *errors)
4544{
4545    PyObject *v, *unicode;
4546
4547    unicode = PyUnicode_FromUnicode(s, size);
4548    if (unicode == NULL)
4549        return NULL;
4550    v = _PyUnicode_AsUTF8String(unicode, errors);
4551    Py_DECREF(unicode);
4552    return v;
4553}
4554
4555PyObject *
4556PyUnicode_AsUTF8String(PyObject *unicode)
4557{
4558    return _PyUnicode_AsUTF8String(unicode, NULL);
4559}
4560
4561/* --- UTF-32 Codec ------------------------------------------------------- */
4562
4563PyObject *
4564PyUnicode_DecodeUTF32(const char *s,
4565                      Py_ssize_t size,
4566                      const char *errors,
4567                      int *byteorder)
4568{
4569    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4570}
4571
4572PyObject *
4573PyUnicode_DecodeUTF32Stateful(const char *s,
4574                              Py_ssize_t size,
4575                              const char *errors,
4576                              int *byteorder,
4577                              Py_ssize_t *consumed)
4578{
4579    const char *starts = s;
4580    Py_ssize_t startinpos;
4581    Py_ssize_t endinpos;
4582    Py_ssize_t outpos;
4583    PyUnicodeObject *unicode;
4584    Py_UNICODE *p;
4585#ifndef Py_UNICODE_WIDE
4586    int pairs = 0;
4587    const unsigned char *qq;
4588#else
4589    const int pairs = 0;
4590#endif
4591    const unsigned char *q, *e;
4592    int bo = 0;       /* assume native ordering by default */
4593    const char *errmsg = "";
4594    /* Offsets from q for retrieving bytes in the right order. */
4595#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4596    int iorder[] = {0, 1, 2, 3};
4597#else
4598    int iorder[] = {3, 2, 1, 0};
4599#endif
4600    PyObject *errorHandler = NULL;
4601    PyObject *exc = NULL;
4602
4603    q = (unsigned char *)s;
4604    e = q + size;
4605
4606    if (byteorder)
4607        bo = *byteorder;
4608
4609    /* Check for BOM marks (U+FEFF) in the input and adjust current
4610       byte order setting accordingly. In native mode, the leading BOM
4611       mark is skipped, in all other modes, it is copied to the output
4612       stream as-is (giving a ZWNBSP character). */
4613    if (bo == 0) {
4614        if (size >= 4) {
4615            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4616                (q[iorder[1]] << 8) | q[iorder[0]];
4617#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4618            if (bom == 0x0000FEFF) {
4619                q += 4;
4620                bo = -1;
4621            }
4622            else if (bom == 0xFFFE0000) {
4623                q += 4;
4624                bo = 1;
4625            }
4626#else
4627            if (bom == 0x0000FEFF) {
4628                q += 4;
4629                bo = 1;
4630            }
4631            else if (bom == 0xFFFE0000) {
4632                q += 4;
4633                bo = -1;
4634            }
4635#endif
4636        }
4637    }
4638
4639    if (bo == -1) {
4640        /* force LE */
4641        iorder[0] = 0;
4642        iorder[1] = 1;
4643        iorder[2] = 2;
4644        iorder[3] = 3;
4645    }
4646    else if (bo == 1) {
4647        /* force BE */
4648        iorder[0] = 3;
4649        iorder[1] = 2;
4650        iorder[2] = 1;
4651        iorder[3] = 0;
4652    }
4653
4654    /* On narrow builds we split characters outside the BMP into two
4655       codepoints => count how much extra space we need. */
4656#ifndef Py_UNICODE_WIDE
4657    for (qq = q; qq < e; qq += 4)
4658        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4659            pairs++;
4660#endif
4661
4662    /* This might be one to much, because of a BOM */
4663    unicode = _PyUnicode_New((size+3)/4+pairs);
4664    if (!unicode)
4665        return NULL;
4666    if (size == 0)
4667        return (PyObject *)unicode;
4668
4669    /* Unpack UTF-32 encoded data */
4670    p = PyUnicode_AS_UNICODE(unicode);
4671
4672    while (q < e) {
4673        Py_UCS4 ch;
4674        /* remaining bytes at the end? (size should be divisible by 4) */
4675        if (e-q<4) {
4676            if (consumed)
4677                break;
4678            errmsg = "truncated data";
4679            startinpos = ((const char *)q)-starts;
4680            endinpos = ((const char *)e)-starts;
4681            goto utf32Error;
4682            /* The remaining input chars are ignored if the callback
4683               chooses to skip the input */
4684        }
4685        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4686            (q[iorder[1]] << 8) | q[iorder[0]];
4687
4688        if (ch >= 0x110000)
4689        {
4690            errmsg = "codepoint not in range(0x110000)";
4691            startinpos = ((const char *)q)-starts;
4692            endinpos = startinpos+4;
4693            goto utf32Error;
4694        }
4695#ifndef Py_UNICODE_WIDE
4696        if (ch >= 0x10000)
4697        {
4698            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4699            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4700        }
4701        else
4702#endif
4703            *p++ = ch;
4704        q += 4;
4705        continue;
4706      utf32Error:
4707        outpos = p-PyUnicode_AS_UNICODE(unicode);
4708        if (unicode_decode_call_errorhandler(
4709                errors, &errorHandler,
4710                "utf32", errmsg,
4711                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4712                &unicode, &outpos, &p))
4713            goto onError;
4714    }
4715
4716    if (byteorder)
4717        *byteorder = bo;
4718
4719    if (consumed)
4720        *consumed = (const char *)q-starts;
4721
4722    /* Adjust length */
4723    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4724        goto onError;
4725
4726    Py_XDECREF(errorHandler);
4727    Py_XDECREF(exc);
4728    if (_PyUnicode_READY_REPLACE(&unicode)) {
4729        Py_DECREF(unicode);
4730        return NULL;
4731    }
4732    return (PyObject *)unicode;
4733
4734  onError:
4735    Py_DECREF(unicode);
4736    Py_XDECREF(errorHandler);
4737    Py_XDECREF(exc);
4738    return NULL;
4739}
4740
4741PyObject *
4742PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4743                      Py_ssize_t size,
4744                      const char *errors,
4745                      int byteorder)
4746{
4747    PyObject *v;
4748    unsigned char *p;
4749    Py_ssize_t nsize, bytesize;
4750#ifndef Py_UNICODE_WIDE
4751    Py_ssize_t i, pairs;
4752#else
4753    const int pairs = 0;
4754#endif
4755    /* Offsets from p for storing byte pairs in the right order. */
4756#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4757    int iorder[] = {0, 1, 2, 3};
4758#else
4759    int iorder[] = {3, 2, 1, 0};
4760#endif
4761
4762#define STORECHAR(CH)                           \
4763    do {                                        \
4764        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4765        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4766        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4767        p[iorder[0]] = (CH) & 0xff;             \
4768        p += 4;                                 \
4769    } while(0)
4770
4771    /* In narrow builds we can output surrogate pairs as one codepoint,
4772       so we need less space. */
4773#ifndef Py_UNICODE_WIDE
4774    for (i = pairs = 0; i < size-1; i++)
4775        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4776            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4777            pairs++;
4778#endif
4779    nsize = (size - pairs + (byteorder == 0));
4780    bytesize = nsize * 4;
4781    if (bytesize / 4 != nsize)
4782        return PyErr_NoMemory();
4783    v = PyBytes_FromStringAndSize(NULL, bytesize);
4784    if (v == NULL)
4785        return NULL;
4786
4787    p = (unsigned char *)PyBytes_AS_STRING(v);
4788    if (byteorder == 0)
4789        STORECHAR(0xFEFF);
4790    if (size == 0)
4791        goto done;
4792
4793    if (byteorder == -1) {
4794        /* force LE */
4795        iorder[0] = 0;
4796        iorder[1] = 1;
4797        iorder[2] = 2;
4798        iorder[3] = 3;
4799    }
4800    else if (byteorder == 1) {
4801        /* force BE */
4802        iorder[0] = 3;
4803        iorder[1] = 2;
4804        iorder[2] = 1;
4805        iorder[3] = 0;
4806    }
4807
4808    while (size-- > 0) {
4809        Py_UCS4 ch = *s++;
4810#ifndef Py_UNICODE_WIDE
4811        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4812            Py_UCS4 ch2 = *s;
4813            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4814                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4815                s++;
4816                size--;
4817            }
4818        }
4819#endif
4820        STORECHAR(ch);
4821    }
4822
4823  done:
4824    return v;
4825#undef STORECHAR
4826}
4827
4828PyObject *
4829PyUnicode_AsUTF32String(PyObject *unicode)
4830{
4831    if (!PyUnicode_Check(unicode)) {
4832        PyErr_BadArgument();
4833        return NULL;
4834    }
4835    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
4836                                 PyUnicode_GET_SIZE(unicode),
4837                                 NULL,
4838                                 0);
4839}
4840
4841/* --- UTF-16 Codec ------------------------------------------------------- */
4842
4843PyObject *
4844PyUnicode_DecodeUTF16(const char *s,
4845                      Py_ssize_t size,
4846                      const char *errors,
4847                      int *byteorder)
4848{
4849    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4850}
4851
4852/* Two masks for fast checking of whether a C 'long' may contain
4853   UTF16-encoded surrogate characters. This is an efficient heuristic,
4854   assuming that non-surrogate characters with a code point >= 0x8000 are
4855   rare in most input.
4856   FAST_CHAR_MASK is used when the input is in native byte ordering,
4857   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
4858*/
4859#if (SIZEOF_LONG == 8)
4860# define FAST_CHAR_MASK         0x8000800080008000L
4861# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4862#elif (SIZEOF_LONG == 4)
4863# define FAST_CHAR_MASK         0x80008000L
4864# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4865#else
4866# error C 'long' size should be either 4 or 8!
4867#endif
4868
4869PyObject *
4870PyUnicode_DecodeUTF16Stateful(const char *s,
4871                              Py_ssize_t size,
4872                              const char *errors,
4873                              int *byteorder,
4874                              Py_ssize_t *consumed)
4875{
4876    const char *starts = s;
4877    Py_ssize_t startinpos;
4878    Py_ssize_t endinpos;
4879    Py_ssize_t outpos;
4880    PyUnicodeObject *unicode;
4881    Py_UNICODE *p;
4882    const unsigned char *q, *e, *aligned_end;
4883    int bo = 0;       /* assume native ordering by default */
4884    int native_ordering = 0;
4885    const char *errmsg = "";
4886    /* Offsets from q for retrieving byte pairs in the right order. */
4887#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4888    int ihi = 1, ilo = 0;
4889#else
4890    int ihi = 0, ilo = 1;
4891#endif
4892    PyObject *errorHandler = NULL;
4893    PyObject *exc = NULL;
4894
4895    /* Note: size will always be longer than the resulting Unicode
4896       character count */
4897    unicode = _PyUnicode_New(size);
4898    if (!unicode)
4899        return NULL;
4900    if (size == 0)
4901        return (PyObject *)unicode;
4902
4903    /* Unpack UTF-16 encoded data */
4904    p = PyUnicode_AS_UNICODE(unicode);
4905    q = (unsigned char *)s;
4906    e = q + size - 1;
4907
4908    if (byteorder)
4909        bo = *byteorder;
4910
4911    /* Check for BOM marks (U+FEFF) in the input and adjust current
4912       byte order setting accordingly. In native mode, the leading BOM
4913       mark is skipped, in all other modes, it is copied to the output
4914       stream as-is (giving a ZWNBSP character). */
4915    if (bo == 0) {
4916        if (size >= 2) {
4917            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
4918#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4919            if (bom == 0xFEFF) {
4920                q += 2;
4921                bo = -1;
4922            }
4923            else if (bom == 0xFFFE) {
4924                q += 2;
4925                bo = 1;
4926            }
4927#else
4928            if (bom == 0xFEFF) {
4929                q += 2;
4930                bo = 1;
4931            }
4932            else if (bom == 0xFFFE) {
4933                q += 2;
4934                bo = -1;
4935            }
4936#endif
4937        }
4938    }
4939
4940    if (bo == -1) {
4941        /* force LE */
4942        ihi = 1;
4943        ilo = 0;
4944    }
4945    else if (bo == 1) {
4946        /* force BE */
4947        ihi = 0;
4948        ilo = 1;
4949    }
4950#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4951    native_ordering = ilo < ihi;
4952#else
4953    native_ordering = ilo > ihi;
4954#endif
4955
4956    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
4957    while (q < e) {
4958        Py_UNICODE ch;
4959        /* First check for possible aligned read of a C 'long'. Unaligned
4960           reads are more expensive, better to defer to another iteration. */
4961        if (!((size_t) q & LONG_PTR_MASK)) {
4962            /* Fast path for runs of non-surrogate chars. */
4963            register const unsigned char *_q = q;
4964            Py_UNICODE *_p = p;
4965            if (native_ordering) {
4966                /* Native ordering is simple: as long as the input cannot
4967                   possibly contain a surrogate char, do an unrolled copy
4968                   of several 16-bit code points to the target object.
4969                   The non-surrogate check is done on several input bytes
4970                   at a time (as many as a C 'long' can contain). */
4971                while (_q < aligned_end) {
4972                    unsigned long data = * (unsigned long *) _q;
4973                    if (data & FAST_CHAR_MASK)
4974                        break;
4975                    _p[0] = ((unsigned short *) _q)[0];
4976                    _p[1] = ((unsigned short *) _q)[1];
4977#if (SIZEOF_LONG == 8)
4978                    _p[2] = ((unsigned short *) _q)[2];
4979                    _p[3] = ((unsigned short *) _q)[3];
4980#endif
4981                    _q += SIZEOF_LONG;
4982                    _p += SIZEOF_LONG / 2;
4983                }
4984            }
4985            else {
4986                /* Byteswapped ordering is similar, but we must decompose
4987                   the copy bytewise, and take care of zero'ing out the
4988                   upper bytes if the target object is in 32-bit units
4989                   (that is, in UCS-4 builds). */
4990                while (_q < aligned_end) {
4991                    unsigned long data = * (unsigned long *) _q;
4992                    if (data & SWAPPED_FAST_CHAR_MASK)
4993                        break;
4994                    /* Zero upper bytes in UCS-4 builds */
4995#if (Py_UNICODE_SIZE > 2)
4996                    _p[0] = 0;
4997                    _p[1] = 0;
4998#if (SIZEOF_LONG == 8)
4999                    _p[2] = 0;
5000                    _p[3] = 0;
5001#endif
5002#endif
5003                    /* Issue #4916; UCS-4 builds on big endian machines must
5004                       fill the two last bytes of each 4-byte unit. */
5005#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5006# define OFF 2
5007#else
5008# define OFF 0
5009#endif
5010                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5011                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5012                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5013                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5014#if (SIZEOF_LONG == 8)
5015                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5016                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5017                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5018                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5019#endif
5020#undef OFF
5021                    _q += SIZEOF_LONG;
5022                    _p += SIZEOF_LONG / 2;
5023                }
5024            }
5025            p = _p;
5026            q = _q;
5027            if (q >= e)
5028                break;
5029        }
5030        ch = (q[ihi] << 8) | q[ilo];
5031
5032        q += 2;
5033
5034        if (ch < 0xD800 || ch > 0xDFFF) {
5035            *p++ = ch;
5036            continue;
5037        }
5038
5039        /* UTF-16 code pair: */
5040        if (q > e) {
5041            errmsg = "unexpected end of data";
5042            startinpos = (((const char *)q) - 2) - starts;
5043            endinpos = ((const char *)e) + 1 - starts;
5044            goto utf16Error;
5045        }
5046        if (0xD800 <= ch && ch <= 0xDBFF) {
5047            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5048            q += 2;
5049            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5050#ifndef Py_UNICODE_WIDE
5051                *p++ = ch;
5052                *p++ = ch2;
5053#else
5054                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5055#endif
5056                continue;
5057            }
5058            else {
5059                errmsg = "illegal UTF-16 surrogate";
5060                startinpos = (((const char *)q)-4)-starts;
5061                endinpos = startinpos+2;
5062                goto utf16Error;
5063            }
5064
5065        }
5066        errmsg = "illegal encoding";
5067        startinpos = (((const char *)q)-2)-starts;
5068        endinpos = startinpos+2;
5069        /* Fall through to report the error */
5070
5071      utf16Error:
5072        outpos = p - PyUnicode_AS_UNICODE(unicode);
5073        if (unicode_decode_call_errorhandler(
5074                errors,
5075                &errorHandler,
5076                "utf16", errmsg,
5077                &starts,
5078                (const char **)&e,
5079                &startinpos,
5080                &endinpos,
5081                &exc,
5082                (const char **)&q,
5083                &unicode,
5084                &outpos,
5085                &p))
5086            goto onError;
5087    }
5088    /* remaining byte at the end? (size should be even) */
5089    if (e == q) {
5090        if (!consumed) {
5091            errmsg = "truncated data";
5092            startinpos = ((const char *)q) - starts;
5093            endinpos = ((const char *)e) + 1 - starts;
5094            outpos = p - PyUnicode_AS_UNICODE(unicode);
5095            if (unicode_decode_call_errorhandler(
5096                    errors,
5097                    &errorHandler,
5098                    "utf16", errmsg,
5099                    &starts,
5100                    (const char **)&e,
5101                    &startinpos,
5102                    &endinpos,
5103                    &exc,
5104                    (const char **)&q,
5105                    &unicode,
5106                    &outpos,
5107                    &p))
5108                goto onError;
5109            /* The remaining input chars are ignored if the callback
5110               chooses to skip the input */
5111        }
5112    }
5113
5114    if (byteorder)
5115        *byteorder = bo;
5116
5117    if (consumed)
5118        *consumed = (const char *)q-starts;
5119
5120    /* Adjust length */
5121    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5122        goto onError;
5123
5124    Py_XDECREF(errorHandler);
5125    Py_XDECREF(exc);
5126    if (_PyUnicode_READY_REPLACE(&unicode)) {
5127        Py_DECREF(unicode);
5128        return NULL;
5129    }
5130    return (PyObject *)unicode;
5131
5132  onError:
5133    Py_DECREF(unicode);
5134    Py_XDECREF(errorHandler);
5135    Py_XDECREF(exc);
5136    return NULL;
5137}
5138
5139#undef FAST_CHAR_MASK
5140#undef SWAPPED_FAST_CHAR_MASK
5141
5142PyObject *
5143PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5144                      Py_ssize_t size,
5145                      const char *errors,
5146                      int byteorder)
5147{
5148    PyObject *v;
5149    unsigned char *p;
5150    Py_ssize_t nsize, bytesize;
5151#ifdef Py_UNICODE_WIDE
5152    Py_ssize_t i, pairs;
5153#else
5154    const int pairs = 0;
5155#endif
5156    /* Offsets from p for storing byte pairs in the right order. */
5157#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5158    int ihi = 1, ilo = 0;
5159#else
5160    int ihi = 0, ilo = 1;
5161#endif
5162
5163#define STORECHAR(CH)                           \
5164    do {                                        \
5165        p[ihi] = ((CH) >> 8) & 0xff;            \
5166        p[ilo] = (CH) & 0xff;                   \
5167        p += 2;                                 \
5168    } while(0)
5169
5170#ifdef Py_UNICODE_WIDE
5171    for (i = pairs = 0; i < size; i++)
5172        if (s[i] >= 0x10000)
5173            pairs++;
5174#endif
5175    /* 2 * (size + pairs + (byteorder == 0)) */
5176    if (size > PY_SSIZE_T_MAX ||
5177        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5178        return PyErr_NoMemory();
5179    nsize = size + pairs + (byteorder == 0);
5180    bytesize = nsize * 2;
5181    if (bytesize / 2 != nsize)
5182        return PyErr_NoMemory();
5183    v = PyBytes_FromStringAndSize(NULL, bytesize);
5184    if (v == NULL)
5185        return NULL;
5186
5187    p = (unsigned char *)PyBytes_AS_STRING(v);
5188    if (byteorder == 0)
5189        STORECHAR(0xFEFF);
5190    if (size == 0)
5191        goto done;
5192
5193    if (byteorder == -1) {
5194        /* force LE */
5195        ihi = 1;
5196        ilo = 0;
5197    }
5198    else if (byteorder == 1) {
5199        /* force BE */
5200        ihi = 0;
5201        ilo = 1;
5202    }
5203
5204    while (size-- > 0) {
5205        Py_UNICODE ch = *s++;
5206        Py_UNICODE ch2 = 0;
5207#ifdef Py_UNICODE_WIDE
5208        if (ch >= 0x10000) {
5209            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5210            ch  = 0xD800 | ((ch-0x10000) >> 10);
5211        }
5212#endif
5213        STORECHAR(ch);
5214        if (ch2)
5215            STORECHAR(ch2);
5216    }
5217
5218  done:
5219    return v;
5220#undef STORECHAR
5221}
5222
5223PyObject *
5224PyUnicode_AsUTF16String(PyObject *unicode)
5225{
5226    if (!PyUnicode_Check(unicode)) {
5227        PyErr_BadArgument();
5228        return NULL;
5229    }
5230    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5231                                 PyUnicode_GET_SIZE(unicode),
5232                                 NULL,
5233                                 0);
5234}
5235
5236/* --- Unicode Escape Codec ----------------------------------------------- */
5237
5238/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5239   if all the escapes in the string make it still a valid ASCII string.
5240   Returns -1 if any escapes were found which cause the string to
5241   pop out of ASCII range.  Otherwise returns the length of the
5242   required buffer to hold the string.
5243   */
5244Py_ssize_t
5245length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5246{
5247    const unsigned char *p = (const unsigned char *)s;
5248    const unsigned char *end = p + size;
5249    Py_ssize_t length = 0;
5250
5251    if (size < 0)
5252        return -1;
5253
5254    for (; p < end; ++p) {
5255        if (*p > 127) {
5256            /* Non-ASCII */
5257            return -1;
5258        }
5259        else if (*p != '\\') {
5260            /* Normal character */
5261            ++length;
5262        }
5263        else {
5264            /* Backslash-escape, check next char */
5265            ++p;
5266            /* Escape sequence reaches till end of string or
5267               non-ASCII follow-up. */
5268            if (p >= end || *p > 127)
5269                return -1;
5270            switch (*p) {
5271            case '\n':
5272                /* backslash + \n result in zero characters */
5273                break;
5274            case '\\': case '\'': case '\"':
5275            case 'b': case 'f': case 't':
5276            case 'n': case 'r': case 'v': case 'a':
5277                ++length;
5278                break;
5279            case '0': case '1': case '2': case '3':
5280            case '4': case '5': case '6': case '7':
5281            case 'x': case 'u': case 'U': case 'N':
5282                /* these do not guarantee ASCII characters */
5283                return -1;
5284            default:
5285                /* count the backslash + the other character */
5286                length += 2;
5287            }
5288        }
5289    }
5290    return length;
5291}
5292
/* Similar to PyUnicode_WRITE but with only two layouts: when `kind` is
   PyUnicode_WCHAR_KIND the buffer is written as an array of Py_UNICODE,
   otherwise it is treated as a byte buffer (ASCII string) and `value` is
   converted to unsigned char.
   Only the selected branch runs, so `index` is evaluated exactly once. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at `index` in a Py_UNICODE buffer.
   NOTE: the assertion refers to a variable named `kind` that must be in
   scope where the macro is expanded; it is not a macro parameter.
   The whole expansion is parenthesised so that the comma expression stays
   one unit when the macro is used inside a larger expression. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5306
5307
/* File-local pointer to a _PyUnicode_Name_CAPI table; it starts out NULL.
   NOTE(review): presumably filled in on first use by the \N{...} escape
   handling and declared in ucnhash.h -- confirm against the code that
   assigns it. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5309
5310PyObject *
5311PyUnicode_DecodeUnicodeEscape(const char *s,
5312                              Py_ssize_t size,
5313                              const char *errors)
5314{
5315    const char *starts = s;
5316    Py_ssize_t startinpos;
5317    Py_ssize_t endinpos;
5318    int j;
5319    PyUnicodeObject *v;
5320    Py_UNICODE *p;
5321    const char *end;
5322    char* message;
5323    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5324    PyObject *errorHandler = NULL;
5325    PyObject *exc = NULL;
5326    Py_ssize_t ascii_length;
5327    Py_ssize_t i;
5328    int kind;
5329    void *data;
5330
5331    ascii_length = length_of_escaped_ascii_string(s, size);
5332
5333    /* After length_of_escaped_ascii_string() there are two alternatives,
5334       either the string is pure ASCII with named escapes like \n, etc.
5335       and we determined it's exact size (common case)
5336       or it contains \x, \u, ... escape sequences.  then we create a
5337       legacy wchar string and resize it at the end of this function. */
5338    if (ascii_length >= 0) {
5339        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5340        if (!v)
5341            goto onError;
5342        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5343        kind = PyUnicode_1BYTE_KIND;
5344        data = PyUnicode_DATA(v);
5345    }
5346    else {
5347        /* Escaped strings will always be longer than the resulting
5348           Unicode string, so we start with size here and then reduce the
5349           length after conversion to the true value.
5350           (but if the error callback returns a long replacement string
5351           we'll have to allocate more space) */
5352        v = _PyUnicode_New(size);
5353        if (!v)
5354            goto onError;
5355        kind = PyUnicode_WCHAR_KIND;
5356        data = PyUnicode_AS_UNICODE(v);
5357    }
5358
5359    if (size == 0)
5360        return (PyObject *)v;
5361    i = 0;
5362    end = s + size;
5363
5364    while (s < end) {
5365        unsigned char c;
5366        Py_UNICODE x;
5367        int digits;
5368
5369        if (kind == PyUnicode_WCHAR_KIND) {
5370            assert(i < _PyUnicode_WSTR_LENGTH(v));
5371        }
5372        else {
5373            /* The only case in which i == ascii_length is a backslash
5374               followed by a newline. */
5375            assert(i <= ascii_length);
5376        }
5377
5378        /* Non-escape characters are interpreted as Unicode ordinals */
5379        if (*s != '\\') {
5380            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5381            continue;
5382        }
5383
5384        startinpos = s-starts;
5385        /* \ - Escapes */
5386        s++;
5387        c = *s++;
5388        if (s > end)
5389            c = '\0'; /* Invalid after \ */
5390
5391        if (kind == PyUnicode_WCHAR_KIND) {
5392            assert(i < _PyUnicode_WSTR_LENGTH(v));
5393        }
5394        else {
5395            /* The only case in which i == ascii_length is a backslash
5396               followed by a newline. */
5397            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5398        }
5399
5400        switch (c) {
5401
5402            /* \x escapes */
5403        case '\n': break;
5404        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5405        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5406        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5407        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5408        /* FF */
5409        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5410        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5411        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5412        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5413        /* VT */
5414        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5415        /* BEL, not classic C */
5416        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5417
5418            /* \OOO (octal) escapes */
5419        case '0': case '1': case '2': case '3':
5420        case '4': case '5': case '6': case '7':
5421            x = s[-1] - '0';
5422            if (s < end && '0' <= *s && *s <= '7') {
5423                x = (x<<3) + *s++ - '0';
5424                if (s < end && '0' <= *s && *s <= '7')
5425                    x = (x<<3) + *s++ - '0';
5426            }
5427            WRITE_WSTR(data, i++, x);
5428            break;
5429
5430            /* hex escapes */
5431            /* \xXX */
5432        case 'x':
5433            digits = 2;
5434            message = "truncated \\xXX escape";
5435            goto hexescape;
5436
5437            /* \uXXXX */
5438        case 'u':
5439            digits = 4;
5440            message = "truncated \\uXXXX escape";
5441            goto hexescape;
5442
5443            /* \UXXXXXXXX */
5444        case 'U':
5445            digits = 8;
5446            message = "truncated \\UXXXXXXXX escape";
5447        hexescape:
5448            chr = 0;
5449            p = PyUnicode_AS_UNICODE(v) + i;
5450            if (s+digits>end) {
5451                endinpos = size;
5452                if (unicode_decode_call_errorhandler(
5453                        errors, &errorHandler,
5454                        "unicodeescape", "end of string in escape sequence",
5455                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5456                        &v, &i, &p))
5457                    goto onError;
5458                data = PyUnicode_AS_UNICODE(v);
5459                goto nextByte;
5460            }
5461            for (j = 0; j < digits; ++j) {
5462                c = (unsigned char) s[j];
5463                if (!Py_ISXDIGIT(c)) {
5464                    endinpos = (s+j+1)-starts;
5465                    p = PyUnicode_AS_UNICODE(v) + i;
5466                    if (unicode_decode_call_errorhandler(
5467                            errors, &errorHandler,
5468                            "unicodeescape", message,
5469                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5470                            &v, &i, &p))
5471                        goto onError;
5472                    data = PyUnicode_AS_UNICODE(v);
5473                    goto nextByte;
5474                }
5475                chr = (chr<<4) & ~0xF;
5476                if (c >= '0' && c <= '9')
5477                    chr += c - '0';
5478                else if (c >= 'a' && c <= 'f')
5479                    chr += 10 + c - 'a';
5480                else
5481                    chr += 10 + c - 'A';
5482            }
5483            s += j;
5484            if (chr == 0xffffffff && PyErr_Occurred())
5485                /* _decoding_error will have already written into the
5486                   target buffer. */
5487                break;
5488        store:
5489            /* when we get here, chr is a 32-bit unicode character */
5490            if (chr <= 0xffff)
5491                /* UCS-2 character */
5492                WRITE_WSTR(data, i++, chr);
5493            else if (chr <= 0x10ffff) {
5494                /* UCS-4 character. Either store directly, or as
5495                   surrogate pair. */
5496#ifdef Py_UNICODE_WIDE
5497                WRITE_WSTR(data, i++, chr);
5498#else
5499                chr -= 0x10000L;
5500                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5501                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5502#endif
5503            } else {
5504                endinpos = s-starts;
5505                p = PyUnicode_AS_UNICODE(v) + i;
5506                if (unicode_decode_call_errorhandler(
5507                        errors, &errorHandler,
5508                        "unicodeescape", "illegal Unicode character",
5509                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5510                        &v, &i, &p))
5511                    goto onError;
5512                data = PyUnicode_AS_UNICODE(v);
5513            }
5514            break;
5515
5516            /* \N{name} */
5517        case 'N':
5518            message = "malformed \\N character escape";
5519            if (ucnhash_CAPI == NULL) {
5520                /* load the unicode data module */
5521                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5522                                                PyUnicodeData_CAPSULE_NAME, 1);
5523                if (ucnhash_CAPI == NULL)
5524                    goto ucnhashError;
5525            }
5526            if (*s == '{') {
5527                const char *start = s+1;
5528                /* look for the closing brace */
5529                while (*s != '}' && s < end)
5530                    s++;
5531                if (s > start && s < end && *s == '}') {
5532                    /* found a name.  look it up in the unicode database */
5533                    message = "unknown Unicode character name";
5534                    s++;
5535                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5536                                              &chr))
5537                        goto store;
5538                }
5539            }
5540            endinpos = s-starts;
5541            p = PyUnicode_AS_UNICODE(v) + i;
5542            if (unicode_decode_call_errorhandler(
5543                    errors, &errorHandler,
5544                    "unicodeescape", message,
5545                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5546                    &v, &i, &p))
5547                goto onError;
5548            data = PyUnicode_AS_UNICODE(v);
5549            break;
5550
5551        default:
5552            if (s > end) {
5553                assert(kind == PyUnicode_WCHAR_KIND);
5554                message = "\\ at end of string";
5555                s--;
5556                endinpos = s-starts;
5557                p = PyUnicode_AS_UNICODE(v) + i;
5558                if (unicode_decode_call_errorhandler(
5559                        errors, &errorHandler,
5560                        "unicodeescape", message,
5561                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5562                        &v, &i, &p))
5563                    goto onError;
5564                data = PyUnicode_AS_UNICODE(v);
5565            }
5566            else {
5567                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5568                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5569            }
5570            break;
5571        }
5572      nextByte:
5573        ;
5574    }
5575    /* Ensure the length prediction worked in case of ASCII strings */
5576    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5577
5578    if (kind == PyUnicode_WCHAR_KIND)
5579    {
5580        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5581            goto onError;
5582    }
5583    Py_XDECREF(errorHandler);
5584    Py_XDECREF(exc);
5585    if (_PyUnicode_READY_REPLACE(&v)) {
5586        Py_DECREF(v);
5587        return NULL;
5588    }
5589    return (PyObject *)v;
5590
5591  ucnhashError:
5592    PyErr_SetString(
5593        PyExc_UnicodeError,
5594        "\\N escapes not supported (can't load unicodedata module)"
5595        );
5596    Py_XDECREF(v);
5597    Py_XDECREF(errorHandler);
5598    Py_XDECREF(exc);
5599    return NULL;
5600
5601  onError:
5602    Py_XDECREF(v);
5603    Py_XDECREF(errorHandler);
5604    Py_XDECREF(exc);
5605    return NULL;
5606}
5607
5608#undef WRITE_ASCII_OR_WSTR
5609#undef WRITE_WSTR
5610
5611/* Return a Unicode-Escape string version of the Unicode object.
5612
5613   If quotes is true, the string is enclosed in u"" or u'' quotes as
5614   appropriate.
5615
5616*/
5617
5618static const char *hexdigits = "0123456789abcdef";
5619
5620PyObject *
5621PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5622                              Py_ssize_t size)
5623{
5624    PyObject *repr;
5625    char *p;
5626
5627#ifdef Py_UNICODE_WIDE
5628    const Py_ssize_t expandsize = 10;
5629#else
5630    const Py_ssize_t expandsize = 6;
5631#endif
5632
5633    /* XXX(nnorwitz): rather than over-allocating, it would be
5634       better to choose a different scheme.  Perhaps scan the
5635       first N-chars of the string and allocate based on that size.
5636    */
5637    /* Initial allocation is based on the longest-possible unichr
5638       escape.
5639
5640       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5641       unichr, so in this case it's the longest unichr escape. In
5642       narrow (UTF-16) builds this is five chars per source unichr
5643       since there are two unichrs in the surrogate pair, so in narrow
5644       (UTF-16) builds it's not the longest unichr escape.
5645
5646       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5647       so in the narrow (UTF-16) build case it's the longest unichr
5648       escape.
5649    */
5650
5651    if (size == 0)
5652        return PyBytes_FromStringAndSize(NULL, 0);
5653
5654    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5655        return PyErr_NoMemory();
5656
5657    repr = PyBytes_FromStringAndSize(NULL,
5658                                     2
5659                                     + expandsize*size
5660                                     + 1);
5661    if (repr == NULL)
5662        return NULL;
5663
5664    p = PyBytes_AS_STRING(repr);
5665
5666    while (size-- > 0) {
5667        Py_UNICODE ch = *s++;
5668
5669        /* Escape backslashes */
5670        if (ch == '\\') {
5671            *p++ = '\\';
5672            *p++ = (char) ch;
5673            continue;
5674        }
5675
5676#ifdef Py_UNICODE_WIDE
5677        /* Map 21-bit characters to '\U00xxxxxx' */
5678        else if (ch >= 0x10000) {
5679            *p++ = '\\';
5680            *p++ = 'U';
5681            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5682            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5683            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5684            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5685            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5686            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5687            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5688            *p++ = hexdigits[ch & 0x0000000F];
5689            continue;
5690        }
5691#else
5692        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5693        else if (ch >= 0xD800 && ch < 0xDC00) {
5694            Py_UNICODE ch2;
5695            Py_UCS4 ucs;
5696
5697            ch2 = *s++;
5698            size--;
5699            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5700                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5701                *p++ = '\\';
5702                *p++ = 'U';
5703                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5704                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5705                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5706                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5707                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5708                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5709                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5710                *p++ = hexdigits[ucs & 0x0000000F];
5711                continue;
5712            }
5713            /* Fall through: isolated surrogates are copied as-is */
5714            s--;
5715            size++;
5716        }
5717#endif
5718
5719        /* Map 16-bit characters to '\uxxxx' */
5720        if (ch >= 256) {
5721            *p++ = '\\';
5722            *p++ = 'u';
5723            *p++ = hexdigits[(ch >> 12) & 0x000F];
5724            *p++ = hexdigits[(ch >> 8) & 0x000F];
5725            *p++ = hexdigits[(ch >> 4) & 0x000F];
5726            *p++ = hexdigits[ch & 0x000F];
5727        }
5728
5729        /* Map special whitespace to '\t', \n', '\r' */
5730        else if (ch == '\t') {
5731            *p++ = '\\';
5732            *p++ = 't';
5733        }
5734        else if (ch == '\n') {
5735            *p++ = '\\';
5736            *p++ = 'n';
5737        }
5738        else if (ch == '\r') {
5739            *p++ = '\\';
5740            *p++ = 'r';
5741        }
5742
5743        /* Map non-printable US ASCII to '\xhh' */
5744        else if (ch < ' ' || ch >= 0x7F) {
5745            *p++ = '\\';
5746            *p++ = 'x';
5747            *p++ = hexdigits[(ch >> 4) & 0x000F];
5748            *p++ = hexdigits[ch & 0x000F];
5749        }
5750
5751        /* Copy everything else as-is */
5752        else
5753            *p++ = (char) ch;
5754    }
5755
5756    assert(p - PyBytes_AS_STRING(repr) > 0);
5757    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5758        return NULL;
5759    return repr;
5760}
5761
5762PyObject *
5763PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5764{
5765    PyObject *s;
5766    if (!PyUnicode_Check(unicode)) {
5767        PyErr_BadArgument();
5768        return NULL;
5769    }
5770    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5771                                      PyUnicode_GET_SIZE(unicode));
5772    return s;
5773}
5774
5775/* --- Raw Unicode Escape Codec ------------------------------------------- */
5776
5777PyObject *
5778PyUnicode_DecodeRawUnicodeEscape(const char *s,
5779                                 Py_ssize_t size,
5780                                 const char *errors)
5781{
5782    const char *starts = s;
5783    Py_ssize_t startinpos;
5784    Py_ssize_t endinpos;
5785    Py_ssize_t outpos;
5786    PyUnicodeObject *v;
5787    Py_UNICODE *p;
5788    const char *end;
5789    const char *bs;
5790    PyObject *errorHandler = NULL;
5791    PyObject *exc = NULL;
5792
5793    /* Escaped strings will always be longer than the resulting
5794       Unicode string, so we start with size here and then reduce the
5795       length after conversion to the true value. (But decoding error
5796       handler might have to resize the string) */
5797    v = _PyUnicode_New(size);
5798    if (v == NULL)
5799        goto onError;
5800    if (size == 0)
5801        return (PyObject *)v;
5802    p = PyUnicode_AS_UNICODE(v);
5803    end = s + size;
5804    while (s < end) {
5805        unsigned char c;
5806        Py_UCS4 x;
5807        int i;
5808        int count;
5809
5810        /* Non-escape characters are interpreted as Unicode ordinals */
5811        if (*s != '\\') {
5812            *p++ = (unsigned char)*s++;
5813            continue;
5814        }
5815        startinpos = s-starts;
5816
5817        /* \u-escapes are only interpreted iff the number of leading
5818           backslashes if odd */
5819        bs = s;
5820        for (;s < end;) {
5821            if (*s != '\\')
5822                break;
5823            *p++ = (unsigned char)*s++;
5824        }
5825        if (((s - bs) & 1) == 0 ||
5826            s >= end ||
5827            (*s != 'u' && *s != 'U')) {
5828            continue;
5829        }
5830        p--;
5831        count = *s=='u' ? 4 : 8;
5832        s++;
5833
5834        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5835        outpos = p-PyUnicode_AS_UNICODE(v);
5836        for (x = 0, i = 0; i < count; ++i, ++s) {
5837            c = (unsigned char)*s;
5838            if (!Py_ISXDIGIT(c)) {
5839                endinpos = s-starts;
5840                if (unicode_decode_call_errorhandler(
5841                        errors, &errorHandler,
5842                        "rawunicodeescape", "truncated \\uXXXX",
5843                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5844                        &v, &outpos, &p))
5845                    goto onError;
5846                goto nextByte;
5847            }
5848            x = (x<<4) & ~0xF;
5849            if (c >= '0' && c <= '9')
5850                x += c - '0';
5851            else if (c >= 'a' && c <= 'f')
5852                x += 10 + c - 'a';
5853            else
5854                x += 10 + c - 'A';
5855        }
5856        if (x <= 0xffff)
5857            /* UCS-2 character */
5858            *p++ = (Py_UNICODE) x;
5859        else if (x <= 0x10ffff) {
5860            /* UCS-4 character. Either store directly, or as
5861               surrogate pair. */
5862#ifdef Py_UNICODE_WIDE
5863            *p++ = (Py_UNICODE) x;
5864#else
5865            x -= 0x10000L;
5866            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5867            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
5868#endif
5869        } else {
5870            endinpos = s-starts;
5871            outpos = p-PyUnicode_AS_UNICODE(v);
5872            if (unicode_decode_call_errorhandler(
5873                    errors, &errorHandler,
5874                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5875                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5876                    &v, &outpos, &p))
5877                goto onError;
5878        }
5879      nextByte:
5880        ;
5881    }
5882    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5883        goto onError;
5884    Py_XDECREF(errorHandler);
5885    Py_XDECREF(exc);
5886    if (_PyUnicode_READY_REPLACE(&v)) {
5887        Py_DECREF(v);
5888        return NULL;
5889    }
5890    return (PyObject *)v;
5891
5892  onError:
5893    Py_XDECREF(v);
5894    Py_XDECREF(errorHandler);
5895    Py_XDECREF(exc);
5896    return NULL;
5897}
5898
5899PyObject *
5900PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5901                                 Py_ssize_t size)
5902{
5903    PyObject *repr;
5904    char *p;
5905    char *q;
5906
5907#ifdef Py_UNICODE_WIDE
5908    const Py_ssize_t expandsize = 10;
5909#else
5910    const Py_ssize_t expandsize = 6;
5911#endif
5912
5913    if (size > PY_SSIZE_T_MAX / expandsize)
5914        return PyErr_NoMemory();
5915
5916    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
5917    if (repr == NULL)
5918        return NULL;
5919    if (size == 0)
5920        return repr;
5921
5922    p = q = PyBytes_AS_STRING(repr);
5923    while (size-- > 0) {
5924        Py_UNICODE ch = *s++;
5925#ifdef Py_UNICODE_WIDE
5926        /* Map 32-bit characters to '\Uxxxxxxxx' */
5927        if (ch >= 0x10000) {
5928            *p++ = '\\';
5929            *p++ = 'U';
5930            *p++ = hexdigits[(ch >> 28) & 0xf];
5931            *p++ = hexdigits[(ch >> 24) & 0xf];
5932            *p++ = hexdigits[(ch >> 20) & 0xf];
5933            *p++ = hexdigits[(ch >> 16) & 0xf];
5934            *p++ = hexdigits[(ch >> 12) & 0xf];
5935            *p++ = hexdigits[(ch >> 8) & 0xf];
5936            *p++ = hexdigits[(ch >> 4) & 0xf];
5937            *p++ = hexdigits[ch & 15];
5938        }
5939        else
5940#else
5941            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5942            if (ch >= 0xD800 && ch < 0xDC00) {
5943                Py_UNICODE ch2;
5944                Py_UCS4 ucs;
5945
5946                ch2 = *s++;
5947                size--;
5948                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5949                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5950                    *p++ = '\\';
5951                    *p++ = 'U';
5952                    *p++ = hexdigits[(ucs >> 28) & 0xf];
5953                    *p++ = hexdigits[(ucs >> 24) & 0xf];
5954                    *p++ = hexdigits[(ucs >> 20) & 0xf];
5955                    *p++ = hexdigits[(ucs >> 16) & 0xf];
5956                    *p++ = hexdigits[(ucs >> 12) & 0xf];
5957                    *p++ = hexdigits[(ucs >> 8) & 0xf];
5958                    *p++ = hexdigits[(ucs >> 4) & 0xf];
5959                    *p++ = hexdigits[ucs & 0xf];
5960                    continue;
5961                }
5962                /* Fall through: isolated surrogates are copied as-is */
5963                s--;
5964                size++;
5965            }
5966#endif
5967        /* Map 16-bit characters to '\uxxxx' */
5968        if (ch >= 256) {
5969            *p++ = '\\';
5970            *p++ = 'u';
5971            *p++ = hexdigits[(ch >> 12) & 0xf];
5972            *p++ = hexdigits[(ch >> 8) & 0xf];
5973            *p++ = hexdigits[(ch >> 4) & 0xf];
5974            *p++ = hexdigits[ch & 15];
5975        }
5976        /* Copy everything else as-is */
5977        else
5978            *p++ = (char) ch;
5979    }
5980    size = p - q;
5981
5982    assert(size > 0);
5983    if (_PyBytes_Resize(&repr, size) < 0)
5984        return NULL;
5985    return repr;
5986}
5987
5988PyObject *
5989PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5990{
5991    PyObject *s;
5992    if (!PyUnicode_Check(unicode)) {
5993        PyErr_BadArgument();
5994        return NULL;
5995    }
5996    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5997                                         PyUnicode_GET_SIZE(unicode));
5998
5999    return s;
6000}
6001
6002/* --- Unicode Internal Codec ------------------------------------------- */
6003
6004PyObject *
6005_PyUnicode_DecodeUnicodeInternal(const char *s,
6006                                 Py_ssize_t size,
6007                                 const char *errors)
6008{
6009    const char *starts = s;
6010    Py_ssize_t startinpos;
6011    Py_ssize_t endinpos;
6012    Py_ssize_t outpos;
6013    PyUnicodeObject *v;
6014    Py_UNICODE *p;
6015    const char *end;
6016    const char *reason;
6017    PyObject *errorHandler = NULL;
6018    PyObject *exc = NULL;
6019
6020#ifdef Py_UNICODE_WIDE
6021    Py_UNICODE unimax = PyUnicode_GetMax();
6022#endif
6023
6024    /* XXX overflow detection missing */
6025    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6026    if (v == NULL)
6027        goto onError;
6028    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6029       as string was created with the old API. */
6030    if (PyUnicode_GET_SIZE(v) == 0)
6031        return (PyObject *)v;
6032    p = PyUnicode_AS_UNICODE(v);
6033    end = s + size;
6034
6035    while (s < end) {
6036        memcpy(p, s, sizeof(Py_UNICODE));
6037        /* We have to sanity check the raw data, otherwise doom looms for
6038           some malformed UCS-4 data. */
6039        if (
6040#ifdef Py_UNICODE_WIDE
6041            *p > unimax || *p < 0 ||
6042#endif
6043            end-s < Py_UNICODE_SIZE
6044            )
6045        {
6046            startinpos = s - starts;
6047            if (end-s < Py_UNICODE_SIZE) {
6048                endinpos = end-starts;
6049                reason = "truncated input";
6050            }
6051            else {
6052                endinpos = s - starts + Py_UNICODE_SIZE;
6053                reason = "illegal code point (> 0x10FFFF)";
6054            }
6055            outpos = p - PyUnicode_AS_UNICODE(v);
6056            if (unicode_decode_call_errorhandler(
6057                    errors, &errorHandler,
6058                    "unicode_internal", reason,
6059                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6060                    &v, &outpos, &p)) {
6061                goto onError;
6062            }
6063        }
6064        else {
6065            p++;
6066            s += Py_UNICODE_SIZE;
6067        }
6068    }
6069
6070    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6071        goto onError;
6072    Py_XDECREF(errorHandler);
6073    Py_XDECREF(exc);
6074    if (_PyUnicode_READY_REPLACE(&v)) {
6075        Py_DECREF(v);
6076        return NULL;
6077    }
6078    return (PyObject *)v;
6079
6080  onError:
6081    Py_XDECREF(v);
6082    Py_XDECREF(errorHandler);
6083    Py_XDECREF(exc);
6084    return NULL;
6085}
6086
6087/* --- Latin-1 Codec ------------------------------------------------------ */
6088
6089PyObject *
6090PyUnicode_DecodeLatin1(const char *s,
6091                       Py_ssize_t size,
6092                       const char *errors)
6093{
6094    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6095    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6096}
6097
6098/* create or adjust a UnicodeEncodeError */
6099static void
6100make_encode_exception(PyObject **exceptionObject,
6101                      const char *encoding,
6102                      const Py_UNICODE *unicode, Py_ssize_t size,
6103                      Py_ssize_t startpos, Py_ssize_t endpos,
6104                      const char *reason)
6105{
6106    if (*exceptionObject == NULL) {
6107        *exceptionObject = PyUnicodeEncodeError_Create(
6108            encoding, unicode, size, startpos, endpos, reason);
6109    }
6110    else {
6111        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6112            goto onError;
6113        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6114            goto onError;
6115        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6116            goto onError;
6117        return;
6118      onError:
6119        Py_DECREF(*exceptionObject);
6120        *exceptionObject = NULL;
6121    }
6122}
6123
6124/* raises a UnicodeEncodeError */
6125static void
6126raise_encode_exception(PyObject **exceptionObject,
6127                       const char *encoding,
6128                       const Py_UNICODE *unicode, Py_ssize_t size,
6129                       Py_ssize_t startpos, Py_ssize_t endpos,
6130                       const char *reason)
6131{
6132    make_encode_exception(exceptionObject,
6133                          encoding, unicode, size, startpos, endpos, reason);
6134    if (*exceptionObject != NULL)
6135        PyCodec_StrictErrors(*exceptionObject);
6136}
6137
6138/* error handling callback helper:
6139   build arguments, call the callback and check the arguments,
6140   put the result into newpos and return the replacement string, which
6141   has to be freed by the caller */
6142static PyObject *
6143unicode_encode_call_errorhandler(const char *errors,
6144                                 PyObject **errorHandler,
6145                                 const char *encoding, const char *reason,
6146                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6147                                 Py_ssize_t startpos, Py_ssize_t endpos,
6148                                 Py_ssize_t *newpos)
6149{
6150    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6151
6152    PyObject *restuple;
6153    PyObject *resunicode;
6154
6155    if (*errorHandler == NULL) {
6156        *errorHandler = PyCodec_LookupError(errors);
6157        if (*errorHandler == NULL)
6158            return NULL;
6159    }
6160
6161    make_encode_exception(exceptionObject,
6162                          encoding, unicode, size, startpos, endpos, reason);
6163    if (*exceptionObject == NULL)
6164        return NULL;
6165
6166    restuple = PyObject_CallFunctionObjArgs(
6167        *errorHandler, *exceptionObject, NULL);
6168    if (restuple == NULL)
6169        return NULL;
6170    if (!PyTuple_Check(restuple)) {
6171        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6172        Py_DECREF(restuple);
6173        return NULL;
6174    }
6175    if (!PyArg_ParseTuple(restuple, argparse,
6176                          &resunicode, newpos)) {
6177        Py_DECREF(restuple);
6178        return NULL;
6179    }
6180    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6181        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6182        Py_DECREF(restuple);
6183        return NULL;
6184    }
6185    if (*newpos<0)
6186        *newpos = size+*newpos;
6187    if (*newpos<0 || *newpos>size) {
6188        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6189        Py_DECREF(restuple);
6190        return NULL;
6191    }
6192    Py_INCREF(resunicode);
6193    Py_DECREF(restuple);
6194    return resunicode;
6195}
6196
6197static PyObject *
6198unicode_encode_ucs1(const Py_UNICODE *p,
6199                    Py_ssize_t size,
6200                    const char *errors,
6201                    int limit)
6202{
6203    /* output object */
6204    PyObject *res;
6205    /* pointers to the beginning and end+1 of input */
6206    const Py_UNICODE *startp = p;
6207    const Py_UNICODE *endp = p + size;
6208    /* pointer to the beginning of the unencodable characters */
6209    /* const Py_UNICODE *badp = NULL; */
6210    /* pointer into the output */
6211    char *str;
6212    /* current output position */
6213    Py_ssize_t ressize;
6214    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6215    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6216    PyObject *errorHandler = NULL;
6217    PyObject *exc = NULL;
6218    /* the following variable is used for caching string comparisons
6219     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6220    int known_errorHandler = -1;
6221
6222    /* allocate enough for a simple encoding without
6223       replacements, if we need more, we'll resize */
6224    if (size == 0)
6225        return PyBytes_FromStringAndSize(NULL, 0);
6226    res = PyBytes_FromStringAndSize(NULL, size);
6227    if (res == NULL)
6228        return NULL;
6229    str = PyBytes_AS_STRING(res);
6230    ressize = size;
6231
6232    while (p<endp) {
6233        Py_UNICODE c = *p;
6234
6235        /* can we encode this? */
6236        if (c<limit) {
6237            /* no overflow check, because we know that the space is enough */
6238            *str++ = (char)c;
6239            ++p;
6240        }
6241        else {
6242            Py_ssize_t unicodepos = p-startp;
6243            Py_ssize_t requiredsize;
6244            PyObject *repunicode;
6245            Py_ssize_t repsize;
6246            Py_ssize_t newpos;
6247            Py_ssize_t respos;
6248            Py_UNICODE *uni2;
6249            /* startpos for collecting unencodable chars */
6250            const Py_UNICODE *collstart = p;
6251            const Py_UNICODE *collend = p;
6252            /* find all unecodable characters */
6253            while ((collend < endp) && ((*collend)>=limit))
6254                ++collend;
6255            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6256            if (known_errorHandler==-1) {
6257                if ((errors==NULL) || (!strcmp(errors, "strict")))
6258                    known_errorHandler = 1;
6259                else if (!strcmp(errors, "replace"))
6260                    known_errorHandler = 2;
6261                else if (!strcmp(errors, "ignore"))
6262                    known_errorHandler = 3;
6263                else if (!strcmp(errors, "xmlcharrefreplace"))
6264                    known_errorHandler = 4;
6265                else
6266                    known_errorHandler = 0;
6267            }
6268            switch (known_errorHandler) {
6269            case 1: /* strict */
6270                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6271                goto onError;
6272            case 2: /* replace */
6273                while (collstart++<collend)
6274                    *str++ = '?'; /* fall through */
6275            case 3: /* ignore */
6276                p = collend;
6277                break;
6278            case 4: /* xmlcharrefreplace */
6279                respos = str - PyBytes_AS_STRING(res);
6280                /* determine replacement size (temporarily (mis)uses p) */
6281                for (p = collstart, repsize = 0; p < collend; ++p) {
6282                    if (*p<10)
6283                        repsize += 2+1+1;
6284                    else if (*p<100)
6285                        repsize += 2+2+1;
6286                    else if (*p<1000)
6287                        repsize += 2+3+1;
6288                    else if (*p<10000)
6289                        repsize += 2+4+1;
6290#ifndef Py_UNICODE_WIDE
6291                    else
6292                        repsize += 2+5+1;
6293#else
6294                    else if (*p<100000)
6295                        repsize += 2+5+1;
6296                    else if (*p<1000000)
6297                        repsize += 2+6+1;
6298                    else
6299                        repsize += 2+7+1;
6300#endif
6301                }
6302                requiredsize = respos+repsize+(endp-collend);
6303                if (requiredsize > ressize) {
6304                    if (requiredsize<2*ressize)
6305                        requiredsize = 2*ressize;
6306                    if (_PyBytes_Resize(&res, requiredsize))
6307                        goto onError;
6308                    str = PyBytes_AS_STRING(res) + respos;
6309                    ressize = requiredsize;
6310                }
6311                /* generate replacement (temporarily (mis)uses p) */
6312                for (p = collstart; p < collend; ++p) {
6313                    str += sprintf(str, "&#%d;", (int)*p);
6314                }
6315                p = collend;
6316                break;
6317            default:
6318                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6319                                                              encoding, reason, startp, size, &exc,
6320                                                              collstart-startp, collend-startp, &newpos);
6321                if (repunicode == NULL)
6322                    goto onError;
6323                if (PyBytes_Check(repunicode)) {
6324                    /* Directly copy bytes result to output. */
6325                    repsize = PyBytes_Size(repunicode);
6326                    if (repsize > 1) {
6327                        /* Make room for all additional bytes. */
6328                        respos = str - PyBytes_AS_STRING(res);
6329                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6330                            Py_DECREF(repunicode);
6331                            goto onError;
6332                        }
6333                        str = PyBytes_AS_STRING(res) + respos;
6334                        ressize += repsize-1;
6335                    }
6336                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6337                    str += repsize;
6338                    p = startp + newpos;
6339                    Py_DECREF(repunicode);
6340                    break;
6341                }
6342                /* need more space? (at least enough for what we
6343                   have+the replacement+the rest of the string, so
6344                   we won't have to check space for encodable characters) */
6345                respos = str - PyBytes_AS_STRING(res);
6346                repsize = PyUnicode_GET_SIZE(repunicode);
6347                requiredsize = respos+repsize+(endp-collend);
6348                if (requiredsize > ressize) {
6349                    if (requiredsize<2*ressize)
6350                        requiredsize = 2*ressize;
6351                    if (_PyBytes_Resize(&res, requiredsize)) {
6352                        Py_DECREF(repunicode);
6353                        goto onError;
6354                    }
6355                    str = PyBytes_AS_STRING(res) + respos;
6356                    ressize = requiredsize;
6357                }
6358                /* check if there is anything unencodable in the replacement
6359                   and copy it to the output */
6360                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6361                    c = *uni2;
6362                    if (c >= limit) {
6363                        raise_encode_exception(&exc, encoding, startp, size,
6364                                               unicodepos, unicodepos+1, reason);
6365                        Py_DECREF(repunicode);
6366                        goto onError;
6367                    }
6368                    *str = (char)c;
6369                }
6370                p = startp + newpos;
6371                Py_DECREF(repunicode);
6372            }
6373        }
6374    }
6375    /* Resize if we allocated to much */
6376    size = str - PyBytes_AS_STRING(res);
6377    if (size < ressize) { /* If this falls res will be NULL */
6378        assert(size >= 0);
6379        if (_PyBytes_Resize(&res, size) < 0)
6380            goto onError;
6381    }
6382
6383    Py_XDECREF(errorHandler);
6384    Py_XDECREF(exc);
6385    return res;
6386
6387  onError:
6388    Py_XDECREF(res);
6389    Py_XDECREF(errorHandler);
6390    Py_XDECREF(exc);
6391    return NULL;
6392}
6393
6394PyObject *
6395PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6396                       Py_ssize_t size,
6397                       const char *errors)
6398{
6399    return unicode_encode_ucs1(p, size, errors, 256);
6400}
6401
6402PyObject *
6403_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6404{
6405    if (!PyUnicode_Check(unicode)) {
6406        PyErr_BadArgument();
6407        return NULL;
6408    }
6409    if (PyUnicode_READY(unicode) == -1)
6410        return NULL;
6411    /* Fast path: if it is a one-byte string, construct
6412       bytes object directly. */
6413    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6414        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6415                                         PyUnicode_GET_LENGTH(unicode));
6416    /* Non-Latin-1 characters present. Defer to above function to
6417       raise the exception. */
6418    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6419                                  PyUnicode_GET_SIZE(unicode),
6420                                  errors);
6421}
6422
6423PyObject*
6424PyUnicode_AsLatin1String(PyObject *unicode)
6425{
6426    return _PyUnicode_AsLatin1String(unicode, NULL);
6427}
6428
6429/* --- 7-bit ASCII Codec -------------------------------------------------- */
6430
6431PyObject *
6432PyUnicode_DecodeASCII(const char *s,
6433                      Py_ssize_t size,
6434                      const char *errors)
6435{
6436    const char *starts = s;
6437    PyUnicodeObject *v;
6438    Py_UNICODE *p;
6439    Py_ssize_t startinpos;
6440    Py_ssize_t endinpos;
6441    Py_ssize_t outpos;
6442    const char *e;
6443    unsigned char* d;
6444    PyObject *errorHandler = NULL;
6445    PyObject *exc = NULL;
6446    Py_ssize_t i;
6447
6448    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6449    if (size == 1 && *(unsigned char*)s < 128)
6450        return PyUnicode_FromOrdinal(*(unsigned char*)s);
6451
6452    /* Fast path. Assume the input actually *is* ASCII, and allocate
6453       a single-block Unicode object with that assumption. If there is
6454       an error, drop the object and start over. */
6455    v = (PyUnicodeObject*)PyUnicode_New(size, 127);
6456    if (v == NULL)
6457        goto onError;
6458    d = PyUnicode_1BYTE_DATA(v);
6459    for (i = 0; i < size; i++) {
6460        unsigned char ch = ((unsigned char*)s)[i];
6461        if (ch < 128)
6462            d[i] = ch;
6463        else
6464            break;
6465    }
6466    if (i == size)
6467        return (PyObject*)v;
6468    Py_DECREF(v); /* start over */
6469
6470    v = _PyUnicode_New(size);
6471    if (v == NULL)
6472        goto onError;
6473    if (size == 0)
6474        return (PyObject *)v;
6475    p = PyUnicode_AS_UNICODE(v);
6476    e = s + size;
6477    while (s < e) {
6478        register unsigned char c = (unsigned char)*s;
6479        if (c < 128) {
6480            *p++ = c;
6481            ++s;
6482        }
6483        else {
6484            startinpos = s-starts;
6485            endinpos = startinpos + 1;
6486            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6487            if (unicode_decode_call_errorhandler(
6488                    errors, &errorHandler,
6489                    "ascii", "ordinal not in range(128)",
6490                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6491                    &v, &outpos, &p))
6492                goto onError;
6493        }
6494    }
6495    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6496        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6497            goto onError;
6498    Py_XDECREF(errorHandler);
6499    Py_XDECREF(exc);
6500    if (_PyUnicode_READY_REPLACE(&v)) {
6501        Py_DECREF(v);
6502        return NULL;
6503    }
6504    return (PyObject *)v;
6505
6506  onError:
6507    Py_XDECREF(v);
6508    Py_XDECREF(errorHandler);
6509    Py_XDECREF(exc);
6510    return NULL;
6511}
6512
6513PyObject *
6514PyUnicode_EncodeASCII(const Py_UNICODE *p,
6515                      Py_ssize_t size,
6516                      const char *errors)
6517{
6518    return unicode_encode_ucs1(p, size, errors, 128);
6519}
6520
6521PyObject *
6522_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6523{
6524    if (!PyUnicode_Check(unicode)) {
6525        PyErr_BadArgument();
6526        return NULL;
6527    }
6528    if (PyUnicode_READY(unicode) == -1)
6529        return NULL;
6530    /* Fast path: if it is an ASCII-only string, construct bytes object
6531       directly. Else defer to above function to raise the exception. */
6532    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6533        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6534                                         PyUnicode_GET_LENGTH(unicode));
6535    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6536                                 PyUnicode_GET_SIZE(unicode),
6537                                 errors);
6538}
6539
6540PyObject *
6541PyUnicode_AsASCIIString(PyObject *unicode)
6542{
6543    return _PyUnicode_AsASCIIString(unicode, NULL);
6544}
6545
6546#ifdef HAVE_MBCS
6547
6548/* --- MBCS codecs for Windows -------------------------------------------- */
6549
6550#if SIZEOF_INT < SIZEOF_SIZE_T
6551#define NEED_RETRY
6552#endif
6553
6554/* XXX This code is limited to "true" double-byte encodings, as
6555   a) it assumes an incomplete character consists of a single byte, and
6556   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6557   encodings, see IsDBCSLeadByteEx documentation. */
6558
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone is not enough, so the character preceding the
   byte (as located by CharPrev()) is examined as well. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        /* CharPrev() did not move back from curr. */
        return 1;
    if (!IsDBCSLeadByte(*prev))
        /* Previous character does not start with a lead byte. */
        return 1;
    /* Previous character starts with a lead byte: accept only when it
       begins exactly two bytes before curr. */
    return curr - prev == 2;
}
6570
6571/*
6572 * Decode MBCS string into unicode object. If 'final' is set, converts
6573 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6574 */
6575static int
6576decode_mbcs(PyUnicodeObject **v,
6577            const char *s, /* MBCS string */
6578            int size, /* sizeof MBCS string */
6579            int final,
6580            const char *errors)
6581{
6582    Py_UNICODE *p;
6583    Py_ssize_t n;
6584    DWORD usize;
6585    DWORD flags;
6586
6587    assert(size >= 0);
6588
6589    /* check and handle 'errors' arg */
6590    if (errors==NULL || strcmp(errors, "strict")==0)
6591        flags = MB_ERR_INVALID_CHARS;
6592    else if (strcmp(errors, "ignore")==0)
6593        flags = 0;
6594    else {
6595        PyErr_Format(PyExc_ValueError,
6596                     "mbcs encoding does not support errors='%s'",
6597                     errors);
6598        return -1;
6599    }
6600
6601    /* Skip trailing lead-byte unless 'final' is set */
6602    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6603        --size;
6604
6605    /* First get the size of the result */
6606    if (size > 0) {
6607        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6608        if (usize==0)
6609            goto mbcs_decode_error;
6610    } else
6611        usize = 0;
6612
6613    if (*v == NULL) {
6614        /* Create unicode object */
6615        *v = _PyUnicode_New(usize);
6616        if (*v == NULL)
6617            return -1;
6618        n = 0;
6619    }
6620    else {
6621        /* Extend unicode object */
6622        n = PyUnicode_GET_SIZE(*v);
6623        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6624            return -1;
6625    }
6626
6627    /* Do the conversion */
6628    if (usize > 0) {
6629        p = PyUnicode_AS_UNICODE(*v) + n;
6630        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6631            goto mbcs_decode_error;
6632        }
6633    }
6634    return size;
6635
6636mbcs_decode_error:
6637    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6638       we raise a UnicodeDecodeError - else it is a 'generic'
6639       windows error
6640     */
6641    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6642        /* Ideally, we should get reason from FormatMessage - this
6643           is the Windows 2000 English version of the message
6644        */
6645        PyObject *exc = NULL;
6646        const char *reason = "No mapping for the Unicode character exists "
6647                             "in the target multi-byte code page.";
6648        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6649        if (exc != NULL) {
6650            PyCodec_StrictErrors(exc);
6651            Py_DECREF(exc);
6652        }
6653    } else {
6654        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6655    }
6656    return -1;
6657}
6658
6659PyObject *
6660PyUnicode_DecodeMBCSStateful(const char *s,
6661                             Py_ssize_t size,
6662                             const char *errors,
6663                             Py_ssize_t *consumed)
6664{
6665    PyUnicodeObject *v = NULL;
6666    int done;
6667
6668    if (consumed)
6669        *consumed = 0;
6670
6671#ifdef NEED_RETRY
6672  retry:
6673    if (size > INT_MAX)
6674        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6675    else
6676#endif
6677        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6678
6679    if (done < 0) {
6680        Py_XDECREF(v);
6681        return NULL;
6682    }
6683
6684    if (consumed)
6685        *consumed += done;
6686
6687#ifdef NEED_RETRY
6688    if (size > INT_MAX) {
6689        s += done;
6690        size -= done;
6691        goto retry;
6692    }
6693#endif
6694    if (_PyUnicode_READY_REPLACE(&v)) {
6695        Py_DECREF(v);
6696        return NULL;
6697    }
6698    return (PyObject *)v;
6699}
6700
6701PyObject *
6702PyUnicode_DecodeMBCS(const char *s,
6703                     Py_ssize_t size,
6704                     const char *errors)
6705{
6706    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6707}
6708
6709/*
6710 * Convert unicode into string object (MBCS).
6711 * Returns 0 if succeed, -1 otherwise.
6712 */
6713static int
6714encode_mbcs(PyObject **repr,
6715            const Py_UNICODE *p, /* unicode */
6716            int size, /* size of unicode */
6717            const char* errors)
6718{
6719    BOOL usedDefaultChar = FALSE;
6720    BOOL *pusedDefaultChar;
6721    int mbcssize;
6722    Py_ssize_t n;
6723    PyObject *exc = NULL;
6724    DWORD flags;
6725
6726    assert(size >= 0);
6727
6728    /* check and handle 'errors' arg */
6729    if (errors==NULL || strcmp(errors, "strict")==0) {
6730        flags = WC_NO_BEST_FIT_CHARS;
6731        pusedDefaultChar = &usedDefaultChar;
6732    } else if (strcmp(errors, "replace")==0) {
6733        flags = 0;
6734        pusedDefaultChar = NULL;
6735    } else {
6736         PyErr_Format(PyExc_ValueError,
6737                      "mbcs encoding does not support errors='%s'",
6738                      errors);
6739         return -1;
6740    }
6741
6742    /* First get the size of the result */
6743    if (size > 0) {
6744        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6745                                       NULL, pusedDefaultChar);
6746        if (mbcssize == 0) {
6747            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6748            return -1;
6749        }
6750        /* If we used a default char, then we failed! */
6751        if (pusedDefaultChar && *pusedDefaultChar)
6752            goto mbcs_encode_error;
6753    } else {
6754        mbcssize = 0;
6755    }
6756
6757    if (*repr == NULL) {
6758        /* Create string object */
6759        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6760        if (*repr == NULL)
6761            return -1;
6762        n = 0;
6763    }
6764    else {
6765        /* Extend string object */
6766        n = PyBytes_Size(*repr);
6767        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6768            return -1;
6769    }
6770
6771    /* Do the conversion */
6772    if (size > 0) {
6773        char *s = PyBytes_AS_STRING(*repr) + n;
6774        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6775                                     NULL, pusedDefaultChar)) {
6776            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6777            return -1;
6778        }
6779        if (pusedDefaultChar && *pusedDefaultChar)
6780            goto mbcs_encode_error;
6781    }
6782    return 0;
6783
6784mbcs_encode_error:
6785    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6786    Py_XDECREF(exc);
6787    return -1;
6788}
6789
6790PyObject *
6791PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6792                     Py_ssize_t size,
6793                     const char *errors)
6794{
6795    PyObject *repr = NULL;
6796    int ret;
6797
6798#ifdef NEED_RETRY
6799  retry:
6800    if (size > INT_MAX)
6801        ret = encode_mbcs(&repr, p, INT_MAX, errors);
6802    else
6803#endif
6804        ret = encode_mbcs(&repr, p, (int)size, errors);
6805
6806    if (ret < 0) {
6807        Py_XDECREF(repr);
6808        return NULL;
6809    }
6810
6811#ifdef NEED_RETRY
6812    if (size > INT_MAX) {
6813        p += INT_MAX;
6814        size -= INT_MAX;
6815        goto retry;
6816    }
6817#endif
6818
6819    return repr;
6820}
6821
6822PyObject *
6823PyUnicode_AsMBCSString(PyObject *unicode)
6824{
6825    if (!PyUnicode_Check(unicode)) {
6826        PyErr_BadArgument();
6827        return NULL;
6828    }
6829    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
6830                                PyUnicode_GET_SIZE(unicode),
6831                                NULL);
6832}
6833
6834#undef NEED_RETRY
6835
6836#endif /* HAVE_MBCS */
6837
6838/* --- Character Mapping Codec -------------------------------------------- */
6839
6840PyObject *
6841PyUnicode_DecodeCharmap(const char *s,
6842                        Py_ssize_t size,
6843                        PyObject *mapping,
6844                        const char *errors)
6845{
6846    const char *starts = s;
6847    Py_ssize_t startinpos;
6848    Py_ssize_t endinpos;
6849    Py_ssize_t outpos;
6850    const char *e;
6851    PyUnicodeObject *v;
6852    Py_UNICODE *p;
6853    Py_ssize_t extrachars = 0;
6854    PyObject *errorHandler = NULL;
6855    PyObject *exc = NULL;
6856    Py_UNICODE *mapstring = NULL;
6857    Py_ssize_t maplen = 0;
6858
6859    /* Default to Latin-1 */
6860    if (mapping == NULL)
6861        return PyUnicode_DecodeLatin1(s, size, errors);
6862
6863    v = _PyUnicode_New(size);
6864    if (v == NULL)
6865        goto onError;
6866    if (size == 0)
6867        return (PyObject *)v;
6868    p = PyUnicode_AS_UNICODE(v);
6869    e = s + size;
6870    if (PyUnicode_CheckExact(mapping)) {
6871        mapstring = PyUnicode_AS_UNICODE(mapping);
6872        maplen = PyUnicode_GET_SIZE(mapping);
6873        while (s < e) {
6874            unsigned char ch = *s;
6875            Py_UNICODE x = 0xfffe; /* illegal value */
6876
6877            if (ch < maplen)
6878                x = mapstring[ch];
6879
6880            if (x == 0xfffe) {
6881                /* undefined mapping */
6882                outpos = p-PyUnicode_AS_UNICODE(v);
6883                startinpos = s-starts;
6884                endinpos = startinpos+1;
6885                if (unicode_decode_call_errorhandler(
6886                        errors, &errorHandler,
6887                        "charmap", "character maps to <undefined>",
6888                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6889                        &v, &outpos, &p)) {
6890                    goto onError;
6891                }
6892                continue;
6893            }
6894            *p++ = x;
6895            ++s;
6896        }
6897    }
6898    else {
6899        while (s < e) {
6900            unsigned char ch = *s;
6901            PyObject *w, *x;
6902
6903            /* Get mapping (char ordinal -> integer, Unicode char or None) */
6904            w = PyLong_FromLong((long)ch);
6905            if (w == NULL)
6906                goto onError;
6907            x = PyObject_GetItem(mapping, w);
6908            Py_DECREF(w);
6909            if (x == NULL) {
6910                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6911                    /* No mapping found means: mapping is undefined. */
6912                    PyErr_Clear();
6913                    x = Py_None;
6914                    Py_INCREF(x);
6915                } else
6916                    goto onError;
6917            }
6918
6919            /* Apply mapping */
6920            if (PyLong_Check(x)) {
6921                long value = PyLong_AS_LONG(x);
6922                if (value < 0 || value > 65535) {
6923                    PyErr_SetString(PyExc_TypeError,
6924                                    "character mapping must be in range(65536)");
6925                    Py_DECREF(x);
6926                    goto onError;
6927                }
6928                *p++ = (Py_UNICODE)value;
6929            }
6930            else if (x == Py_None) {
6931                /* undefined mapping */
6932                outpos = p-PyUnicode_AS_UNICODE(v);
6933                startinpos = s-starts;
6934                endinpos = startinpos+1;
6935                if (unicode_decode_call_errorhandler(
6936                        errors, &errorHandler,
6937                        "charmap", "character maps to <undefined>",
6938                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6939                        &v, &outpos, &p)) {
6940                    Py_DECREF(x);
6941                    goto onError;
6942                }
6943                Py_DECREF(x);
6944                continue;
6945            }
6946            else if (PyUnicode_Check(x)) {
6947                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
6948
6949                if (targetsize == 1)
6950                    /* 1-1 mapping */
6951                    *p++ = *PyUnicode_AS_UNICODE(x);
6952
6953                else if (targetsize > 1) {
6954                    /* 1-n mapping */
6955                    if (targetsize > extrachars) {
6956                        /* resize first */
6957                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
6958                        Py_ssize_t needed = (targetsize - extrachars) + \
6959                            (targetsize << 2);
6960                        extrachars += needed;
6961                        /* XXX overflow detection missing */
6962                        if (PyUnicode_Resize((PyObject**)&v,
6963                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
6964                            Py_DECREF(x);
6965                            goto onError;
6966                        }
6967                        p = PyUnicode_AS_UNICODE(v) + oldpos;
6968                    }
6969                    Py_UNICODE_COPY(p,
6970                                    PyUnicode_AS_UNICODE(x),
6971                                    targetsize);
6972                    p += targetsize;
6973                    extrachars -= targetsize;
6974                }
6975                /* 1-0 mapping: skip the character */
6976            }
6977            else {
6978                /* wrong return value */
6979                PyErr_SetString(PyExc_TypeError,
6980                                "character mapping must return integer, None or str");
6981                Py_DECREF(x);
6982                goto onError;
6983            }
6984            Py_DECREF(x);
6985            ++s;
6986        }
6987    }
6988    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6989        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6990            goto onError;
6991    Py_XDECREF(errorHandler);
6992    Py_XDECREF(exc);
6993    if (_PyUnicode_READY_REPLACE(&v)) {
6994        Py_DECREF(v);
6995        return NULL;
6996    }
6997    return (PyObject *)v;
6998
6999  onError:
7000    Py_XDECREF(errorHandler);
7001    Py_XDECREF(exc);
7002    Py_XDECREF(v);
7003    return NULL;
7004}
7005
7006/* Charmap encoding: the lookup table */
7007
7008struct encoding_map {
7009    PyObject_HEAD
7010    unsigned char level1[32];
7011    int count2, count3;
7012    unsigned char level23[1];
7013};
7014
7015static PyObject*
7016encoding_map_size(PyObject *obj, PyObject* args)
7017{
7018    struct encoding_map *map = (struct encoding_map*)obj;
7019    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7020                           128*map->count3);
7021}
7022
7023static PyMethodDef encoding_map_methods[] = {
7024    {"size", encoding_map_size, METH_NOARGS,
7025     PyDoc_STR("Return the size (in bytes) of this object") },
7026    { 0 }
7027};
7028
7029static void
7030encoding_map_dealloc(PyObject* o)
7031{
7032    PyObject_FREE(o);
7033}
7034
7035static PyTypeObject EncodingMapType = {
7036    PyVarObject_HEAD_INIT(NULL, 0)
7037    "EncodingMap",          /*tp_name*/
7038    sizeof(struct encoding_map),   /*tp_basicsize*/
7039    0,                      /*tp_itemsize*/
7040    /* methods */
7041    encoding_map_dealloc,   /*tp_dealloc*/
7042    0,                      /*tp_print*/
7043    0,                      /*tp_getattr*/
7044    0,                      /*tp_setattr*/
7045    0,                      /*tp_reserved*/
7046    0,                      /*tp_repr*/
7047    0,                      /*tp_as_number*/
7048    0,                      /*tp_as_sequence*/
7049    0,                      /*tp_as_mapping*/
7050    0,                      /*tp_hash*/
7051    0,                      /*tp_call*/
7052    0,                      /*tp_str*/
7053    0,                      /*tp_getattro*/
7054    0,                      /*tp_setattro*/
7055    0,                      /*tp_as_buffer*/
7056    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7057    0,                      /*tp_doc*/
7058    0,                      /*tp_traverse*/
7059    0,                      /*tp_clear*/
7060    0,                      /*tp_richcompare*/
7061    0,                      /*tp_weaklistoffset*/
7062    0,                      /*tp_iter*/
7063    0,                      /*tp_iternext*/
7064    encoding_map_methods,   /*tp_methods*/
7065    0,                      /*tp_members*/
7066    0,                      /*tp_getset*/
7067    0,                      /*tp_base*/
7068    0,                      /*tp_dict*/
7069    0,                      /*tp_descr_get*/
7070    0,                      /*tp_descr_set*/
7071    0,                      /*tp_dictoffset*/
7072    0,                      /*tp_init*/
7073    0,                      /*tp_alloc*/
7074    0,                      /*tp_new*/
7075    0,                      /*tp_free*/
7076    0,                      /*tp_is_gc*/
7077};
7078
7079PyObject*
7080PyUnicode_BuildEncodingMap(PyObject* string)
7081{
7082    PyObject *result;
7083    struct encoding_map *mresult;
7084    int i;
7085    int need_dict = 0;
7086    unsigned char level1[32];
7087    unsigned char level2[512];
7088    unsigned char *mlevel1, *mlevel2, *mlevel3;
7089    int count2 = 0, count3 = 0;
7090    int kind;
7091    void *data;
7092    Py_UCS4 ch;
7093
7094    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7095        PyErr_BadArgument();
7096        return NULL;
7097    }
7098    kind = PyUnicode_KIND(string);
7099    data = PyUnicode_DATA(string);
7100    memset(level1, 0xFF, sizeof level1);
7101    memset(level2, 0xFF, sizeof level2);
7102
7103    /* If there isn't a one-to-one mapping of NULL to \0,
7104       or if there are non-BMP characters, we need to use
7105       a mapping dictionary. */
7106    if (PyUnicode_READ(kind, data, 0) != 0)
7107        need_dict = 1;
7108    for (i = 1; i < 256; i++) {
7109        int l1, l2;
7110        ch = PyUnicode_READ(kind, data, i);
7111        if (ch == 0 || ch > 0xFFFF) {
7112            need_dict = 1;
7113            break;
7114        }
7115        if (ch == 0xFFFE)
7116            /* unmapped character */
7117            continue;
7118        l1 = ch >> 11;
7119        l2 = ch >> 7;
7120        if (level1[l1] == 0xFF)
7121            level1[l1] = count2++;
7122        if (level2[l2] == 0xFF)
7123            level2[l2] = count3++;
7124    }
7125
7126    if (count2 >= 0xFF || count3 >= 0xFF)
7127        need_dict = 1;
7128
7129    if (need_dict) {
7130        PyObject *result = PyDict_New();
7131        PyObject *key, *value;
7132        if (!result)
7133            return NULL;
7134        for (i = 0; i < 256; i++) {
7135            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7136            value = PyLong_FromLong(i);
7137            if (!key || !value)
7138                goto failed1;
7139            if (PyDict_SetItem(result, key, value) == -1)
7140                goto failed1;
7141            Py_DECREF(key);
7142            Py_DECREF(value);
7143        }
7144        return result;
7145      failed1:
7146        Py_XDECREF(key);
7147        Py_XDECREF(value);
7148        Py_DECREF(result);
7149        return NULL;
7150    }
7151
7152    /* Create a three-level trie */
7153    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7154                             16*count2 + 128*count3 - 1);
7155    if (!result)
7156        return PyErr_NoMemory();
7157    PyObject_Init(result, &EncodingMapType);
7158    mresult = (struct encoding_map*)result;
7159    mresult->count2 = count2;
7160    mresult->count3 = count3;
7161    mlevel1 = mresult->level1;
7162    mlevel2 = mresult->level23;
7163    mlevel3 = mresult->level23 + 16*count2;
7164    memcpy(mlevel1, level1, 32);
7165    memset(mlevel2, 0xFF, 16*count2);
7166    memset(mlevel3, 0, 128*count3);
7167    count3 = 0;
7168    for (i = 1; i < 256; i++) {
7169        int o1, o2, o3, i2, i3;
7170        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7171            /* unmapped character */
7172            continue;
7173        o1 = PyUnicode_READ(kind, data, i)>>11;
7174        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7175        i2 = 16*mlevel1[o1] + o2;
7176        if (mlevel2[i2] == 0xFF)
7177            mlevel2[i2] = count3++;
7178        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7179        i3 = 128*mlevel2[i2] + o3;
7180        mlevel3[i3] = i;
7181    }
7182    return result;
7183}
7184
7185static int
7186encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7187{
7188    struct encoding_map *map = (struct encoding_map*)mapping;
7189    int l1 = c>>11;
7190    int l2 = (c>>7) & 0xF;
7191    int l3 = c & 0x7F;
7192    int i;
7193
7194#ifdef Py_UNICODE_WIDE
7195    if (c > 0xFFFF) {
7196        return -1;
7197    }
7198#endif
7199    if (c == 0)
7200        return 0;
7201    /* level 1*/
7202    i = map->level1[l1];
7203    if (i == 0xFF) {
7204        return -1;
7205    }
7206    /* level 2*/
7207    i = map->level23[16*i+l2];
7208    if (i == 0xFF) {
7209        return -1;
7210    }
7211    /* level 3 */
7212    i = map->level23[16*map->count2 + 128*i + l3];
7213    if (i == 0) {
7214        return -1;
7215    }
7216    return i;
7217}
7218
7219/* Lookup the character ch in the mapping. If the character
7220   can't be found, Py_None is returned (or NULL, if another
7221   error occurred). */
7222static PyObject *
7223charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7224{
7225    PyObject *w = PyLong_FromLong((long)c);
7226    PyObject *x;
7227
7228    if (w == NULL)
7229        return NULL;
7230    x = PyObject_GetItem(mapping, w);
7231    Py_DECREF(w);
7232    if (x == NULL) {
7233        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7234            /* No mapping found means: mapping is undefined. */
7235            PyErr_Clear();
7236            x = Py_None;
7237            Py_INCREF(x);
7238            return x;
7239        } else
7240            return NULL;
7241    }
7242    else if (x == Py_None)
7243        return x;
7244    else if (PyLong_Check(x)) {
7245        long value = PyLong_AS_LONG(x);
7246        if (value < 0 || value > 255) {
7247            PyErr_SetString(PyExc_TypeError,
7248                            "character mapping must be in range(256)");
7249            Py_DECREF(x);
7250            return NULL;
7251        }
7252        return x;
7253    }
7254    else if (PyBytes_Check(x))
7255        return x;
7256    else {
7257        /* wrong return value */
7258        PyErr_Format(PyExc_TypeError,
7259                     "character mapping must return integer, bytes or None, not %.400s",
7260                     x->ob_type->tp_name);
7261        Py_DECREF(x);
7262        return NULL;
7263    }
7264}
7265
7266static int
7267charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7268{
7269    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7270    /* exponentially overallocate to minimize reallocations */
7271    if (requiredsize < 2*outsize)
7272        requiredsize = 2*outsize;
7273    if (_PyBytes_Resize(outobj, requiredsize))
7274        return -1;
7275    return 0;
7276}
7277
/* Outcome of encoding a single character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS = 0,    /* the character was encoded */
    enc_FAILED = 1,     /* the mapping is undefined for the character */
    enc_EXCEPTION = 2   /* an error occurred */
} charmapencode_result;
7281/* lookup the character, put the result in the output string and adjust
7282   various state variables. Resize the output bytes object if not enough
7283   space is available. Return a new reference to the object that
7284   was put in the output buffer, or Py_None, if the mapping was undefined
7285   (in which case no character was written) or NULL, if a
7286   reallocation error occurred. The caller must decref the result */
7287static charmapencode_result
7288charmapencode_output(Py_UNICODE c, PyObject *mapping,
7289                     PyObject **outobj, Py_ssize_t *outpos)
7290{
7291    PyObject *rep;
7292    char *outstart;
7293    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7294
7295    if (Py_TYPE(mapping) == &EncodingMapType) {
7296        int res = encoding_map_lookup(c, mapping);
7297        Py_ssize_t requiredsize = *outpos+1;
7298        if (res == -1)
7299            return enc_FAILED;
7300        if (outsize<requiredsize)
7301            if (charmapencode_resize(outobj, outpos, requiredsize))
7302                return enc_EXCEPTION;
7303        outstart = PyBytes_AS_STRING(*outobj);
7304        outstart[(*outpos)++] = (char)res;
7305        return enc_SUCCESS;
7306    }
7307
7308    rep = charmapencode_lookup(c, mapping);
7309    if (rep==NULL)
7310        return enc_EXCEPTION;
7311    else if (rep==Py_None) {
7312        Py_DECREF(rep);
7313        return enc_FAILED;
7314    } else {
7315        if (PyLong_Check(rep)) {
7316            Py_ssize_t requiredsize = *outpos+1;
7317            if (outsize<requiredsize)
7318                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7319                    Py_DECREF(rep);
7320                    return enc_EXCEPTION;
7321                }
7322            outstart = PyBytes_AS_STRING(*outobj);
7323            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7324        }
7325        else {
7326            const char *repchars = PyBytes_AS_STRING(rep);
7327            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7328            Py_ssize_t requiredsize = *outpos+repsize;
7329            if (outsize<requiredsize)
7330                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7331                    Py_DECREF(rep);
7332                    return enc_EXCEPTION;
7333                }
7334            outstart = PyBytes_AS_STRING(*outobj);
7335            memcpy(outstart + *outpos, repchars, repsize);
7336            *outpos += repsize;
7337        }
7338    }
7339    Py_DECREF(rep);
7340    return enc_SUCCESS;
7341}
7342
7343/* handle an error in PyUnicode_EncodeCharmap
7344   Return 0 on success, -1 on error */
7345static int
7346charmap_encoding_error(
7347    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7348    PyObject **exceptionObject,
7349    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7350    PyObject **res, Py_ssize_t *respos)
7351{
7352    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7353    Py_ssize_t repsize;
7354    Py_ssize_t newpos;
7355    Py_UNICODE *uni2;
7356    /* startpos for collecting unencodable chars */
7357    Py_ssize_t collstartpos = *inpos;
7358    Py_ssize_t collendpos = *inpos+1;
7359    Py_ssize_t collpos;
7360    char *encoding = "charmap";
7361    char *reason = "character maps to <undefined>";
7362    charmapencode_result x;
7363
7364    /* find all unencodable characters */
7365    while (collendpos < size) {
7366        PyObject *rep;
7367        if (Py_TYPE(mapping) == &EncodingMapType) {
7368            int res = encoding_map_lookup(p[collendpos], mapping);
7369            if (res != -1)
7370                break;
7371            ++collendpos;
7372            continue;
7373        }
7374
7375        rep = charmapencode_lookup(p[collendpos], mapping);
7376        if (rep==NULL)
7377            return -1;
7378        else if (rep!=Py_None) {
7379            Py_DECREF(rep);
7380            break;
7381        }
7382        Py_DECREF(rep);
7383        ++collendpos;
7384    }
7385    /* cache callback name lookup
7386     * (if not done yet, i.e. it's the first error) */
7387    if (*known_errorHandler==-1) {
7388        if ((errors==NULL) || (!strcmp(errors, "strict")))
7389            *known_errorHandler = 1;
7390        else if (!strcmp(errors, "replace"))
7391            *known_errorHandler = 2;
7392        else if (!strcmp(errors, "ignore"))
7393            *known_errorHandler = 3;
7394        else if (!strcmp(errors, "xmlcharrefreplace"))
7395            *known_errorHandler = 4;
7396        else
7397            *known_errorHandler = 0;
7398    }
7399    switch (*known_errorHandler) {
7400    case 1: /* strict */
7401        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7402        return -1;
7403    case 2: /* replace */
7404        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7405            x = charmapencode_output('?', mapping, res, respos);
7406            if (x==enc_EXCEPTION) {
7407                return -1;
7408            }
7409            else if (x==enc_FAILED) {
7410                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7411                return -1;
7412            }
7413        }
7414        /* fall through */
7415    case 3: /* ignore */
7416        *inpos = collendpos;
7417        break;
7418    case 4: /* xmlcharrefreplace */
7419        /* generate replacement (temporarily (mis)uses p) */
7420        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7421            char buffer[2+29+1+1];
7422            char *cp;
7423            sprintf(buffer, "&#%d;", (int)p[collpos]);
7424            for (cp = buffer; *cp; ++cp) {
7425                x = charmapencode_output(*cp, mapping, res, respos);
7426                if (x==enc_EXCEPTION)
7427                    return -1;
7428                else if (x==enc_FAILED) {
7429                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7430                    return -1;
7431                }
7432            }
7433        }
7434        *inpos = collendpos;
7435        break;
7436    default:
7437        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7438                                                      encoding, reason, p, size, exceptionObject,
7439                                                      collstartpos, collendpos, &newpos);
7440        if (repunicode == NULL)
7441            return -1;
7442        if (PyBytes_Check(repunicode)) {
7443            /* Directly copy bytes result to output. */
7444            Py_ssize_t outsize = PyBytes_Size(*res);
7445            Py_ssize_t requiredsize;
7446            repsize = PyBytes_Size(repunicode);
7447            requiredsize = *respos + repsize;
7448            if (requiredsize > outsize)
7449                /* Make room for all additional bytes. */
7450                if (charmapencode_resize(res, respos, requiredsize)) {
7451                    Py_DECREF(repunicode);
7452                    return -1;
7453                }
7454            memcpy(PyBytes_AsString(*res) + *respos,
7455                   PyBytes_AsString(repunicode),  repsize);
7456            *respos += repsize;
7457            *inpos = newpos;
7458            Py_DECREF(repunicode);
7459            break;
7460        }
7461        /* generate replacement  */
7462        repsize = PyUnicode_GET_SIZE(repunicode);
7463        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7464            x = charmapencode_output(*uni2, mapping, res, respos);
7465            if (x==enc_EXCEPTION) {
7466                return -1;
7467            }
7468            else if (x==enc_FAILED) {
7469                Py_DECREF(repunicode);
7470                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7471                return -1;
7472            }
7473        }
7474        *inpos = newpos;
7475        Py_DECREF(repunicode);
7476    }
7477    return 0;
7478}
7479
7480PyObject *
7481PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7482                        Py_ssize_t size,
7483                        PyObject *mapping,
7484                        const char *errors)
7485{
7486    /* output object */
7487    PyObject *res = NULL;
7488    /* current input position */
7489    Py_ssize_t inpos = 0;
7490    /* current output position */
7491    Py_ssize_t respos = 0;
7492    PyObject *errorHandler = NULL;
7493    PyObject *exc = NULL;
7494    /* the following variable is used for caching string comparisons
7495     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7496     * 3=ignore, 4=xmlcharrefreplace */
7497    int known_errorHandler = -1;
7498
7499    /* Default to Latin-1 */
7500    if (mapping == NULL)
7501        return PyUnicode_EncodeLatin1(p, size, errors);
7502
7503    /* allocate enough for a simple encoding without
7504       replacements, if we need more, we'll resize */
7505    res = PyBytes_FromStringAndSize(NULL, size);
7506    if (res == NULL)
7507        goto onError;
7508    if (size == 0)
7509        return res;
7510
7511    while (inpos<size) {
7512        /* try to encode it */
7513        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7514        if (x==enc_EXCEPTION) /* error */
7515            goto onError;
7516        if (x==enc_FAILED) { /* unencodable character */
7517            if (charmap_encoding_error(p, size, &inpos, mapping,
7518                                       &exc,
7519                                       &known_errorHandler, &errorHandler, errors,
7520                                       &res, &respos)) {
7521                goto onError;
7522            }
7523        }
7524        else
7525            /* done with this character => adjust input position */
7526            ++inpos;
7527    }
7528
7529    /* Resize if we allocated to much */
7530    if (respos<PyBytes_GET_SIZE(res))
7531        if (_PyBytes_Resize(&res, respos) < 0)
7532            goto onError;
7533
7534    Py_XDECREF(exc);
7535    Py_XDECREF(errorHandler);
7536    return res;
7537
7538  onError:
7539    Py_XDECREF(res);
7540    Py_XDECREF(exc);
7541    Py_XDECREF(errorHandler);
7542    return NULL;
7543}
7544
7545PyObject *
7546PyUnicode_AsCharmapString(PyObject *unicode,
7547                          PyObject *mapping)
7548{
7549    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7550        PyErr_BadArgument();
7551        return NULL;
7552    }
7553    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7554                                   PyUnicode_GET_SIZE(unicode),
7555                                   mapping,
7556                                   NULL);
7557}
7558
7559/* create or adjust a UnicodeTranslateError */
7560static void
7561make_translate_exception(PyObject **exceptionObject,
7562                         PyObject *unicode,
7563                         Py_ssize_t startpos, Py_ssize_t endpos,
7564                         const char *reason)
7565{
7566    if (*exceptionObject == NULL) {
7567        *exceptionObject = _PyUnicodeTranslateError_Create(
7568            unicode, startpos, endpos, reason);
7569    }
7570    else {
7571        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7572            goto onError;
7573        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7574            goto onError;
7575        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7576            goto onError;
7577        return;
7578      onError:
7579        Py_DECREF(*exceptionObject);
7580        *exceptionObject = NULL;
7581    }
7582}
7583
7584/* raises a UnicodeTranslateError */
7585static void
7586raise_translate_exception(PyObject **exceptionObject,
7587                          PyObject *unicode,
7588                          Py_ssize_t startpos, Py_ssize_t endpos,
7589                          const char *reason)
7590{
7591    make_translate_exception(exceptionObject,
7592                             unicode, startpos, endpos, reason);
7593    if (*exceptionObject != NULL)
7594        PyCodec_StrictErrors(*exceptionObject);
7595}
7596
7597/* error handling callback helper:
7598   build arguments, call the callback and check the arguments,
7599   put the result into newpos and return the replacement string, which
7600   has to be freed by the caller */
7601static PyObject *
7602unicode_translate_call_errorhandler(const char *errors,
7603                                    PyObject **errorHandler,
7604                                    const char *reason,
7605                                    PyObject *unicode, PyObject **exceptionObject,
7606                                    Py_ssize_t startpos, Py_ssize_t endpos,
7607                                    Py_ssize_t *newpos)
7608{
7609    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7610
7611    Py_ssize_t i_newpos;
7612    PyObject *restuple;
7613    PyObject *resunicode;
7614
7615    if (*errorHandler == NULL) {
7616        *errorHandler = PyCodec_LookupError(errors);
7617        if (*errorHandler == NULL)
7618            return NULL;
7619    }
7620
7621    make_translate_exception(exceptionObject,
7622                             unicode, startpos, endpos, reason);
7623    if (*exceptionObject == NULL)
7624        return NULL;
7625
7626    restuple = PyObject_CallFunctionObjArgs(
7627        *errorHandler, *exceptionObject, NULL);
7628    if (restuple == NULL)
7629        return NULL;
7630    if (!PyTuple_Check(restuple)) {
7631        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7632        Py_DECREF(restuple);
7633        return NULL;
7634    }
7635    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7636                          &resunicode, &i_newpos)) {
7637        Py_DECREF(restuple);
7638        return NULL;
7639    }
7640    if (i_newpos<0)
7641        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7642    else
7643        *newpos = i_newpos;
7644    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7645        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7646        Py_DECREF(restuple);
7647        return NULL;
7648    }
7649    Py_INCREF(resunicode);
7650    Py_DECREF(restuple);
7651    return resunicode;
7652}
7653
7654/* Lookup the character ch in the mapping and put the result in result,
7655   which must be decrefed by the caller.
7656   Return 0 on success, -1 on error */
7657static int
7658charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7659{
7660    PyObject *w = PyLong_FromLong((long)c);
7661    PyObject *x;
7662
7663    if (w == NULL)
7664        return -1;
7665    x = PyObject_GetItem(mapping, w);
7666    Py_DECREF(w);
7667    if (x == NULL) {
7668        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7669            /* No mapping found means: use 1:1 mapping. */
7670            PyErr_Clear();
7671            *result = NULL;
7672            return 0;
7673        } else
7674            return -1;
7675    }
7676    else if (x == Py_None) {
7677        *result = x;
7678        return 0;
7679    }
7680    else if (PyLong_Check(x)) {
7681        long value = PyLong_AS_LONG(x);
7682        long max = PyUnicode_GetMax();
7683        if (value < 0 || value > max) {
7684            PyErr_Format(PyExc_TypeError,
7685                         "character mapping must be in range(0x%x)", max+1);
7686            Py_DECREF(x);
7687            return -1;
7688        }
7689        *result = x;
7690        return 0;
7691    }
7692    else if (PyUnicode_Check(x)) {
7693        *result = x;
7694        return 0;
7695    }
7696    else {
7697        /* wrong return value */
7698        PyErr_SetString(PyExc_TypeError,
7699                        "character mapping must return integer, None or str");
7700        Py_DECREF(x);
7701        return -1;
7702    }
7703}
7704/* ensure that *outobj is at least requiredsize characters long,
7705   if not reallocate and adjust various state variables.
7706   Return 0 on success, -1 on error */
7707static int
7708charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7709                               Py_ssize_t requiredsize)
7710{
7711    Py_ssize_t oldsize = *psize;
7712    if (requiredsize > oldsize) {
7713        /* exponentially overallocate to minimize reallocations */
7714        if (requiredsize < 2 * oldsize)
7715            requiredsize = 2 * oldsize;
7716        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7717        if (*outobj == 0)
7718            return -1;
7719        *psize = requiredsize;
7720    }
7721    return 0;
7722}
7723/* lookup the character, put the result in the output string and adjust
7724   various state variables. Return a new reference to the object that
7725   was put in the output buffer in *result, or Py_None, if the mapping was
7726   undefined (in which case no character was written).
7727   The called must decref result.
7728   Return 0 on success, -1 on error. */
7729static int
7730charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7731                        PyObject *mapping, Py_UCS4 **output,
7732                        Py_ssize_t *osize, Py_ssize_t *opos,
7733                        PyObject **res)
7734{
7735    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7736    if (charmaptranslate_lookup(curinp, mapping, res))
7737        return -1;
7738    if (*res==NULL) {
7739        /* not found => default to 1:1 mapping */
7740        (*output)[(*opos)++] = curinp;
7741    }
7742    else if (*res==Py_None)
7743        ;
7744    else if (PyLong_Check(*res)) {
7745        /* no overflow check, because we know that the space is enough */
7746        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7747    }
7748    else if (PyUnicode_Check(*res)) {
7749        Py_ssize_t repsize;
7750        if (PyUnicode_READY(*res) == -1)
7751            return -1;
7752        repsize = PyUnicode_GET_LENGTH(*res);
7753        if (repsize==1) {
7754            /* no overflow check, because we know that the space is enough */
7755            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7756        }
7757        else if (repsize!=0) {
7758            /* more than one character */
7759            Py_ssize_t requiredsize = *opos +
7760                (PyUnicode_GET_LENGTH(input) - ipos) +
7761                repsize - 1;
7762            Py_ssize_t i;
7763            if (charmaptranslate_makespace(output, osize, requiredsize))
7764                return -1;
7765            for(i = 0; i < repsize; i++)
7766                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7767        }
7768    }
7769    else
7770        return -1;
7771    return 0;
7772}
7773
7774PyObject *
7775_PyUnicode_TranslateCharmap(PyObject *input,
7776                            PyObject *mapping,
7777                            const char *errors)
7778{
7779    /* input object */
7780    char *idata;
7781    Py_ssize_t size, i;
7782    int kind;
7783    /* output buffer */
7784    Py_UCS4 *output = NULL;
7785    Py_ssize_t osize;
7786    PyObject *res;
7787    /* current output position */
7788    Py_ssize_t opos;
7789    char *reason = "character maps to <undefined>";
7790    PyObject *errorHandler = NULL;
7791    PyObject *exc = NULL;
7792    /* the following variable is used for caching string comparisons
7793     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7794     * 3=ignore, 4=xmlcharrefreplace */
7795    int known_errorHandler = -1;
7796
7797    if (mapping == NULL) {
7798        PyErr_BadArgument();
7799        return NULL;
7800    }
7801
7802    if (PyUnicode_READY(input) == -1)
7803        return NULL;
7804    idata = (char*)PyUnicode_DATA(input);
7805    kind = PyUnicode_KIND(input);
7806    size = PyUnicode_GET_LENGTH(input);
7807    i = 0;
7808
7809    if (size == 0) {
7810        Py_INCREF(input);
7811        return input;
7812    }
7813
7814    /* allocate enough for a simple 1:1 translation without
7815       replacements, if we need more, we'll resize */
7816    osize = size;
7817    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7818    opos = 0;
7819    if (output == NULL) {
7820        PyErr_NoMemory();
7821        goto onError;
7822    }
7823
7824    while (i<size) {
7825        /* try to encode it */
7826        PyObject *x = NULL;
7827        if (charmaptranslate_output(input, i, mapping,
7828                                    &output, &osize, &opos, &x)) {
7829            Py_XDECREF(x);
7830            goto onError;
7831        }
7832        Py_XDECREF(x);
7833        if (x!=Py_None) /* it worked => adjust input pointer */
7834            ++i;
7835        else { /* untranslatable character */
7836            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7837            Py_ssize_t repsize;
7838            Py_ssize_t newpos;
7839            Py_ssize_t uni2;
7840            /* startpos for collecting untranslatable chars */
7841            Py_ssize_t collstart = i;
7842            Py_ssize_t collend = i+1;
7843            Py_ssize_t coll;
7844
7845            /* find all untranslatable characters */
7846            while (collend < size) {
7847                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
7848                    goto onError;
7849                Py_XDECREF(x);
7850                if (x!=Py_None)
7851                    break;
7852                ++collend;
7853            }
7854            /* cache callback name lookup
7855             * (if not done yet, i.e. it's the first error) */
7856            if (known_errorHandler==-1) {
7857                if ((errors==NULL) || (!strcmp(errors, "strict")))
7858                    known_errorHandler = 1;
7859                else if (!strcmp(errors, "replace"))
7860                    known_errorHandler = 2;
7861                else if (!strcmp(errors, "ignore"))
7862                    known_errorHandler = 3;
7863                else if (!strcmp(errors, "xmlcharrefreplace"))
7864                    known_errorHandler = 4;
7865                else
7866                    known_errorHandler = 0;
7867            }
7868            switch (known_errorHandler) {
7869            case 1: /* strict */
7870                raise_translate_exception(&exc, input, collstart,
7871                                          collend, reason);
7872                goto onError;
7873            case 2: /* replace */
7874                /* No need to check for space, this is a 1:1 replacement */
7875                for (coll = collstart; coll<collend; coll++)
7876                    output[opos++] = '?';
7877                /* fall through */
7878            case 3: /* ignore */
7879                i = collend;
7880                break;
7881            case 4: /* xmlcharrefreplace */
7882                /* generate replacement (temporarily (mis)uses i) */
7883                for (i = collstart; i < collend; ++i) {
7884                    char buffer[2+29+1+1];
7885                    char *cp;
7886                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7887                    if (charmaptranslate_makespace(&output, &osize,
7888                                                   opos+strlen(buffer)+(size-collend)))
7889                        goto onError;
7890                    for (cp = buffer; *cp; ++cp)
7891                        output[opos++] = *cp;
7892                }
7893                i = collend;
7894                break;
7895            default:
7896                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
7897                                                                 reason, input, &exc,
7898                                                                 collstart, collend, &newpos);
7899                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
7900                    goto onError;
7901                /* generate replacement  */
7902                repsize = PyUnicode_GET_LENGTH(repunicode);
7903                if (charmaptranslate_makespace(&output, &osize,
7904                                               opos+repsize+(size-collend))) {
7905                    Py_DECREF(repunicode);
7906                    goto onError;
7907                }
7908                for (uni2 = 0; repsize-->0; ++uni2)
7909                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7910                i = newpos;
7911                Py_DECREF(repunicode);
7912            }
7913        }
7914    }
7915    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
7916    if (!res)
7917        goto onError;
7918    PyMem_Free(output);
7919    Py_XDECREF(exc);
7920    Py_XDECREF(errorHandler);
7921    return res;
7922
7923  onError:
7924    PyMem_Free(output);
7925    Py_XDECREF(exc);
7926    Py_XDECREF(errorHandler);
7927    return NULL;
7928}
7929
7930/* Deprecated. Use PyUnicode_Translate instead. */
7931PyObject *
7932PyUnicode_TranslateCharmap(const Py_UNICODE *p,
7933                           Py_ssize_t size,
7934                           PyObject *mapping,
7935                           const char *errors)
7936{
7937    PyObject *unicode = PyUnicode_FromUnicode(p, size);
7938    if (!unicode)
7939        return NULL;
7940    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
7941}
7942
7943PyObject *
7944PyUnicode_Translate(PyObject *str,
7945                    PyObject *mapping,
7946                    const char *errors)
7947{
7948    PyObject *result;
7949
7950    str = PyUnicode_FromObject(str);
7951    if (str == NULL)
7952        goto onError;
7953    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
7954    Py_DECREF(str);
7955    return result;
7956
7957  onError:
7958    Py_XDECREF(str);
7959    return NULL;
7960}
7961
7962static Py_UCS4
7963fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
7964{
7965    /* No need to call PyUnicode_READY(self) because this function is only
7966       called as a callback from fixup() which does it already. */
7967    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
7968    const int kind = PyUnicode_KIND(self);
7969    void *data = PyUnicode_DATA(self);
7970    Py_UCS4 maxchar = 0, ch, fixed;
7971    Py_ssize_t i;
7972
7973    for (i = 0; i < len; ++i) {
7974        ch = PyUnicode_READ(kind, data, i);
7975        fixed = 0;
7976        if (ch > 127) {
7977            if (Py_UNICODE_ISSPACE(ch))
7978                fixed = ' ';
7979            else {
7980                const int decimal = Py_UNICODE_TODECIMAL(ch);
7981                if (decimal >= 0)
7982                    fixed = '0' + decimal;
7983            }
7984            if (fixed != 0) {
7985                if (fixed > maxchar)
7986                    maxchar = fixed;
7987                PyUnicode_WRITE(kind, data, i, fixed);
7988            }
7989            else if (ch > maxchar)
7990                maxchar = ch;
7991        }
7992        else if (ch > maxchar)
7993            maxchar = ch;
7994    }
7995
7996    return maxchar;
7997}
7998
7999PyObject *
8000_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8001{
8002    if (!PyUnicode_Check(unicode)) {
8003        PyErr_BadInternalCall();
8004        return NULL;
8005    }
8006    if (PyUnicode_READY(unicode) == -1)
8007        return NULL;
8008    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8009        /* If the string is already ASCII, just return the same string */
8010        Py_INCREF(unicode);
8011        return unicode;
8012    }
8013    return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8014}
8015
8016PyObject *
8017PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8018                                  Py_ssize_t length)
8019{
8020    PyObject *result;
8021    Py_UNICODE *p; /* write pointer into result */
8022    Py_ssize_t i;
8023    /* Copy to a new string */
8024    result = (PyObject *)_PyUnicode_New(length);
8025    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8026    if (result == NULL)
8027        return result;
8028    p = PyUnicode_AS_UNICODE(result);
8029    /* Iterate over code points */
8030    for (i = 0; i < length; i++) {
8031        Py_UNICODE ch =s[i];
8032        if (ch > 127) {
8033            int decimal = Py_UNICODE_TODECIMAL(ch);
8034            if (decimal >= 0)
8035                p[i] = '0' + decimal;
8036        }
8037    }
8038    if (PyUnicode_READY((PyUnicodeObject*)result) == -1) {
8039        Py_DECREF(result);
8040        return NULL;
8041    }
8042    return result;
8043}
8044/* --- Decimal Encoder ---------------------------------------------------- */
8045
8046int
8047PyUnicode_EncodeDecimal(Py_UNICODE *s,
8048                        Py_ssize_t length,
8049                        char *output,
8050                        const char *errors)
8051{
8052    Py_UNICODE *p, *end;
8053    PyObject *errorHandler = NULL;
8054    PyObject *exc = NULL;
8055    const char *encoding = "decimal";
8056    const char *reason = "invalid decimal Unicode string";
8057    /* the following variable is used for caching string comparisons
8058     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8059    int known_errorHandler = -1;
8060
8061    if (output == NULL) {
8062        PyErr_BadArgument();
8063        return -1;
8064    }
8065
8066    p = s;
8067    end = s + length;
8068    while (p < end) {
8069        register Py_UNICODE ch = *p;
8070        int decimal;
8071        PyObject *repunicode;
8072        Py_ssize_t repsize;
8073        Py_ssize_t newpos;
8074        Py_UNICODE *uni2;
8075        Py_UNICODE *collstart;
8076        Py_UNICODE *collend;
8077
8078        if (Py_UNICODE_ISSPACE(ch)) {
8079            *output++ = ' ';
8080            ++p;
8081            continue;
8082        }
8083        decimal = Py_UNICODE_TODECIMAL(ch);
8084        if (decimal >= 0) {
8085            *output++ = '0' + decimal;
8086            ++p;
8087            continue;
8088        }
8089        if (0 < ch && ch < 256) {
8090            *output++ = (char)ch;
8091            ++p;
8092            continue;
8093        }
8094        /* All other characters are considered unencodable */
8095        collstart = p;
8096        collend = p+1;
8097        while (collend < end) {
8098            if ((0 < *collend && *collend < 256) ||
8099                !Py_UNICODE_ISSPACE(*collend) ||
8100                Py_UNICODE_TODECIMAL(*collend))
8101                break;
8102        }
8103        /* cache callback name lookup
8104         * (if not done yet, i.e. it's the first error) */
8105        if (known_errorHandler==-1) {
8106            if ((errors==NULL) || (!strcmp(errors, "strict")))
8107                known_errorHandler = 1;
8108            else if (!strcmp(errors, "replace"))
8109                known_errorHandler = 2;
8110            else if (!strcmp(errors, "ignore"))
8111                known_errorHandler = 3;
8112            else if (!strcmp(errors, "xmlcharrefreplace"))
8113                known_errorHandler = 4;
8114            else
8115                known_errorHandler = 0;
8116        }
8117        switch (known_errorHandler) {
8118        case 1: /* strict */
8119            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8120            goto onError;
8121        case 2: /* replace */
8122            for (p = collstart; p < collend; ++p)
8123                *output++ = '?';
8124            /* fall through */
8125        case 3: /* ignore */
8126            p = collend;
8127            break;
8128        case 4: /* xmlcharrefreplace */
8129            /* generate replacement (temporarily (mis)uses p) */
8130            for (p = collstart; p < collend; ++p)
8131                output += sprintf(output, "&#%d;", (int)*p);
8132            p = collend;
8133            break;
8134        default:
8135            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8136                                                          encoding, reason, s, length, &exc,
8137                                                          collstart-s, collend-s, &newpos);
8138            if (repunicode == NULL)
8139                goto onError;
8140            if (!PyUnicode_Check(repunicode)) {
8141                /* Byte results not supported, since they have no decimal property. */
8142                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8143                Py_DECREF(repunicode);
8144                goto onError;
8145            }
8146            /* generate replacement  */
8147            repsize = PyUnicode_GET_SIZE(repunicode);
8148            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8149                Py_UNICODE ch = *uni2;
8150                if (Py_UNICODE_ISSPACE(ch))
8151                    *output++ = ' ';
8152                else {
8153                    decimal = Py_UNICODE_TODECIMAL(ch);
8154                    if (decimal >= 0)
8155                        *output++ = '0' + decimal;
8156                    else if (0 < ch && ch < 256)
8157                        *output++ = (char)ch;
8158                    else {
8159                        Py_DECREF(repunicode);
8160                        raise_encode_exception(&exc, encoding,
8161                                               s, length, collstart-s, collend-s, reason);
8162                        goto onError;
8163                    }
8164                }
8165            }
8166            p = s + newpos;
8167            Py_DECREF(repunicode);
8168        }
8169    }
8170    /* 0-terminate the output string */
8171    *output++ = '\0';
8172    Py_XDECREF(exc);
8173    Py_XDECREF(errorHandler);
8174    return 0;
8175
8176  onError:
8177    Py_XDECREF(exc);
8178    Py_XDECREF(errorHandler);
8179    return -1;
8180}
8181
8182/* --- Helpers ------------------------------------------------------------ */
8183
8184#include "stringlib/ucs1lib.h"
8185#include "stringlib/fastsearch.h"
8186#include "stringlib/partition.h"
8187#include "stringlib/split.h"
8188#include "stringlib/count.h"
8189#include "stringlib/find.h"
8190#include "stringlib/localeutil.h"
8191#include "stringlib/undef.h"
8192
8193#include "stringlib/ucs2lib.h"
8194#include "stringlib/fastsearch.h"
8195#include "stringlib/partition.h"
8196#include "stringlib/split.h"
8197#include "stringlib/count.h"
8198#include "stringlib/find.h"
8199#include "stringlib/localeutil.h"
8200#include "stringlib/undef.h"
8201
8202#include "stringlib/ucs4lib.h"
8203#include "stringlib/fastsearch.h"
8204#include "stringlib/partition.h"
8205#include "stringlib/split.h"
8206#include "stringlib/count.h"
8207#include "stringlib/find.h"
8208#include "stringlib/localeutil.h"
8209#include "stringlib/undef.h"
8210
8211static Py_ssize_t
8212any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8213                                  const Py_UCS1*, Py_ssize_t,
8214                                  Py_ssize_t, Py_ssize_t),
8215               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8216                                  const Py_UCS2*, Py_ssize_t,
8217                                  Py_ssize_t, Py_ssize_t),
8218               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8219                                  const Py_UCS4*, Py_ssize_t,
8220                                  Py_ssize_t, Py_ssize_t),
8221               PyObject* s1, PyObject* s2,
8222               Py_ssize_t start,
8223               Py_ssize_t end)
8224{
8225    int kind1, kind2, kind;
8226    void *buf1, *buf2;
8227    Py_ssize_t len1, len2, result;
8228
8229    kind1 = PyUnicode_KIND(s1);
8230    kind2 = PyUnicode_KIND(s2);
8231    kind = kind1 > kind2 ? kind1 : kind2;
8232    buf1 = PyUnicode_DATA(s1);
8233    buf2 = PyUnicode_DATA(s2);
8234    if (kind1 != kind)
8235        buf1 = _PyUnicode_AsKind(s1, kind);
8236    if (!buf1)
8237        return -2;
8238    if (kind2 != kind)
8239        buf2 = _PyUnicode_AsKind(s2, kind);
8240    if (!buf2) {
8241        if (kind1 != kind) PyMem_Free(buf1);
8242        return -2;
8243    }
8244    len1 = PyUnicode_GET_LENGTH(s1);
8245    len2 = PyUnicode_GET_LENGTH(s2);
8246
8247    switch(kind) {
8248    case PyUnicode_1BYTE_KIND:
8249        result = ucs1(buf1, len1, buf2, len2, start, end);
8250        break;
8251    case PyUnicode_2BYTE_KIND:
8252        result = ucs2(buf1, len1, buf2, len2, start, end);
8253        break;
8254    case PyUnicode_4BYTE_KIND:
8255        result = ucs4(buf1, len1, buf2, len2, start, end);
8256        break;
8257    default:
8258        assert(0); result = -2;
8259    }
8260
8261    if (kind1 != kind)
8262        PyMem_Free(buf1);
8263    if (kind2 != kind)
8264        PyMem_Free(buf2);
8265
8266    return result;
8267}
8268
8269Py_ssize_t
8270_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8271                                   Py_ssize_t n_buffer,
8272                                   void *digits, Py_ssize_t n_digits,
8273                                   Py_ssize_t min_width,
8274                                   const char *grouping,
8275                                   const char *thousands_sep)
8276{
8277    switch(kind) {
8278    case PyUnicode_1BYTE_KIND:
8279        return _PyUnicode_ucs1_InsertThousandsGrouping(
8280            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8281            min_width, grouping, thousands_sep);
8282    case PyUnicode_2BYTE_KIND:
8283        return _PyUnicode_ucs2_InsertThousandsGrouping(
8284            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8285            min_width, grouping, thousands_sep);
8286    case PyUnicode_4BYTE_KIND:
8287        return _PyUnicode_ucs4_InsertThousandsGrouping(
8288            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8289            min_width, grouping, thousands_sep);
8290    }
8291    assert(0);
8292    return -1;
8293}
8294
8295
8296#include "stringlib/unicodedefs.h"
8297#include "stringlib/fastsearch.h"
8298
8299#include "stringlib/count.h"
8300#include "stringlib/find.h"
8301
/* Helper macro to fix up start/end slice values against a length `len`.

   - `end` larger than `len` is clamped to `len`;
   - a negative `end` or `start` has `len` added to it and is then clamped
     to 0 if it is still negative;
   - `start` is NOT clamped against `len`, so it may end up greater than
     `end`; callers have to cope with an empty or inverted range.

   `start` and `end` must be modifiable lvalues, and every argument is
   evaluated more than once.  The expansion is a bare if / else-if sequence
   (it is not wrapped in do { } while (0)), so it must not be used as the
   unbraced body of another if/else.

   NOTE(review): the stringlib headers included above may define a macro
   with this same name; if so, this replacement list has to stay
   token-identical to theirs -- confirm before changing it. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
8316
8317Py_ssize_t
8318PyUnicode_Count(PyObject *str,
8319                PyObject *substr,
8320                Py_ssize_t start,
8321                Py_ssize_t end)
8322{
8323    Py_ssize_t result;
8324    PyUnicodeObject* str_obj;
8325    PyUnicodeObject* sub_obj;
8326    int kind1, kind2, kind;
8327    void *buf1 = NULL, *buf2 = NULL;
8328    Py_ssize_t len1, len2;
8329
8330    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8331    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8332        return -1;
8333    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8334    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8335        Py_DECREF(str_obj);
8336        return -1;
8337    }
8338
8339    kind1 = PyUnicode_KIND(str_obj);
8340    kind2 = PyUnicode_KIND(sub_obj);
8341    kind = kind1 > kind2 ? kind1 : kind2;
8342    buf1 = PyUnicode_DATA(str_obj);
8343    if (kind1 != kind)
8344        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8345    if (!buf1)
8346        goto onError;
8347    buf2 = PyUnicode_DATA(sub_obj);
8348    if (kind2 != kind)
8349        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8350    if (!buf2)
8351        goto onError;
8352    len1 = PyUnicode_GET_LENGTH(str_obj);
8353    len2 = PyUnicode_GET_LENGTH(sub_obj);
8354
8355    ADJUST_INDICES(start, end, len1);
8356    switch(kind) {
8357    case PyUnicode_1BYTE_KIND:
8358        result = ucs1lib_count(
8359            ((Py_UCS1*)buf1) + start, end - start,
8360            buf2, len2, PY_SSIZE_T_MAX
8361            );
8362        break;
8363    case PyUnicode_2BYTE_KIND:
8364        result = ucs2lib_count(
8365            ((Py_UCS2*)buf1) + start, end - start,
8366            buf2, len2, PY_SSIZE_T_MAX
8367            );
8368        break;
8369    case PyUnicode_4BYTE_KIND:
8370        result = ucs4lib_count(
8371            ((Py_UCS4*)buf1) + start, end - start,
8372            buf2, len2, PY_SSIZE_T_MAX
8373            );
8374        break;
8375    default:
8376        assert(0); result = 0;
8377    }
8378
8379    Py_DECREF(sub_obj);
8380    Py_DECREF(str_obj);
8381
8382    if (kind1 != kind)
8383        PyMem_Free(buf1);
8384    if (kind2 != kind)
8385        PyMem_Free(buf2);
8386
8387    return result;
8388  onError:
8389    Py_DECREF(sub_obj);
8390    Py_DECREF(str_obj);
8391    if (kind1 != kind && buf1)
8392        PyMem_Free(buf1);
8393    if (kind2 != kind && buf2)
8394        PyMem_Free(buf2);
8395    return -1;
8396}
8397
8398Py_ssize_t
8399PyUnicode_Find(PyObject *str,
8400               PyObject *sub,
8401               Py_ssize_t start,
8402               Py_ssize_t end,
8403               int direction)
8404{
8405    Py_ssize_t result;
8406
8407    str = PyUnicode_FromObject(str);
8408    if (!str || PyUnicode_READY(str) == -1)
8409        return -2;
8410    sub = PyUnicode_FromObject(sub);
8411    if (!sub || PyUnicode_READY(sub) == -1) {
8412        Py_DECREF(str);
8413        return -2;
8414    }
8415
8416    if (direction > 0)
8417        result = any_find_slice(
8418            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8419            str, sub, start, end
8420            );
8421    else
8422        result = any_find_slice(
8423            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8424            str, sub, start, end
8425            );
8426
8427    Py_DECREF(str);
8428    Py_DECREF(sub);
8429
8430    return result;
8431}
8432
8433Py_ssize_t
8434PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8435                   Py_ssize_t start, Py_ssize_t end,
8436                   int direction)
8437{
8438    char *result;
8439    int kind;
8440    if (PyUnicode_READY(str) == -1)
8441        return -2;
8442    if (start < 0 || end < 0) {
8443        PyErr_SetString(PyExc_IndexError, "string index out of range");
8444        return -2;
8445    }
8446    if (end > PyUnicode_GET_LENGTH(str))
8447        end = PyUnicode_GET_LENGTH(str);
8448    kind = PyUnicode_KIND(str);
8449    result = findchar(PyUnicode_1BYTE_DATA(str)
8450                      + PyUnicode_KIND_SIZE(kind, start),
8451                      kind,
8452                      end-start, ch, direction);
8453    if (!result)
8454        return -1;
8455    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8456}
8457
8458static int
8459tailmatch(PyUnicodeObject *self,
8460          PyUnicodeObject *substring,
8461          Py_ssize_t start,
8462          Py_ssize_t end,
8463          int direction)
8464{
8465    int kind_self;
8466    int kind_sub;
8467    void *data_self;
8468    void *data_sub;
8469    Py_ssize_t offset;
8470    Py_ssize_t i;
8471    Py_ssize_t end_sub;
8472
8473    if (PyUnicode_READY(self) == -1 ||
8474        PyUnicode_READY(substring) == -1)
8475        return 0;
8476
8477    if (PyUnicode_GET_LENGTH(substring) == 0)
8478        return 1;
8479
8480    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8481    end -= PyUnicode_GET_LENGTH(substring);
8482    if (end < start)
8483        return 0;
8484
8485    kind_self = PyUnicode_KIND(self);
8486    data_self = PyUnicode_DATA(self);
8487    kind_sub = PyUnicode_KIND(substring);
8488    data_sub = PyUnicode_DATA(substring);
8489    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8490
8491    if (direction > 0)
8492        offset = end;
8493    else
8494        offset = start;
8495
8496    if (PyUnicode_READ(kind_self, data_self, offset) ==
8497        PyUnicode_READ(kind_sub, data_sub, 0) &&
8498        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8499        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8500        /* If both are of the same kind, memcmp is sufficient */
8501        if (kind_self == kind_sub) {
8502            return ! memcmp((char *)data_self +
8503                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8504                            data_sub,
8505                            PyUnicode_GET_LENGTH(substring) *
8506                                PyUnicode_CHARACTER_SIZE(substring));
8507        }
8508        /* otherwise we have to compare each character by first accesing it */
8509        else {
8510            /* We do not need to compare 0 and len(substring)-1 because
8511               the if statement above ensured already that they are equal
8512               when we end up here. */
8513            // TODO: honor direction and do a forward or backwards search
8514            for (i = 1; i < end_sub; ++i) {
8515                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8516                    PyUnicode_READ(kind_sub, data_sub, i))
8517                    return 0;
8518            }
8519            return 1;
8520        }
8521    }
8522
8523    return 0;
8524}
8525
8526Py_ssize_t
8527PyUnicode_Tailmatch(PyObject *str,
8528                    PyObject *substr,
8529                    Py_ssize_t start,
8530                    Py_ssize_t end,
8531                    int direction)
8532{
8533    Py_ssize_t result;
8534
8535    str = PyUnicode_FromObject(str);
8536    if (str == NULL)
8537        return -1;
8538    substr = PyUnicode_FromObject(substr);
8539    if (substr == NULL) {
8540        Py_DECREF(str);
8541        return -1;
8542    }
8543
8544    result = tailmatch((PyUnicodeObject *)str,
8545                       (PyUnicodeObject *)substr,
8546                       start, end, direction);
8547    Py_DECREF(str);
8548    Py_DECREF(substr);
8549    return result;
8550}
8551
8552/* Apply fixfct filter to the Unicode object self and return a
8553   reference to the modified object */
8554
8555static PyObject *
8556fixup(PyUnicodeObject *self,
8557      Py_UCS4 (*fixfct)(PyUnicodeObject *s))
8558{
8559    PyObject *u;
8560    Py_UCS4 maxchar_old, maxchar_new = 0;
8561
8562    if (PyUnicode_READY(self) == -1)
8563        return NULL;
8564    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8565    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8566                      maxchar_old);
8567    if (u == NULL)
8568        return NULL;
8569
8570    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8571              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8572
8573    /* fix functions return the new maximum character in a string,
8574       if the kind of the resulting unicode object does not change,
8575       everything is fine.  Otherwise we need to change the string kind
8576       and re-run the fix function. */
8577    maxchar_new = fixfct((PyUnicodeObject*)u);
8578    if (maxchar_new == 0)
8579        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8580    else if (maxchar_new <= 127)
8581        maxchar_new = 127;
8582    else if (maxchar_new <= 255)
8583        maxchar_new = 255;
8584    else if (maxchar_new <= 65535)
8585        maxchar_new = 65535;
8586    else
8587        maxchar_new = 1114111; /* 0x10ffff */
8588
8589    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8590        /* fixfct should return TRUE if it modified the buffer. If
8591           FALSE, return a reference to the original buffer instead
8592           (to save space, not time) */
8593        Py_INCREF(self);
8594        Py_DECREF(u);
8595        return (PyObject*) self;
8596    }
8597    else if (maxchar_new == maxchar_old) {
8598        return u;
8599    }
8600    else {
8601        /* In case the maximum character changed, we need to
8602           convert the string to the new category. */
8603        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8604        if (v == NULL) {
8605            Py_DECREF(u);
8606            return NULL;
8607        }
8608        if (maxchar_new > maxchar_old) {
8609            /* If the maxchar increased so that the kind changed, not all
8610               characters are representable anymore and we need to fix the
8611               string again. This only happens in very few cases. */
8612            if (PyUnicode_CopyCharacters(v, 0,
8613                                         (PyObject*)self, 0,
8614                                         PyUnicode_GET_LENGTH(self)) < 0)
8615            {
8616                Py_DECREF(u);
8617                return NULL;
8618            }
8619            maxchar_old = fixfct((PyUnicodeObject*)v);
8620            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8621        }
8622        else {
8623            if (PyUnicode_CopyCharacters(v, 0,
8624                                         u, 0,
8625                                         PyUnicode_GET_LENGTH(self)) < 0)
8626            {
8627                Py_DECREF(u);
8628                return NULL;
8629            }
8630        }
8631
8632        Py_DECREF(u);
8633        return v;
8634    }
8635}
8636
8637static Py_UCS4
8638fixupper(PyUnicodeObject *self)
8639{
8640    /* No need to call PyUnicode_READY(self) because this function is only
8641       called as a callback from fixup() which does it already. */
8642    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8643    const int kind = PyUnicode_KIND(self);
8644    void *data = PyUnicode_DATA(self);
8645    int touched = 0;
8646    Py_UCS4 maxchar = 0;
8647    Py_ssize_t i;
8648
8649    for (i = 0; i < len; ++i) {
8650        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8651        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8652        if (up != ch) {
8653            if (up > maxchar)
8654                maxchar = up;
8655            PyUnicode_WRITE(kind, data, i, up);
8656            touched = 1;
8657        }
8658        else if (ch > maxchar)
8659            maxchar = ch;
8660    }
8661
8662    if (touched)
8663        return maxchar;
8664    else
8665        return 0;
8666}
8667
8668static Py_UCS4
8669fixlower(PyUnicodeObject *self)
8670{
8671    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8672    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8673    const int kind = PyUnicode_KIND(self);
8674    void *data = PyUnicode_DATA(self);
8675    int touched = 0;
8676    Py_UCS4 maxchar = 0;
8677    Py_ssize_t i;
8678
8679    for(i = 0; i < len; ++i) {
8680        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8681        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8682        if (lo != ch) {
8683            if (lo > maxchar)
8684                maxchar = lo;
8685            PyUnicode_WRITE(kind, data, i, lo);
8686            touched = 1;
8687        }
8688        else if (ch > maxchar)
8689            maxchar = ch;
8690    }
8691
8692    if (touched)
8693        return maxchar;
8694    else
8695        return 0;
8696}
8697
8698static Py_UCS4
8699fixswapcase(PyUnicodeObject *self)
8700{
8701    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8702    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8703    const int kind = PyUnicode_KIND(self);
8704    void *data = PyUnicode_DATA(self);
8705    int touched = 0;
8706    Py_UCS4 maxchar = 0;
8707    Py_ssize_t i;
8708
8709    for(i = 0; i < len; ++i) {
8710        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8711        Py_UCS4 nu = 0;
8712
8713        if (Py_UNICODE_ISUPPER(ch))
8714            nu = Py_UNICODE_TOLOWER(ch);
8715        else if (Py_UNICODE_ISLOWER(ch))
8716            nu = Py_UNICODE_TOUPPER(ch);
8717
8718        if (nu != 0) {
8719            if (nu > maxchar)
8720                maxchar = nu;
8721            PyUnicode_WRITE(kind, data, i, nu);
8722            touched = 1;
8723        }
8724        else if (ch > maxchar)
8725            maxchar = ch;
8726    }
8727
8728    if (touched)
8729        return maxchar;
8730    else
8731        return 0;
8732}
8733
8734static Py_UCS4
8735fixcapitalize(PyUnicodeObject *self)
8736{
8737    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8738    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8739    const int kind = PyUnicode_KIND(self);
8740    void *data = PyUnicode_DATA(self);
8741    int touched = 0;
8742    Py_UCS4 maxchar = 0;
8743    Py_ssize_t i = 0;
8744    Py_UCS4 ch;
8745
8746    if (len == 0)
8747        return 0;
8748
8749    ch = PyUnicode_READ(kind, data, i);
8750    if (!Py_UNICODE_ISUPPER(ch)) {
8751        maxchar = Py_UNICODE_TOUPPER(ch);
8752        PyUnicode_WRITE(kind, data, i, maxchar);
8753        touched = 1;
8754    }
8755    ++i;
8756    for(; i < len; ++i) {
8757        ch = PyUnicode_READ(kind, data, i);
8758        if (!Py_UNICODE_ISLOWER(ch)) {
8759            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8760            if (lo > maxchar)
8761                maxchar = lo;
8762            PyUnicode_WRITE(kind, data, i, lo);
8763            touched = 1;
8764        }
8765        else if (ch > maxchar)
8766            maxchar = ch;
8767    }
8768
8769    if (touched)
8770        return maxchar;
8771    else
8772        return 0;
8773}
8774
8775static Py_UCS4
8776fixtitle(PyUnicodeObject *self)
8777{
8778    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8779    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8780    const int kind = PyUnicode_KIND(self);
8781    void *data = PyUnicode_DATA(self);
8782    Py_UCS4 maxchar = 0;
8783    Py_ssize_t i = 0;
8784    int previous_is_cased;
8785
8786    /* Shortcut for single character strings */
8787    if (len == 1) {
8788        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8789        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8790        if (ti != ch) {
8791            PyUnicode_WRITE(kind, data, i, ti);
8792            return ti;
8793        }
8794        else
8795            return 0;
8796    }
8797    previous_is_cased = 0;
8798    for(; i < len; ++i) {
8799        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8800        Py_UCS4 nu;
8801
8802        if (previous_is_cased)
8803            nu = Py_UNICODE_TOLOWER(ch);
8804        else
8805            nu = Py_UNICODE_TOTITLE(ch);
8806
8807        if (nu > maxchar)
8808            maxchar = nu;
8809        PyUnicode_WRITE(kind, data, i, nu);
8810
8811        if (Py_UNICODE_ISLOWER(ch) ||
8812            Py_UNICODE_ISUPPER(ch) ||
8813            Py_UNICODE_ISTITLE(ch))
8814            previous_is_cased = 1;
8815        else
8816            previous_is_cased = 0;
8817    }
8818    return maxchar;
8819}
8820
8821PyObject *
8822PyUnicode_Join(PyObject *separator, PyObject *seq)
8823{
8824    PyObject *sep = NULL;
8825    Py_ssize_t seplen = 1;
8826    PyObject *res = NULL; /* the result */
8827    PyObject *fseq;          /* PySequence_Fast(seq) */
8828    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
8829    PyObject **items;
8830    PyObject *item;
8831    Py_ssize_t sz, i, res_offset;
8832    Py_UCS4 maxchar = 0;
8833    Py_UCS4 item_maxchar;
8834
8835    fseq = PySequence_Fast(seq, "");
8836    if (fseq == NULL) {
8837        return NULL;
8838    }
8839
8840    /* NOTE: the following code can't call back into Python code,
8841     * so we are sure that fseq won't be mutated.
8842     */
8843
8844    seqlen = PySequence_Fast_GET_SIZE(fseq);
8845    /* If empty sequence, return u"". */
8846    if (seqlen == 0) {
8847        res = PyUnicode_New(0, 0);
8848        goto Done;
8849    }
8850    items = PySequence_Fast_ITEMS(fseq);
8851    /* If singleton sequence with an exact Unicode, return that. */
8852    if (seqlen == 1) {
8853        item = items[0];
8854        if (PyUnicode_CheckExact(item)) {
8855            Py_INCREF(item);
8856            res = item;
8857            goto Done;
8858        }
8859    }
8860    else {
8861        /* Set up sep and seplen */
8862        if (separator == NULL) {
8863            /* fall back to a blank space separator */
8864            sep = PyUnicode_FromOrdinal(' ');
8865            if (!sep)
8866                goto onError;
8867        }
8868        else {
8869            if (!PyUnicode_Check(separator)) {
8870                PyErr_Format(PyExc_TypeError,
8871                             "separator: expected str instance,"
8872                             " %.80s found",
8873                             Py_TYPE(separator)->tp_name);
8874                goto onError;
8875            }
8876            if (PyUnicode_READY(separator))
8877                goto onError;
8878            sep = separator;
8879            seplen = PyUnicode_GET_LENGTH(separator);
8880            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8881            /* inc refcount to keep this code path symetric with the
8882               above case of a blank separator */
8883            Py_INCREF(sep);
8884        }
8885    }
8886
8887    /* There are at least two things to join, or else we have a subclass
8888     * of str in the sequence.
8889     * Do a pre-pass to figure out the total amount of space we'll
8890     * need (sz), and see whether all argument are strings.
8891     */
8892    sz = 0;
8893    for (i = 0; i < seqlen; i++) {
8894        const Py_ssize_t old_sz = sz;
8895        item = items[i];
8896        if (!PyUnicode_Check(item)) {
8897            PyErr_Format(PyExc_TypeError,
8898                         "sequence item %zd: expected str instance,"
8899                         " %.80s found",
8900                         i, Py_TYPE(item)->tp_name);
8901            goto onError;
8902        }
8903        if (PyUnicode_READY(item) == -1)
8904            goto onError;
8905        sz += PyUnicode_GET_LENGTH(item);
8906        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8907        if (item_maxchar > maxchar)
8908            maxchar = item_maxchar;
8909        if (i != 0)
8910            sz += seplen;
8911        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
8912            PyErr_SetString(PyExc_OverflowError,
8913                            "join() result is too long for a Python string");
8914            goto onError;
8915        }
8916    }
8917
8918    res = PyUnicode_New(sz, maxchar);
8919    if (res == NULL)
8920        goto onError;
8921
8922    /* Catenate everything. */
8923    for (i = 0, res_offset = 0; i < seqlen; ++i) {
8924        Py_ssize_t itemlen, copied;
8925        item = items[i];
8926        /* Copy item, and maybe the separator. */
8927        if (i && seplen != 0) {
8928            copied = PyUnicode_CopyCharacters(res, res_offset,
8929                                              sep, 0, seplen);
8930            if (copied < 0)
8931                goto onError;
8932#ifdef Py_DEBUG
8933            res_offset += copied;
8934#else
8935            res_offset += seplen;
8936#endif
8937        }
8938        itemlen = PyUnicode_GET_LENGTH(item);
8939        if (itemlen != 0) {
8940            copied = PyUnicode_CopyCharacters(res, res_offset,
8941                                              item, 0, itemlen);
8942            if (copied < 0)
8943                goto onError;
8944#ifdef Py_DEBUG
8945            res_offset += copied;
8946#else
8947            res_offset += itemlen;
8948#endif
8949        }
8950    }
8951    assert(res_offset == PyUnicode_GET_LENGTH(res));
8952
8953  Done:
8954    Py_DECREF(fseq);
8955    Py_XDECREF(sep);
8956    return res;
8957
8958  onError:
8959    Py_DECREF(fseq);
8960    Py_XDECREF(sep);
8961    Py_XDECREF(res);
8962    return NULL;
8963}
8964
/* Store `value` into `length` consecutive characters of the buffer `data`,
   beginning at character index `start`.  `kind` selects the character
   width: 1-byte buffers are filled with memset(), 2-byte buffers with a
   loop over Py_UCS2, and every other kind is handled as Py_UCS4.
   `kind` must not be PyUnicode_WCHAR_KIND. */
#define FILL(kind, data, value, start, length) \
    do { \
        assert(kind != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: \
            memset((unsigned char *)((data)) + (start), \
                   (unsigned char)value, length); \
            break; \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 *dst_ = (Py_UCS2 *)((data)) + (start); \
            Py_ssize_t n_; \
            for (n_ = 0; n_ < (length); n_++) \
                dst_[n_] = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 *dst_ = (Py_UCS4 *)((data)) + (start); \
            Py_ssize_t n_; \
            for (n_ = 0; n_ < (length); n_++) \
                dst_[n_] = (value); \
            break; \
        } \
        } \
    } while (0)
8987
8988static PyUnicodeObject *
8989pad(PyUnicodeObject *self,
8990    Py_ssize_t left,
8991    Py_ssize_t right,
8992    Py_UCS4 fill)
8993{
8994    PyObject *u;
8995    Py_UCS4 maxchar;
8996    int kind;
8997    void *data;
8998
8999    if (left < 0)
9000        left = 0;
9001    if (right < 0)
9002        right = 0;
9003
9004    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9005        Py_INCREF(self);
9006        return self;
9007    }
9008
9009    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9010        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9011        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9012        return NULL;
9013    }
9014    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9015    if (fill > maxchar)
9016        maxchar = fill;
9017    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9018    if (!u)
9019        return NULL;
9020
9021    kind = PyUnicode_KIND(u);
9022    data = PyUnicode_DATA(u);
9023    if (left)
9024        FILL(kind, data, fill, 0, left);
9025    if (right)
9026        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9027    if (PyUnicode_CopyCharacters(u, left,
9028                                 (PyObject*)self, 0,
9029                                 _PyUnicode_LENGTH(self)) < 0)
9030    {
9031        Py_DECREF(u);
9032        return NULL;
9033    }
9034
9035    return (PyUnicodeObject*)u;
9036}
9037#undef FILL
9038
9039PyObject *
9040PyUnicode_Splitlines(PyObject *string, int keepends)
9041{
9042    PyObject *list;
9043
9044    string = PyUnicode_FromObject(string);
9045    if (string == NULL || PyUnicode_READY(string) == -1)
9046        return NULL;
9047
9048    switch(PyUnicode_KIND(string)) {
9049    case PyUnicode_1BYTE_KIND:
9050        list = ucs1lib_splitlines(
9051            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9052            PyUnicode_GET_LENGTH(string), keepends);
9053        break;
9054    case PyUnicode_2BYTE_KIND:
9055        list = ucs2lib_splitlines(
9056            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9057            PyUnicode_GET_LENGTH(string), keepends);
9058        break;
9059    case PyUnicode_4BYTE_KIND:
9060        list = ucs4lib_splitlines(
9061            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9062            PyUnicode_GET_LENGTH(string), keepends);
9063        break;
9064    default:
9065        assert(0);
9066        list = 0;
9067    }
9068    Py_DECREF(string);
9069    return list;
9070}
9071
9072static PyObject *
9073split(PyUnicodeObject *self,
9074      PyUnicodeObject *substring,
9075      Py_ssize_t maxcount)
9076{
9077    int kind1, kind2, kind;
9078    void *buf1, *buf2;
9079    Py_ssize_t len1, len2;
9080    PyObject* out;
9081
9082    if (maxcount < 0)
9083        maxcount = PY_SSIZE_T_MAX;
9084
9085    if (PyUnicode_READY(self) == -1)
9086        return NULL;
9087
9088    if (substring == NULL)
9089        switch(PyUnicode_KIND(self)) {
9090        case PyUnicode_1BYTE_KIND:
9091            return ucs1lib_split_whitespace(
9092                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9093                PyUnicode_GET_LENGTH(self), maxcount
9094                );
9095        case PyUnicode_2BYTE_KIND:
9096            return ucs2lib_split_whitespace(
9097                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9098                PyUnicode_GET_LENGTH(self), maxcount
9099                );
9100        case PyUnicode_4BYTE_KIND:
9101            return ucs4lib_split_whitespace(
9102                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9103                PyUnicode_GET_LENGTH(self), maxcount
9104                );
9105        default:
9106            assert(0);
9107            return NULL;
9108        }
9109
9110    if (PyUnicode_READY(substring) == -1)
9111        return NULL;
9112
9113    kind1 = PyUnicode_KIND(self);
9114    kind2 = PyUnicode_KIND(substring);
9115    kind = kind1 > kind2 ? kind1 : kind2;
9116    buf1 = PyUnicode_DATA(self);
9117    buf2 = PyUnicode_DATA(substring);
9118    if (kind1 != kind)
9119        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9120    if (!buf1)
9121        return NULL;
9122    if (kind2 != kind)
9123        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9124    if (!buf2) {
9125        if (kind1 != kind) PyMem_Free(buf1);
9126        return NULL;
9127    }
9128    len1 = PyUnicode_GET_LENGTH(self);
9129    len2 = PyUnicode_GET_LENGTH(substring);
9130
9131    switch(kind) {
9132    case PyUnicode_1BYTE_KIND:
9133        out = ucs1lib_split(
9134            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9135        break;
9136    case PyUnicode_2BYTE_KIND:
9137        out = ucs2lib_split(
9138            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9139        break;
9140    case PyUnicode_4BYTE_KIND:
9141        out = ucs4lib_split(
9142            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9143        break;
9144    default:
9145        out = NULL;
9146    }
9147    if (kind1 != kind)
9148        PyMem_Free(buf1);
9149    if (kind2 != kind)
9150        PyMem_Free(buf2);
9151    return out;
9152}
9153
9154static PyObject *
9155rsplit(PyUnicodeObject *self,
9156       PyUnicodeObject *substring,
9157       Py_ssize_t maxcount)
9158{
9159    int kind1, kind2, kind;
9160    void *buf1, *buf2;
9161    Py_ssize_t len1, len2;
9162    PyObject* out;
9163
9164    if (maxcount < 0)
9165        maxcount = PY_SSIZE_T_MAX;
9166
9167    if (PyUnicode_READY(self) == -1)
9168        return NULL;
9169
9170    if (substring == NULL)
9171        switch(PyUnicode_KIND(self)) {
9172        case PyUnicode_1BYTE_KIND:
9173            return ucs1lib_rsplit_whitespace(
9174                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9175                PyUnicode_GET_LENGTH(self), maxcount
9176                );
9177        case PyUnicode_2BYTE_KIND:
9178            return ucs2lib_rsplit_whitespace(
9179                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9180                PyUnicode_GET_LENGTH(self), maxcount
9181                );
9182        case PyUnicode_4BYTE_KIND:
9183            return ucs4lib_rsplit_whitespace(
9184                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9185                PyUnicode_GET_LENGTH(self), maxcount
9186                );
9187        default:
9188            assert(0);
9189            return NULL;
9190        }
9191
9192    if (PyUnicode_READY(substring) == -1)
9193        return NULL;
9194
9195    kind1 = PyUnicode_KIND(self);
9196    kind2 = PyUnicode_KIND(substring);
9197    kind = kind1 > kind2 ? kind1 : kind2;
9198    buf1 = PyUnicode_DATA(self);
9199    buf2 = PyUnicode_DATA(substring);
9200    if (kind1 != kind)
9201        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9202    if (!buf1)
9203        return NULL;
9204    if (kind2 != kind)
9205        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9206    if (!buf2) {
9207        if (kind1 != kind) PyMem_Free(buf1);
9208        return NULL;
9209    }
9210    len1 = PyUnicode_GET_LENGTH(self);
9211    len2 = PyUnicode_GET_LENGTH(substring);
9212
9213    switch(kind) {
9214    case PyUnicode_1BYTE_KIND:
9215        out = ucs1lib_rsplit(
9216            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9217        break;
9218    case PyUnicode_2BYTE_KIND:
9219        out = ucs2lib_rsplit(
9220            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9221        break;
9222    case PyUnicode_4BYTE_KIND:
9223        out = ucs4lib_rsplit(
9224            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9225        break;
9226    default:
9227        out = NULL;
9228    }
9229    if (kind1 != kind)
9230        PyMem_Free(buf1);
9231    if (kind2 != kind)
9232        PyMem_Free(buf2);
9233    return out;
9234}
9235
9236static Py_ssize_t
9237anylib_find(int kind, void *buf1, Py_ssize_t len1,
9238            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9239{
9240    switch(kind) {
9241    case PyUnicode_1BYTE_KIND:
9242        return ucs1lib_find(buf1, len1, buf2, len2, offset);
9243    case PyUnicode_2BYTE_KIND:
9244        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9245    case PyUnicode_4BYTE_KIND:
9246        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9247    }
9248    assert(0);
9249    return -1;
9250}
9251
9252static Py_ssize_t
9253anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9254             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9255{
9256        switch(kind) {
9257        case PyUnicode_1BYTE_KIND:
9258            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9259        case PyUnicode_2BYTE_KIND:
9260            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9261        case PyUnicode_4BYTE_KIND:
9262            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9263        }
9264        assert(0);
9265        return 0;
9266}
9267
9268static PyObject *
9269replace(PyObject *self, PyObject *str1,
9270        PyObject *str2, Py_ssize_t maxcount)
9271{
9272    PyObject *u;
9273    char *sbuf = PyUnicode_DATA(self);
9274    char *buf1 = PyUnicode_DATA(str1);
9275    char *buf2 = PyUnicode_DATA(str2);
9276    int srelease = 0, release1 = 0, release2 = 0;
9277    int skind = PyUnicode_KIND(self);
9278    int kind1 = PyUnicode_KIND(str1);
9279    int kind2 = PyUnicode_KIND(str2);
9280    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9281    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9282    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9283
9284    if (maxcount < 0)
9285        maxcount = PY_SSIZE_T_MAX;
9286    else if (maxcount == 0 || slen == 0)
9287        goto nothing;
9288
9289    if (skind < kind1)
9290        /* substring too wide to be present */
9291        goto nothing;
9292
9293    if (len1 == len2) {
9294        Py_ssize_t i;
9295        /* same length */
9296        if (len1 == 0)
9297            goto nothing;
9298        if (len1 == 1) {
9299            /* replace characters */
9300            Py_UCS4 u1, u2, maxchar;
9301            int mayshrink, rkind;
9302            u1 = PyUnicode_READ_CHAR(str1, 0);
9303            if (!findchar(sbuf, PyUnicode_KIND(self),
9304                          slen, u1, 1))
9305                goto nothing;
9306            u2 = PyUnicode_READ_CHAR(str2, 0);
9307            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9308            /* Replacing u1 with u2 may cause a maxchar reduction in the
9309               result string. */
9310            mayshrink = maxchar > 127;
9311            if (u2 > maxchar) {
9312                maxchar = u2;
9313                mayshrink = 0;
9314            }
9315            u = PyUnicode_New(slen, maxchar);
9316            if (!u)
9317                goto error;
9318            if (PyUnicode_CopyCharacters(u, 0,
9319                                         (PyObject*)self, 0, slen) < 0)
9320            {
9321                Py_DECREF(u);
9322                return NULL;
9323            }
9324            rkind = PyUnicode_KIND(u);
9325            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9326                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9327                    if (--maxcount < 0)
9328                        break;
9329                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9330                }
9331            if (mayshrink) {
9332                PyObject *tmp = u;
9333                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9334                                              PyUnicode_GET_LENGTH(tmp));
9335                Py_DECREF(tmp);
9336            }
9337        } else {
9338            int rkind = skind;
9339            char *res;
9340            if (kind1 < rkind) {
9341                /* widen substring */
9342                buf1 = _PyUnicode_AsKind(str1, rkind);
9343                if (!buf1) goto error;
9344                release1 = 1;
9345            }
9346            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9347            if (i < 0)
9348                goto nothing;
9349            if (rkind > kind2) {
9350                /* widen replacement */
9351                buf2 = _PyUnicode_AsKind(str2, rkind);
9352                if (!buf2) goto error;
9353                release2 = 1;
9354            }
9355            else if (rkind < kind2) {
9356                /* widen self and buf1 */
9357                rkind = kind2;
9358                if (release1) PyMem_Free(buf1);
9359                sbuf = _PyUnicode_AsKind(self, rkind);
9360                if (!sbuf) goto error;
9361                srelease = 1;
9362                buf1 = _PyUnicode_AsKind(str1, rkind);
9363                if (!buf1) goto error;
9364                release1 = 1;
9365            }
9366            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9367            if (!res) {
9368                PyErr_NoMemory();
9369                goto error;
9370            }
9371            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9372            /* change everything in-place, starting with this one */
9373            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9374                   buf2,
9375                   PyUnicode_KIND_SIZE(rkind, len2));
9376            i += len1;
9377
9378            while ( --maxcount > 0) {
9379                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9380                                slen-i,
9381                                buf1, len1, i);
9382                if (i == -1)
9383                    break;
9384                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9385                       buf2,
9386                       PyUnicode_KIND_SIZE(rkind, len2));
9387                i += len1;
9388            }
9389
9390            u = PyUnicode_FromKindAndData(rkind, res, slen);
9391            PyMem_Free(res);
9392            if (!u) goto error;
9393        }
9394    } else {
9395
9396        Py_ssize_t n, i, j, ires;
9397        Py_ssize_t product, new_size;
9398        int rkind = skind;
9399        char *res;
9400
9401        if (kind1 < rkind) {
9402            buf1 = _PyUnicode_AsKind(str1, rkind);
9403            if (!buf1) goto error;
9404            release1 = 1;
9405        }
9406        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9407        if (n == 0)
9408            goto nothing;
9409        if (kind2 < rkind) {
9410            buf2 = _PyUnicode_AsKind(str2, rkind);
9411            if (!buf2) goto error;
9412            release2 = 1;
9413        }
9414        else if (kind2 > rkind) {
9415            rkind = kind2;
9416            sbuf = _PyUnicode_AsKind(self, rkind);
9417            if (!sbuf) goto error;
9418            srelease = 1;
9419            if (release1) PyMem_Free(buf1);
9420            buf1 = _PyUnicode_AsKind(str1, rkind);
9421            if (!buf1) goto error;
9422            release1 = 1;
9423        }
9424        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9425           PyUnicode_GET_LENGTH(str1))); */
9426        product = n * (len2-len1);
9427        if ((product / (len2-len1)) != n) {
9428                PyErr_SetString(PyExc_OverflowError,
9429                                "replace string is too long");
9430                goto error;
9431        }
9432        new_size = slen + product;
9433        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9434            PyErr_SetString(PyExc_OverflowError,
9435                            "replace string is too long");
9436            goto error;
9437        }
9438        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9439        if (!res)
9440            goto error;
9441        ires = i = 0;
9442        if (len1 > 0) {
9443            while (n-- > 0) {
9444                /* look for next match */
9445                j = anylib_find(rkind,
9446                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
9447                                slen-i, buf1, len1, i);
9448                if (j == -1)
9449                    break;
9450                else if (j > i) {
9451                    /* copy unchanged part [i:j] */
9452                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9453                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9454                           PyUnicode_KIND_SIZE(rkind, j-i));
9455                    ires += j - i;
9456                }
9457                /* copy substitution string */
9458                if (len2 > 0) {
9459                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9460                           buf2,
9461                           PyUnicode_KIND_SIZE(rkind, len2));
9462                    ires += len2;
9463                }
9464                i = j + len1;
9465            }
9466            if (i < slen)
9467                /* copy tail [i:] */
9468                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9469                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9470                       PyUnicode_KIND_SIZE(rkind, slen-i));
9471        } else {
9472            /* interleave */
9473            while (n > 0) {
9474                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9475                       buf2,
9476                       PyUnicode_KIND_SIZE(rkind, len2));
9477                ires += len2;
9478                if (--n <= 0)
9479                    break;
9480                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9481                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9482                       PyUnicode_KIND_SIZE(rkind, 1));
9483                ires++;
9484                i++;
9485            }
9486            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9487                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9488                   PyUnicode_KIND_SIZE(rkind, slen-i));
9489        }
9490        u = PyUnicode_FromKindAndData(rkind, res, new_size);
9491        PyMem_Free(res);
9492    }
9493    if (srelease)
9494        PyMem_FREE(sbuf);
9495    if (release1)
9496        PyMem_FREE(buf1);
9497    if (release2)
9498        PyMem_FREE(buf2);
9499    return u;
9500
9501  nothing:
9502    /* nothing to replace; return original string (when possible) */
9503    if (srelease)
9504        PyMem_FREE(sbuf);
9505    if (release1)
9506        PyMem_FREE(buf1);
9507    if (release2)
9508        PyMem_FREE(buf2);
9509    if (PyUnicode_CheckExact(self)) {
9510        Py_INCREF(self);
9511        return (PyObject *) self;
9512    }
9513    return PyUnicode_Copy(self);
9514  error:
9515    if (srelease && sbuf)
9516        PyMem_FREE(sbuf);
9517    if (release1 && buf1)
9518        PyMem_FREE(buf1);
9519    if (release2 && buf2)
9520        PyMem_FREE(buf2);
9521    return NULL;
9522}
9523
9524/* --- Unicode Object Methods --------------------------------------------- */
9525
9526PyDoc_STRVAR(title__doc__,
9527             "S.title() -> str\n\
9528\n\
9529Return a titlecased version of S, i.e. words start with title case\n\
9530characters, all remaining cased characters have lower case.");
9531
9532static PyObject*
9533unicode_title(PyUnicodeObject *self)
9534{
9535    return fixup(self, fixtitle);
9536}
9537
9538PyDoc_STRVAR(capitalize__doc__,
9539             "S.capitalize() -> str\n\
9540\n\
9541Return a capitalized version of S, i.e. make the first character\n\
9542have upper case and the rest lower case.");
9543
9544static PyObject*
9545unicode_capitalize(PyUnicodeObject *self)
9546{
9547    return fixup(self, fixcapitalize);
9548}
9549
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize every word in place in
   the list, then join the words again with PyUnicode_Join(NULL, ...).
   Returns NULL if splitting, capitalizing or joining fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *result = NULL;
    Py_ssize_t idx = 0;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL) {
        return NULL;
    }

    /* Capitalize each word */
    while (idx < PyList_GET_SIZE(words)) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, idx), fixcapitalize);
        if (capitalized == NULL) {
            goto done;
        }
        /* Replace the list slot, releasing the word it used to hold. */
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
        idx++;
    }

    /* Join the words to form a new string */
    result = PyUnicode_Join(NULL, words);

  done:
    Py_DECREF(words);
    return (PyObject *)result;
}
#endif
9587
9588/* Argument converter.  Coerces to a single unicode character */
9589
9590static int
9591convert_uc(PyObject *obj, void *addr)
9592{
9593    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9594    PyObject *uniobj;
9595
9596    uniobj = PyUnicode_FromObject(obj);
9597    if (uniobj == NULL) {
9598        PyErr_SetString(PyExc_TypeError,
9599                        "The fill character cannot be converted to Unicode");
9600        return 0;
9601    }
9602    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9603        PyErr_SetString(PyExc_TypeError,
9604                        "The fill character must be exactly one character long");
9605        Py_DECREF(uniobj);
9606        return 0;
9607    }
9608    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9609    Py_DECREF(uniobj);
9610    return 1;
9611}
9612
9613PyDoc_STRVAR(center__doc__,
9614             "S.center(width[, fillchar]) -> str\n\
9615\n\
9616Return S centered in a string of length width. Padding is\n\
9617done using the specified fill character (default is a space)");
9618
9619static PyObject *
9620unicode_center(PyUnicodeObject *self, PyObject *args)
9621{
9622    Py_ssize_t marg, left;
9623    Py_ssize_t width;
9624    Py_UCS4 fillchar = ' ';
9625
9626    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9627        return NULL;
9628
9629    if (PyUnicode_READY(self) == -1)
9630        return NULL;
9631
9632    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9633        Py_INCREF(self);
9634        return (PyObject*) self;
9635    }
9636
9637    marg = width - _PyUnicode_LENGTH(self);
9638    left = marg / 2 + (marg & width & 1);
9639
9640    return (PyObject*) pad(self, left, marg - left, fillchar);
9641}
9642
9643#if 0
9644
9645/* This code should go into some future Unicode collation support
9646   module. The basic comparison should compare ordinals on a naive
9647   basis (this is what Java does and thus Jython too). */
9648
9649/* speedy UTF-16 code point order comparison */
9650/* gleaned from: */
9651/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9652
9653static short utf16Fixup[32] =
9654{
9655    0, 0, 0, 0, 0, 0, 0, 0,
9656    0, 0, 0, 0, 0, 0, 0, 0,
9657    0, 0, 0, 0, 0, 0, 0, 0,
9658    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
9659};
9660
9661static int
9662unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9663{
9664    Py_ssize_t len1, len2;
9665
9666    Py_UNICODE *s1 = str1->str;
9667    Py_UNICODE *s2 = str2->str;
9668
9669    len1 = str1->_base._base.length;
9670    len2 = str2->_base._base.length;
9671
9672    while (len1 > 0 && len2 > 0) {
9673        Py_UNICODE c1, c2;
9674
9675        c1 = *s1++;
9676        c2 = *s2++;
9677
9678        if (c1 > (1<<11) * 26)
9679            c1 += utf16Fixup[c1>>11];
9680        if (c2 > (1<<11) * 26)
9681            c2 += utf16Fixup[c2>>11];
9682        /* now c1 and c2 are in UTF-32-compatible order */
9683
9684        if (c1 != c2)
9685            return (c1 < c2) ? -1 : 1;
9686
9687        len1--; len2--;
9688    }
9689
9690    return (len1 < len2) ? -1 : (len1 != len2);
9691}
9692
9693#else
9694
9695/* This function assumes that str1 and str2 are readied by the caller. */
9696
9697static int
9698unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9699{
9700    int kind1, kind2;
9701    void *data1, *data2;
9702    Py_ssize_t len1, len2, i;
9703
9704    kind1 = PyUnicode_KIND(str1);
9705    kind2 = PyUnicode_KIND(str2);
9706    data1 = PyUnicode_DATA(str1);
9707    data2 = PyUnicode_DATA(str2);
9708    len1 = PyUnicode_GET_LENGTH(str1);
9709    len2 = PyUnicode_GET_LENGTH(str2);
9710
9711    for (i = 0; i < len1 && i < len2; ++i) {
9712        Py_UCS4 c1, c2;
9713        c1 = PyUnicode_READ(kind1, data1, i);
9714        c2 = PyUnicode_READ(kind2, data2, i);
9715
9716        if (c1 != c2)
9717            return (c1 < c2) ? -1 : 1;
9718    }
9719
9720    return (len1 < len2) ? -1 : (len1 != len2);
9721}
9722
9723#endif
9724
9725int
9726PyUnicode_Compare(PyObject *left, PyObject *right)
9727{
9728    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9729        if (PyUnicode_READY(left) == -1 ||
9730            PyUnicode_READY(right) == -1)
9731            return -1;
9732        return unicode_compare((PyUnicodeObject *)left,
9733                               (PyUnicodeObject *)right);
9734    }
9735    PyErr_Format(PyExc_TypeError,
9736                 "Can't compare %.100s and %.100s",
9737                 left->ob_type->tp_name,
9738                 right->ob_type->tp_name);
9739    return -1;
9740}
9741
9742int
9743PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9744{
9745    Py_ssize_t i;
9746    int kind;
9747    void *data;
9748    Py_UCS4 chr;
9749
9750    assert(_PyUnicode_CHECK(uni));
9751    if (PyUnicode_READY(uni) == -1)
9752        return -1;
9753    kind = PyUnicode_KIND(uni);
9754    data = PyUnicode_DATA(uni);
9755    /* Compare Unicode string and source character set string */
9756    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9757        if (chr != str[i])
9758            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9759    /* This check keeps Python strings that end in '\0' from comparing equal
9760     to C strings identical up to that point. */
9761    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9762        return 1; /* uni is longer */
9763    if (str[i])
9764        return -1; /* str is longer */
9765    return 0;
9766}
9767
9768
9769#define TEST_COND(cond)                         \
9770    ((cond) ? Py_True : Py_False)
9771
9772PyObject *
9773PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
9774{
9775    int result;
9776
9777    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9778        PyObject *v;
9779        if (PyUnicode_READY(left) == -1 ||
9780            PyUnicode_READY(right) == -1)
9781            return NULL;
9782        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9783            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
9784            if (op == Py_EQ) {
9785                Py_INCREF(Py_False);
9786                return Py_False;
9787            }
9788            if (op == Py_NE) {
9789                Py_INCREF(Py_True);
9790                return Py_True;
9791            }
9792        }
9793        if (left == right)
9794            result = 0;
9795        else
9796            result = unicode_compare((PyUnicodeObject *)left,
9797                                     (PyUnicodeObject *)right);
9798
9799        /* Convert the return value to a Boolean */
9800        switch (op) {
9801        case Py_EQ:
9802            v = TEST_COND(result == 0);
9803            break;
9804        case Py_NE:
9805            v = TEST_COND(result != 0);
9806            break;
9807        case Py_LE:
9808            v = TEST_COND(result <= 0);
9809            break;
9810        case Py_GE:
9811            v = TEST_COND(result >= 0);
9812            break;
9813        case Py_LT:
9814            v = TEST_COND(result == -1);
9815            break;
9816        case Py_GT:
9817            v = TEST_COND(result == 1);
9818            break;
9819        default:
9820            PyErr_BadArgument();
9821            return NULL;
9822        }
9823        Py_INCREF(v);
9824        return v;
9825    }
9826
9827    Py_RETURN_NOTIMPLEMENTED;
9828}
9829
9830int
9831PyUnicode_Contains(PyObject *container, PyObject *element)
9832{
9833    PyObject *str, *sub;
9834    int kind1, kind2, kind;
9835    void *buf1, *buf2;
9836    Py_ssize_t len1, len2;
9837    int result;
9838
9839    /* Coerce the two arguments */
9840    sub = PyUnicode_FromObject(element);
9841    if (!sub) {
9842        PyErr_Format(PyExc_TypeError,
9843                     "'in <string>' requires string as left operand, not %s",
9844                     element->ob_type->tp_name);
9845        return -1;
9846    }
9847    if (PyUnicode_READY(sub) == -1)
9848        return -1;
9849
9850    str = PyUnicode_FromObject(container);
9851    if (!str || PyUnicode_READY(str) == -1) {
9852        Py_DECREF(sub);
9853        return -1;
9854    }
9855
9856    kind1 = PyUnicode_KIND(str);
9857    kind2 = PyUnicode_KIND(sub);
9858    kind = kind1 > kind2 ? kind1 : kind2;
9859    buf1 = PyUnicode_DATA(str);
9860    buf2 = PyUnicode_DATA(sub);
9861    if (kind1 != kind)
9862        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9863    if (!buf1) {
9864        Py_DECREF(sub);
9865        return -1;
9866    }
9867    if (kind2 != kind)
9868        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9869    if (!buf2) {
9870        Py_DECREF(sub);
9871        if (kind1 != kind) PyMem_Free(buf1);
9872        return -1;
9873    }
9874    len1 = PyUnicode_GET_LENGTH(str);
9875    len2 = PyUnicode_GET_LENGTH(sub);
9876
9877    switch(kind) {
9878    case PyUnicode_1BYTE_KIND:
9879        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9880        break;
9881    case PyUnicode_2BYTE_KIND:
9882        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9883        break;
9884    case PyUnicode_4BYTE_KIND:
9885        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9886        break;
9887    default:
9888        result = -1;
9889        assert(0);
9890    }
9891
9892    Py_DECREF(str);
9893    Py_DECREF(sub);
9894
9895    if (kind1 != kind)
9896        PyMem_Free(buf1);
9897    if (kind2 != kind)
9898        PyMem_Free(buf2);
9899
9900    return result;
9901}
9902
9903/* Concat to string or Unicode object giving a new Unicode object. */
9904
9905PyObject *
9906PyUnicode_Concat(PyObject *left, PyObject *right)
9907{
9908    PyObject *u = NULL, *v = NULL, *w;
9909    Py_UCS4 maxchar;
9910
9911    /* Coerce the two arguments */
9912    u = PyUnicode_FromObject(left);
9913    if (u == NULL)
9914        goto onError;
9915    v = PyUnicode_FromObject(right);
9916    if (v == NULL)
9917        goto onError;
9918
9919    /* Shortcuts */
9920    if (v == unicode_empty) {
9921        Py_DECREF(v);
9922        return u;
9923    }
9924    if (u == unicode_empty) {
9925        Py_DECREF(u);
9926        return v;
9927    }
9928
9929    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
9930    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
9931
9932    /* Concat the two Unicode strings */
9933    w = PyUnicode_New(
9934        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
9935        maxchar);
9936    if (w == NULL)
9937        goto onError;
9938    if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
9939        goto onError;
9940    if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
9941                                 v, 0,
9942                                 PyUnicode_GET_LENGTH(v)) < 0)
9943        goto onError;
9944    Py_DECREF(u);
9945    Py_DECREF(v);
9946    return w;
9947
9948  onError:
9949    Py_XDECREF(u);
9950    Py_XDECREF(v);
9951    return NULL;
9952}
9953
9954void
9955PyUnicode_Append(PyObject **p_left, PyObject *right)
9956{
9957    PyObject *left, *res;
9958
9959    if (p_left == NULL) {
9960        if (!PyErr_Occurred())
9961            PyErr_BadInternalCall();
9962        return;
9963    }
9964    left = *p_left;
9965    if (right == NULL || !PyUnicode_Check(left)) {
9966        if (!PyErr_Occurred())
9967            PyErr_BadInternalCall();
9968        goto error;
9969    }
9970
9971    if (PyUnicode_CheckExact(left) && left != unicode_empty
9972        && PyUnicode_CheckExact(right) && right != unicode_empty
9973        && unicode_resizable(left)
9974        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
9975            || _PyUnicode_WSTR(left) != NULL))
9976    {
9977        Py_ssize_t left_len, right_len, new_len;
9978#ifdef Py_DEBUG
9979        Py_ssize_t copied;
9980#endif
9981
9982        if (PyUnicode_READY(left))
9983            goto error;
9984        if (PyUnicode_READY(right))
9985            goto error;
9986
9987        /* FIXME: support ascii+latin1, PyASCIIObject => PyCompactUnicodeObject */
9988        if (PyUnicode_MAX_CHAR_VALUE(right) <= PyUnicode_MAX_CHAR_VALUE(left))
9989        {
9990            left_len = PyUnicode_GET_LENGTH(left);
9991            right_len = PyUnicode_GET_LENGTH(right);
9992            if (left_len > PY_SSIZE_T_MAX - right_len) {
9993                PyErr_SetString(PyExc_OverflowError,
9994                                "strings are too large to concat");
9995                goto error;
9996            }
9997            new_len = left_len + right_len;
9998
9999            /* Now we own the last reference to 'left', so we can resize it
10000             * in-place.
10001             */
10002            if (unicode_resize(&left, new_len) != 0) {
10003                /* XXX if _PyUnicode_Resize() fails, 'left' has been
10004                 * deallocated so it cannot be put back into
10005                 * 'variable'.  The MemoryError is raised when there
10006                 * is no value in 'variable', which might (very
10007                 * remotely) be a cause of incompatibilities.
10008                 */
10009                goto error;
10010            }
10011            /* copy 'right' into the newly allocated area of 'left' */
10012#ifdef Py_DEBUG
10013            copied = PyUnicode_CopyCharacters(left, left_len,
10014                                              right, 0,
10015                                              right_len);
10016            assert(0 <= copied);
10017#else
10018            PyUnicode_CopyCharacters(left, left_len, right, 0, right_len);
10019#endif
10020            *p_left = left;
10021            return;
10022        }
10023    }
10024
10025    res = PyUnicode_Concat(left, right);
10026    if (res == NULL)
10027        goto error;
10028    Py_DECREF(left);
10029    *p_left = res;
10030    return;
10031
10032error:
10033    Py_DECREF(*p_left);
10034    *p_left = NULL;
10035}
10036
10037void
10038PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10039{
10040    PyUnicode_Append(pleft, right);
10041    Py_XDECREF(right);
10042}
10043
10044PyDoc_STRVAR(count__doc__,
10045             "S.count(sub[, start[, end]]) -> int\n\
10046\n\
10047Return the number of non-overlapping occurrences of substring sub in\n\
10048string S[start:end].  Optional arguments start and end are\n\
10049interpreted as in slice notation.");
10050
10051static PyObject *
10052unicode_count(PyUnicodeObject *self, PyObject *args)
10053{
10054    PyUnicodeObject *substring;
10055    Py_ssize_t start = 0;
10056    Py_ssize_t end = PY_SSIZE_T_MAX;
10057    PyObject *result;
10058    int kind1, kind2, kind;
10059    void *buf1, *buf2;
10060    Py_ssize_t len1, len2, iresult;
10061
10062    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10063                                            &start, &end))
10064        return NULL;
10065
10066    kind1 = PyUnicode_KIND(self);
10067    kind2 = PyUnicode_KIND(substring);
10068    kind = kind1 > kind2 ? kind1 : kind2;
10069    buf1 = PyUnicode_DATA(self);
10070    buf2 = PyUnicode_DATA(substring);
10071    if (kind1 != kind)
10072        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10073    if (!buf1) {
10074        Py_DECREF(substring);
10075        return NULL;
10076    }
10077    if (kind2 != kind)
10078        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10079    if (!buf2) {
10080        Py_DECREF(substring);
10081        if (kind1 != kind) PyMem_Free(buf1);
10082        return NULL;
10083    }
10084    len1 = PyUnicode_GET_LENGTH(self);
10085    len2 = PyUnicode_GET_LENGTH(substring);
10086
10087    ADJUST_INDICES(start, end, len1);
10088    switch(kind) {
10089    case PyUnicode_1BYTE_KIND:
10090        iresult = ucs1lib_count(
10091            ((Py_UCS1*)buf1) + start, end - start,
10092            buf2, len2, PY_SSIZE_T_MAX
10093            );
10094        break;
10095    case PyUnicode_2BYTE_KIND:
10096        iresult = ucs2lib_count(
10097            ((Py_UCS2*)buf1) + start, end - start,
10098            buf2, len2, PY_SSIZE_T_MAX
10099            );
10100        break;
10101    case PyUnicode_4BYTE_KIND:
10102        iresult = ucs4lib_count(
10103            ((Py_UCS4*)buf1) + start, end - start,
10104            buf2, len2, PY_SSIZE_T_MAX
10105            );
10106        break;
10107    default:
10108        assert(0); iresult = 0;
10109    }
10110
10111    result = PyLong_FromSsize_t(iresult);
10112
10113    if (kind1 != kind)
10114        PyMem_Free(buf1);
10115    if (kind2 != kind)
10116        PyMem_Free(buf2);
10117
10118    Py_DECREF(substring);
10119
10120    return result;
10121}
10122
10123PyDoc_STRVAR(encode__doc__,
10124             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10125\n\
10126Encode S using the codec registered for encoding. Default encoding\n\
10127is 'utf-8'. errors may be given to set a different error\n\
10128handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10129a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10130'xmlcharrefreplace' as well as any other name registered with\n\
10131codecs.register_error that can handle UnicodeEncodeErrors.");
10132
10133static PyObject *
10134unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10135{
10136    static char *kwlist[] = {"encoding", "errors", 0};
10137    char *encoding = NULL;
10138    char *errors = NULL;
10139
10140    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10141                                     kwlist, &encoding, &errors))
10142        return NULL;
10143    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10144}
10145
10146PyDoc_STRVAR(expandtabs__doc__,
10147             "S.expandtabs([tabsize]) -> str\n\
10148\n\
10149Return a copy of S where all tab characters are expanded using spaces.\n\
10150If tabsize is not given, a tab size of 8 characters is assumed.");
10151
10152static PyObject*
10153unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10154{
10155    Py_UNICODE *e;
10156    Py_UNICODE *p;
10157    Py_UNICODE *q;
10158    Py_UNICODE *qe;
10159    Py_ssize_t i, j, incr, wstr_length;
10160    PyUnicodeObject *u;
10161    int tabsize = 8;
10162
10163    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10164        return NULL;
10165
10166    if (PyUnicode_AsUnicodeAndSize((PyObject *)self, &wstr_length) == NULL)
10167        return NULL;
10168
10169    /* First pass: determine size of output string */
10170    i = 0; /* chars up to and including most recent \n or \r */
10171    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
10172    e = _PyUnicode_WSTR(self) + wstr_length; /* end of input */
10173    for (p = _PyUnicode_WSTR(self); p < e; p++)
10174        if (*p == '\t') {
10175            if (tabsize > 0) {
10176                incr = tabsize - (j % tabsize); /* cannot overflow */
10177                if (j > PY_SSIZE_T_MAX - incr)
10178                    goto overflow1;
10179                j += incr;
10180            }
10181        }
10182        else {
10183            if (j > PY_SSIZE_T_MAX - 1)
10184                goto overflow1;
10185            j++;
10186            if (*p == '\n' || *p == '\r') {
10187                if (i > PY_SSIZE_T_MAX - j)
10188                    goto overflow1;
10189                i += j;
10190                j = 0;
10191            }
10192        }
10193
10194    if (i > PY_SSIZE_T_MAX - j)
10195        goto overflow1;
10196
10197    /* Second pass: create output string and fill it */
10198    u = _PyUnicode_New(i + j);
10199    if (!u)
10200        return NULL;
10201
10202    j = 0; /* same as in first pass */
10203    q = _PyUnicode_WSTR(u); /* next output char */
10204    qe = _PyUnicode_WSTR(u) + PyUnicode_GET_SIZE(u); /* end of output */
10205
10206    for (p = _PyUnicode_WSTR(self); p < e; p++)
10207        if (*p == '\t') {
10208            if (tabsize > 0) {
10209                i = tabsize - (j % tabsize);
10210                j += i;
10211                while (i--) {
10212                    if (q >= qe)
10213                        goto overflow2;
10214                    *q++ = ' ';
10215                }
10216            }
10217        }
10218        else {
10219            if (q >= qe)
10220                goto overflow2;
10221            *q++ = *p;
10222            j++;
10223            if (*p == '\n' || *p == '\r')
10224                j = 0;
10225        }
10226
10227    if (_PyUnicode_READY_REPLACE(&u)) {
10228        Py_DECREF(u);
10229        return NULL;
10230    }
10231    return (PyObject*) u;
10232
10233  overflow2:
10234    Py_DECREF(u);
10235  overflow1:
10236    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10237    return NULL;
10238}
10239
10240PyDoc_STRVAR(find__doc__,
10241             "S.find(sub[, start[, end]]) -> int\n\
10242\n\
10243Return the lowest index in S where substring sub is found,\n\
10244such that sub is contained within S[start:end].  Optional\n\
10245arguments start and end are interpreted as in slice notation.\n\
10246\n\
10247Return -1 on failure.");
10248
10249static PyObject *
10250unicode_find(PyObject *self, PyObject *args)
10251{
10252    PyUnicodeObject *substring;
10253    Py_ssize_t start;
10254    Py_ssize_t end;
10255    Py_ssize_t result;
10256
10257    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10258                                            &start, &end))
10259        return NULL;
10260
10261    if (PyUnicode_READY(self) == -1)
10262        return NULL;
10263    if (PyUnicode_READY(substring) == -1)
10264        return NULL;
10265
10266    result = any_find_slice(
10267        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10268        self, (PyObject*)substring, start, end
10269        );
10270
10271    Py_DECREF(substring);
10272
10273    if (result == -2)
10274        return NULL;
10275
10276    return PyLong_FromSsize_t(result);
10277}
10278
10279static PyObject *
10280unicode_getitem(PyObject *self, Py_ssize_t index)
10281{
10282    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10283    if (ch == (Py_UCS4)-1)
10284        return NULL;
10285    return PyUnicode_FromOrdinal(ch);
10286}
10287
/* Believe it or not, this produces the same value for ASCII strings
   as bytes_hash(). */
/* Compute the hash of a string and cache it in the object.

   Returns the cached value when one is already stored (a stored -1 means
   "not computed yet"), and -1 if the string cannot be made ready.  The
   computed value is never -1: that result is remapped to -2. */
static Py_hash_t
unicode_hash(PyUnicodeObject *self)
{
    Py_ssize_t len;
    Py_uhash_t x;

    /* Fast path: reuse the cached hash. */
    if (_PyUnicode_HASH(self) != -1)
        return _PyUnicode_HASH(self);
    if (PyUnicode_READY(self) == -1)
        return -1;
    len = PyUnicode_GET_LENGTH(self);

    /* The hash function as a macro, gets expanded three times below. */
    /* Note: HASH advances the pointer P and counts 'len' down to below
       zero, so only one expansion can run per call; the seed reads *P even
       when len is 0. */
#define HASH(P) \
    x = (Py_uhash_t)*P << 7; \
    while (--len >= 0) \
        x = (1000003*x) ^ (Py_uhash_t)*P++;

    /* Walk the character buffer using the element width of this string. */
    switch (PyUnicode_KIND(self)) {
    case PyUnicode_1BYTE_KIND: {
        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
        HASH(c);
        break;
    }
    case PyUnicode_2BYTE_KIND: {
        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
        HASH(s);
        break;
    }
    default: {
        Py_UCS4 *l;
        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
               "Impossible switch case in unicode_hash");
        l = PyUnicode_4BYTE_DATA(self);
        HASH(l);
        break;
    }
    }
    /* 'len' was consumed by HASH, so the length is re-read here. */
    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);

    /* -1 marks "no hash cached" above, so it cannot be stored as a hash. */
    if (x == -1)
        x = -2;
    _PyUnicode_HASH(self) = x;
    return x;
}
#undef HASH
10336
10337PyDoc_STRVAR(index__doc__,
10338             "S.index(sub[, start[, end]]) -> int\n\
10339\n\
10340Like S.find() but raise ValueError when the substring is not found.");
10341
10342static PyObject *
10343unicode_index(PyObject *self, PyObject *args)
10344{
10345    Py_ssize_t result;
10346    PyUnicodeObject *substring;
10347    Py_ssize_t start;
10348    Py_ssize_t end;
10349
10350    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10351                                            &start, &end))
10352        return NULL;
10353
10354    if (PyUnicode_READY(self) == -1)
10355        return NULL;
10356    if (PyUnicode_READY(substring) == -1)
10357        return NULL;
10358
10359    result = any_find_slice(
10360        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10361        self, (PyObject*)substring, start, end
10362        );
10363
10364    Py_DECREF(substring);
10365
10366    if (result == -2)
10367        return NULL;
10368
10369    if (result < 0) {
10370        PyErr_SetString(PyExc_ValueError, "substring not found");
10371        return NULL;
10372    }
10373
10374    return PyLong_FromSsize_t(result);
10375}
10376
10377PyDoc_STRVAR(islower__doc__,
10378             "S.islower() -> bool\n\
10379\n\
10380Return True if all cased characters in S are lowercase and there is\n\
10381at least one cased character in S, False otherwise.");
10382
10383static PyObject*
10384unicode_islower(PyUnicodeObject *self)
10385{
10386    Py_ssize_t i, length;
10387    int kind;
10388    void *data;
10389    int cased;
10390
10391    if (PyUnicode_READY(self) == -1)
10392        return NULL;
10393    length = PyUnicode_GET_LENGTH(self);
10394    kind = PyUnicode_KIND(self);
10395    data = PyUnicode_DATA(self);
10396
10397    /* Shortcut for single character strings */
10398    if (length == 1)
10399        return PyBool_FromLong(
10400            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10401
10402    /* Special case for empty strings */
10403    if (length == 0)
10404        return PyBool_FromLong(0);
10405
10406    cased = 0;
10407    for (i = 0; i < length; i++) {
10408        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10409
10410        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10411            return PyBool_FromLong(0);
10412        else if (!cased && Py_UNICODE_ISLOWER(ch))
10413            cased = 1;
10414    }
10415    return PyBool_FromLong(cased);
10416}
10417
10418PyDoc_STRVAR(isupper__doc__,
10419             "S.isupper() -> bool\n\
10420\n\
10421Return True if all cased characters in S are uppercase and there is\n\
10422at least one cased character in S, False otherwise.");
10423
10424static PyObject*
10425unicode_isupper(PyUnicodeObject *self)
10426{
10427    Py_ssize_t i, length;
10428    int kind;
10429    void *data;
10430    int cased;
10431
10432    if (PyUnicode_READY(self) == -1)
10433        return NULL;
10434    length = PyUnicode_GET_LENGTH(self);
10435    kind = PyUnicode_KIND(self);
10436    data = PyUnicode_DATA(self);
10437
10438    /* Shortcut for single character strings */
10439    if (length == 1)
10440        return PyBool_FromLong(
10441            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10442
10443    /* Special case for empty strings */
10444    if (length == 0)
10445        return PyBool_FromLong(0);
10446
10447    cased = 0;
10448    for (i = 0; i < length; i++) {
10449        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10450
10451        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10452            return PyBool_FromLong(0);
10453        else if (!cased && Py_UNICODE_ISUPPER(ch))
10454            cased = 1;
10455    }
10456    return PyBool_FromLong(cased);
10457}
10458
10459PyDoc_STRVAR(istitle__doc__,
10460             "S.istitle() -> bool\n\
10461\n\
10462Return True if S is a titlecased string and there is at least one\n\
10463character in S, i.e. upper- and titlecase characters may only\n\
10464follow uncased characters and lowercase characters only cased ones.\n\
10465Return False otherwise.");
10466
10467static PyObject*
10468unicode_istitle(PyUnicodeObject *self)
10469{
10470    Py_ssize_t i, length;
10471    int kind;
10472    void *data;
10473    int cased, previous_is_cased;
10474
10475    if (PyUnicode_READY(self) == -1)
10476        return NULL;
10477    length = PyUnicode_GET_LENGTH(self);
10478    kind = PyUnicode_KIND(self);
10479    data = PyUnicode_DATA(self);
10480
10481    /* Shortcut for single character strings */
10482    if (length == 1) {
10483        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10484        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10485                               (Py_UNICODE_ISUPPER(ch) != 0));
10486    }
10487
10488    /* Special case for empty strings */
10489    if (length == 0)
10490        return PyBool_FromLong(0);
10491
10492    cased = 0;
10493    previous_is_cased = 0;
10494    for (i = 0; i < length; i++) {
10495        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10496
10497        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10498            if (previous_is_cased)
10499                return PyBool_FromLong(0);
10500            previous_is_cased = 1;
10501            cased = 1;
10502        }
10503        else if (Py_UNICODE_ISLOWER(ch)) {
10504            if (!previous_is_cased)
10505                return PyBool_FromLong(0);
10506            previous_is_cased = 1;
10507            cased = 1;
10508        }
10509        else
10510            previous_is_cased = 0;
10511    }
10512    return PyBool_FromLong(cased);
10513}
10514
10515PyDoc_STRVAR(isspace__doc__,
10516             "S.isspace() -> bool\n\
10517\n\
10518Return True if all characters in S are whitespace\n\
10519and there is at least one character in S, False otherwise.");
10520
10521static PyObject*
10522unicode_isspace(PyUnicodeObject *self)
10523{
10524    Py_ssize_t i, length;
10525    int kind;
10526    void *data;
10527
10528    if (PyUnicode_READY(self) == -1)
10529        return NULL;
10530    length = PyUnicode_GET_LENGTH(self);
10531    kind = PyUnicode_KIND(self);
10532    data = PyUnicode_DATA(self);
10533
10534    /* Shortcut for single character strings */
10535    if (length == 1)
10536        return PyBool_FromLong(
10537            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10538
10539    /* Special case for empty strings */
10540    if (length == 0)
10541        return PyBool_FromLong(0);
10542
10543    for (i = 0; i < length; i++) {
10544        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10545        if (!Py_UNICODE_ISSPACE(ch))
10546            return PyBool_FromLong(0);
10547    }
10548    return PyBool_FromLong(1);
10549}
10550
10551PyDoc_STRVAR(isalpha__doc__,
10552             "S.isalpha() -> bool\n\
10553\n\
10554Return True if all characters in S are alphabetic\n\
10555and there is at least one character in S, False otherwise.");
10556
10557static PyObject*
10558unicode_isalpha(PyUnicodeObject *self)
10559{
10560    Py_ssize_t i, length;
10561    int kind;
10562    void *data;
10563
10564    if (PyUnicode_READY(self) == -1)
10565        return NULL;
10566    length = PyUnicode_GET_LENGTH(self);
10567    kind = PyUnicode_KIND(self);
10568    data = PyUnicode_DATA(self);
10569
10570    /* Shortcut for single character strings */
10571    if (length == 1)
10572        return PyBool_FromLong(
10573            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10574
10575    /* Special case for empty strings */
10576    if (length == 0)
10577        return PyBool_FromLong(0);
10578
10579    for (i = 0; i < length; i++) {
10580        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10581            return PyBool_FromLong(0);
10582    }
10583    return PyBool_FromLong(1);
10584}
10585
10586PyDoc_STRVAR(isalnum__doc__,
10587             "S.isalnum() -> bool\n\
10588\n\
10589Return True if all characters in S are alphanumeric\n\
10590and there is at least one character in S, False otherwise.");
10591
10592static PyObject*
10593unicode_isalnum(PyUnicodeObject *self)
10594{
10595    int kind;
10596    void *data;
10597    Py_ssize_t len, i;
10598
10599    if (PyUnicode_READY(self) == -1)
10600        return NULL;
10601
10602    kind = PyUnicode_KIND(self);
10603    data = PyUnicode_DATA(self);
10604    len = PyUnicode_GET_LENGTH(self);
10605
10606    /* Shortcut for single character strings */
10607    if (len == 1) {
10608        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10609        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10610    }
10611
10612    /* Special case for empty strings */
10613    if (len == 0)
10614        return PyBool_FromLong(0);
10615
10616    for (i = 0; i < len; i++) {
10617        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10618        if (!Py_UNICODE_ISALNUM(ch))
10619            return PyBool_FromLong(0);
10620    }
10621    return PyBool_FromLong(1);
10622}
10623
10624PyDoc_STRVAR(isdecimal__doc__,
10625             "S.isdecimal() -> bool\n\
10626\n\
10627Return True if there are only decimal characters in S,\n\
10628False otherwise.");
10629
10630static PyObject*
10631unicode_isdecimal(PyUnicodeObject *self)
10632{
10633    Py_ssize_t i, length;
10634    int kind;
10635    void *data;
10636
10637    if (PyUnicode_READY(self) == -1)
10638        return NULL;
10639    length = PyUnicode_GET_LENGTH(self);
10640    kind = PyUnicode_KIND(self);
10641    data = PyUnicode_DATA(self);
10642
10643    /* Shortcut for single character strings */
10644    if (length == 1)
10645        return PyBool_FromLong(
10646            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10647
10648    /* Special case for empty strings */
10649    if (length == 0)
10650        return PyBool_FromLong(0);
10651
10652    for (i = 0; i < length; i++) {
10653        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10654            return PyBool_FromLong(0);
10655    }
10656    return PyBool_FromLong(1);
10657}
10658
10659PyDoc_STRVAR(isdigit__doc__,
10660             "S.isdigit() -> bool\n\
10661\n\
10662Return True if all characters in S are digits\n\
10663and there is at least one character in S, False otherwise.");
10664
10665static PyObject*
10666unicode_isdigit(PyUnicodeObject *self)
10667{
10668    Py_ssize_t i, length;
10669    int kind;
10670    void *data;
10671
10672    if (PyUnicode_READY(self) == -1)
10673        return NULL;
10674    length = PyUnicode_GET_LENGTH(self);
10675    kind = PyUnicode_KIND(self);
10676    data = PyUnicode_DATA(self);
10677
10678    /* Shortcut for single character strings */
10679    if (length == 1) {
10680        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10681        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10682    }
10683
10684    /* Special case for empty strings */
10685    if (length == 0)
10686        return PyBool_FromLong(0);
10687
10688    for (i = 0; i < length; i++) {
10689        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10690            return PyBool_FromLong(0);
10691    }
10692    return PyBool_FromLong(1);
10693}
10694
10695PyDoc_STRVAR(isnumeric__doc__,
10696             "S.isnumeric() -> bool\n\
10697\n\
10698Return True if there are only numeric characters in S,\n\
10699False otherwise.");
10700
10701static PyObject*
10702unicode_isnumeric(PyUnicodeObject *self)
10703{
10704    Py_ssize_t i, length;
10705    int kind;
10706    void *data;
10707
10708    if (PyUnicode_READY(self) == -1)
10709        return NULL;
10710    length = PyUnicode_GET_LENGTH(self);
10711    kind = PyUnicode_KIND(self);
10712    data = PyUnicode_DATA(self);
10713
10714    /* Shortcut for single character strings */
10715    if (length == 1)
10716        return PyBool_FromLong(
10717            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10718
10719    /* Special case for empty strings */
10720    if (length == 0)
10721        return PyBool_FromLong(0);
10722
10723    for (i = 0; i < length; i++) {
10724        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10725            return PyBool_FromLong(0);
10726    }
10727    return PyBool_FromLong(1);
10728}
10729
10730int
10731PyUnicode_IsIdentifier(PyObject *self)
10732{
10733    int kind;
10734    void *data;
10735    Py_ssize_t i;
10736    Py_UCS4 first;
10737
10738    if (PyUnicode_READY(self) == -1) {
10739        Py_FatalError("identifier not ready");
10740        return 0;
10741    }
10742
10743    /* Special case for empty strings */
10744    if (PyUnicode_GET_LENGTH(self) == 0)
10745        return 0;
10746    kind = PyUnicode_KIND(self);
10747    data = PyUnicode_DATA(self);
10748
10749    /* PEP 3131 says that the first character must be in
10750       XID_Start and subsequent characters in XID_Continue,
10751       and for the ASCII range, the 2.x rules apply (i.e
10752       start with letters and underscore, continue with
10753       letters, digits, underscore). However, given the current
10754       definition of XID_Start and XID_Continue, it is sufficient
10755       to check just for these, except that _ must be allowed
10756       as starting an identifier.  */
10757    first = PyUnicode_READ(kind, data, 0);
10758    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
10759        return 0;
10760
10761    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
10762        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
10763            return 0;
10764    return 1;
10765}
10766
10767PyDoc_STRVAR(isidentifier__doc__,
10768             "S.isidentifier() -> bool\n\
10769\n\
10770Return True if S is a valid identifier according\n\
10771to the language definition.");
10772
10773static PyObject*
10774unicode_isidentifier(PyObject *self)
10775{
10776    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10777}
10778
10779PyDoc_STRVAR(isprintable__doc__,
10780             "S.isprintable() -> bool\n\
10781\n\
10782Return True if all characters in S are considered\n\
10783printable in repr() or S is empty, False otherwise.");
10784
10785static PyObject*
10786unicode_isprintable(PyObject *self)
10787{
10788    Py_ssize_t i, length;
10789    int kind;
10790    void *data;
10791
10792    if (PyUnicode_READY(self) == -1)
10793        return NULL;
10794    length = PyUnicode_GET_LENGTH(self);
10795    kind = PyUnicode_KIND(self);
10796    data = PyUnicode_DATA(self);
10797
10798    /* Shortcut for single character strings */
10799    if (length == 1)
10800        return PyBool_FromLong(
10801            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
10802
10803    for (i = 0; i < length; i++) {
10804        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
10805            Py_RETURN_FALSE;
10806        }
10807    }
10808    Py_RETURN_TRUE;
10809}
10810
10811PyDoc_STRVAR(join__doc__,
10812             "S.join(iterable) -> str\n\
10813\n\
10814Return a string which is the concatenation of the strings in the\n\
10815iterable.  The separator between elements is S.");
10816
10817static PyObject*
10818unicode_join(PyObject *self, PyObject *data)
10819{
10820    return PyUnicode_Join(self, data);
10821}
10822
10823static Py_ssize_t
10824unicode_length(PyUnicodeObject *self)
10825{
10826    if (PyUnicode_READY(self) == -1)
10827        return -1;
10828    return PyUnicode_GET_LENGTH(self);
10829}
10830
10831PyDoc_STRVAR(ljust__doc__,
10832             "S.ljust(width[, fillchar]) -> str\n\
10833\n\
10834Return S left-justified in a Unicode string of length width. Padding is\n\
10835done using the specified fill character (default is a space).");
10836
10837static PyObject *
10838unicode_ljust(PyUnicodeObject *self, PyObject *args)
10839{
10840    Py_ssize_t width;
10841    Py_UCS4 fillchar = ' ';
10842
10843    if (PyUnicode_READY(self) == -1)
10844        return NULL;
10845
10846    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
10847        return NULL;
10848
10849    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10850        Py_INCREF(self);
10851        return (PyObject*) self;
10852    }
10853
10854    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
10855}
10856
10857PyDoc_STRVAR(lower__doc__,
10858             "S.lower() -> str\n\
10859\n\
10860Return a copy of the string S converted to lowercase.");
10861
10862static PyObject*
10863unicode_lower(PyUnicodeObject *self)
10864{
10865    return fixup(self, fixlower);
10866}
10867
/* Strip modes.  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* One PyArg_ParseTuple() format string per strip mode; each consists of
   the "|O:" argument spec followed by the method name. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its first three
   characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
10876
10877/* externally visible for str.strip(unicode) */
10878PyObject *
10879_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10880{
10881    void *data;
10882    int kind;
10883    Py_ssize_t i, j, len;
10884    BLOOM_MASK sepmask;
10885
10886    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10887        return NULL;
10888
10889    kind = PyUnicode_KIND(self);
10890    data = PyUnicode_DATA(self);
10891    len = PyUnicode_GET_LENGTH(self);
10892    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
10893                              PyUnicode_DATA(sepobj),
10894                              PyUnicode_GET_LENGTH(sepobj));
10895
10896    i = 0;
10897    if (striptype != RIGHTSTRIP) {
10898        while (i < len &&
10899               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
10900            i++;
10901        }
10902    }
10903
10904    j = len;
10905    if (striptype != LEFTSTRIP) {
10906        do {
10907            j--;
10908        } while (j >= i &&
10909                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
10910        j++;
10911    }
10912
10913    return PyUnicode_Substring((PyObject*)self, i, j);
10914}
10915
10916PyObject*
10917PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
10918{
10919    unsigned char *data;
10920    int kind;
10921    Py_ssize_t length;
10922
10923    if (PyUnicode_READY(self) == -1)
10924        return NULL;
10925
10926    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
10927
10928    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
10929    {
10930        if (PyUnicode_CheckExact(self)) {
10931            Py_INCREF(self);
10932            return self;
10933        }
10934        else
10935            return PyUnicode_Copy(self);
10936    }
10937
10938    length = end - start;
10939    if (length == 1)
10940        return unicode_getitem(self, start);
10941
10942    if (start < 0 || end < 0) {
10943        PyErr_SetString(PyExc_IndexError, "string index out of range");
10944        return NULL;
10945    }
10946
10947    kind = PyUnicode_KIND(self);
10948    data = PyUnicode_1BYTE_DATA(self);
10949    return PyUnicode_FromKindAndData(kind,
10950                                     data + PyUnicode_KIND_SIZE(kind, start),
10951                                     length);
10952}
10953
10954static PyObject *
10955do_strip(PyUnicodeObject *self, int striptype)
10956{
10957    int kind;
10958    void *data;
10959    Py_ssize_t len, i, j;
10960
10961    if (PyUnicode_READY(self) == -1)
10962        return NULL;
10963
10964    kind = PyUnicode_KIND(self);
10965    data = PyUnicode_DATA(self);
10966    len = PyUnicode_GET_LENGTH(self);
10967
10968    i = 0;
10969    if (striptype != RIGHTSTRIP) {
10970        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
10971            i++;
10972        }
10973    }
10974
10975    j = len;
10976    if (striptype != LEFTSTRIP) {
10977        do {
10978            j--;
10979        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
10980        j++;
10981    }
10982
10983    return PyUnicode_Substring((PyObject*)self, i, j);
10984}
10985
10986
10987static PyObject *
10988do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
10989{
10990    PyObject *sep = NULL;
10991
10992    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
10993        return NULL;
10994
10995    if (sep != NULL && sep != Py_None) {
10996        if (PyUnicode_Check(sep))
10997            return _PyUnicode_XStrip(self, striptype, sep);
10998        else {
10999            PyErr_Format(PyExc_TypeError,
11000                         "%s arg must be None or str",
11001                         STRIPNAME(striptype));
11002            return NULL;
11003        }
11004    }
11005
11006    return do_strip(self, striptype);
11007}
11008
11009
11010PyDoc_STRVAR(strip__doc__,
11011             "S.strip([chars]) -> str\n\
11012\n\
11013Return a copy of the string S with leading and trailing\n\
11014whitespace removed.\n\
11015If chars is given and not None, remove characters in chars instead.");
11016
11017static PyObject *
11018unicode_strip(PyUnicodeObject *self, PyObject *args)
11019{
11020    if (PyTuple_GET_SIZE(args) == 0)
11021        return do_strip(self, BOTHSTRIP); /* Common case */
11022    else
11023        return do_argstrip(self, BOTHSTRIP, args);
11024}
11025
11026
11027PyDoc_STRVAR(lstrip__doc__,
11028             "S.lstrip([chars]) -> str\n\
11029\n\
11030Return a copy of the string S with leading whitespace removed.\n\
11031If chars is given and not None, remove characters in chars instead.");
11032
11033static PyObject *
11034unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11035{
11036    if (PyTuple_GET_SIZE(args) == 0)
11037        return do_strip(self, LEFTSTRIP); /* Common case */
11038    else
11039        return do_argstrip(self, LEFTSTRIP, args);
11040}
11041
11042
11043PyDoc_STRVAR(rstrip__doc__,
11044             "S.rstrip([chars]) -> str\n\
11045\n\
11046Return a copy of the string S with trailing whitespace removed.\n\
11047If chars is given and not None, remove characters in chars instead.");
11048
11049static PyObject *
11050unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11051{
11052    if (PyTuple_GET_SIZE(args) == 0)
11053        return do_strip(self, RIGHTSTRIP); /* Common case */
11054    else
11055        return do_argstrip(self, RIGHTSTRIP, args);
11056}
11057
11058
11059static PyObject*
11060unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11061{
11062    PyUnicodeObject *u;
11063    Py_ssize_t nchars, n;
11064
11065    if (len < 1) {
11066        Py_INCREF(unicode_empty);
11067        return unicode_empty;
11068    }
11069
11070    if (len == 1 && PyUnicode_CheckExact(str)) {
11071        /* no repeat, return original string */
11072        Py_INCREF(str);
11073        return (PyObject*) str;
11074    }
11075
11076    if (PyUnicode_READY(str) == -1)
11077        return NULL;
11078
11079    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11080        PyErr_SetString(PyExc_OverflowError,
11081                        "repeated string is too long");
11082        return NULL;
11083    }
11084    nchars = len * PyUnicode_GET_LENGTH(str);
11085
11086    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11087    if (!u)
11088        return NULL;
11089    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11090
11091    if (PyUnicode_GET_LENGTH(str) == 1) {
11092        const int kind = PyUnicode_KIND(str);
11093        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11094        void *to = PyUnicode_DATA(u);
11095        if (kind == PyUnicode_1BYTE_KIND)
11096            memset(to, (unsigned char)fill_char, len);
11097        else {
11098            for (n = 0; n < len; ++n)
11099                PyUnicode_WRITE(kind, to, n, fill_char);
11100        }
11101    }
11102    else {
11103        /* number of characters copied this far */
11104        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11105        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11106        char *to = (char *) PyUnicode_DATA(u);
11107        Py_MEMCPY(to, PyUnicode_DATA(str),
11108                  PyUnicode_GET_LENGTH(str) * char_size);
11109        while (done < nchars) {
11110            n = (done <= nchars-done) ? done : nchars-done;
11111            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11112            done += n;
11113        }
11114    }
11115
11116    return (PyObject*) u;
11117}
11118
11119PyObject *
11120PyUnicode_Replace(PyObject *obj,
11121                  PyObject *subobj,
11122                  PyObject *replobj,
11123                  Py_ssize_t maxcount)
11124{
11125    PyObject *self;
11126    PyObject *str1;
11127    PyObject *str2;
11128    PyObject *result;
11129
11130    self = PyUnicode_FromObject(obj);
11131    if (self == NULL || PyUnicode_READY(self) == -1)
11132        return NULL;
11133    str1 = PyUnicode_FromObject(subobj);
11134    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11135        Py_DECREF(self);
11136        return NULL;
11137    }
11138    str2 = PyUnicode_FromObject(replobj);
11139    if (str2 == NULL || PyUnicode_READY(str2)) {
11140        Py_DECREF(self);
11141        Py_DECREF(str1);
11142        return NULL;
11143    }
11144    result = replace(self, str1, str2, maxcount);
11145    Py_DECREF(self);
11146    Py_DECREF(str1);
11147    Py_DECREF(str2);
11148    return result;
11149}
11150
11151PyDoc_STRVAR(replace__doc__,
11152             "S.replace(old, new[, count]) -> str\n\
11153\n\
11154Return a copy of S with all occurrences of substring\n\
11155old replaced by new.  If the optional argument count is\n\
11156given, only the first count occurrences are replaced.");
11157
11158static PyObject*
11159unicode_replace(PyObject *self, PyObject *args)
11160{
11161    PyObject *str1;
11162    PyObject *str2;
11163    Py_ssize_t maxcount = -1;
11164    PyObject *result;
11165
11166    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11167        return NULL;
11168    if (!PyUnicode_READY(self) == -1)
11169        return NULL;
11170    str1 = PyUnicode_FromObject(str1);
11171    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11172        return NULL;
11173    str2 = PyUnicode_FromObject(str2);
11174    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11175        Py_DECREF(str1);
11176        return NULL;
11177    }
11178
11179    result = replace(self, str1, str2, maxcount);
11180
11181    Py_DECREF(str1);
11182    Py_DECREF(str2);
11183    return result;
11184}
11185
11186static PyObject *
11187unicode_repr(PyObject *unicode)
11188{
11189    PyObject *repr;
11190    Py_ssize_t isize;
11191    Py_ssize_t osize, squote, dquote, i, o;
11192    Py_UCS4 max, quote;
11193    int ikind, okind;
11194    void *idata, *odata;
11195
11196    if (PyUnicode_READY(unicode) == -1)
11197        return NULL;
11198
11199    isize = PyUnicode_GET_LENGTH(unicode);
11200    idata = PyUnicode_DATA(unicode);
11201
11202    /* Compute length of output, quote characters, and
11203       maximum character */
11204    osize = 2; /* quotes */
11205    max = 127;
11206    squote = dquote = 0;
11207    ikind = PyUnicode_KIND(unicode);
11208    for (i = 0; i < isize; i++) {
11209        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11210        switch (ch) {
11211        case '\'': squote++; osize++; break;
11212        case '"':  dquote++; osize++; break;
11213        case '\\': case '\t': case '\r': case '\n':
11214            osize += 2; break;
11215        default:
11216            /* Fast-path ASCII */
11217            if (ch < ' ' || ch == 0x7f)
11218                osize += 4; /* \xHH */
11219            else if (ch < 0x7f)
11220                osize++;
11221            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11222                osize++;
11223                max = ch > max ? ch : max;
11224            }
11225            else if (ch < 0x100)
11226                osize += 4; /* \xHH */
11227            else if (ch < 0x10000)
11228                osize += 6; /* \uHHHH */
11229            else
11230                osize += 10; /* \uHHHHHHHH */
11231        }
11232    }
11233
11234    quote = '\'';
11235    if (squote) {
11236        if (dquote)
11237            /* Both squote and dquote present. Use squote,
11238               and escape them */
11239            osize += squote;
11240        else
11241            quote = '"';
11242    }
11243
11244    repr = PyUnicode_New(osize, max);
11245    if (repr == NULL)
11246        return NULL;
11247    okind = PyUnicode_KIND(repr);
11248    odata = PyUnicode_DATA(repr);
11249
11250    PyUnicode_WRITE(okind, odata, 0, quote);
11251    PyUnicode_WRITE(okind, odata, osize-1, quote);
11252
11253    for (i = 0, o = 1; i < isize; i++) {
11254        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11255
11256        /* Escape quotes and backslashes */
11257        if ((ch == quote) || (ch == '\\')) {
11258            PyUnicode_WRITE(okind, odata, o++, '\\');
11259            PyUnicode_WRITE(okind, odata, o++, ch);
11260            continue;
11261        }
11262
11263        /* Map special whitespace to '\t', \n', '\r' */
11264        if (ch == '\t') {
11265            PyUnicode_WRITE(okind, odata, o++, '\\');
11266            PyUnicode_WRITE(okind, odata, o++, 't');
11267        }
11268        else if (ch == '\n') {
11269            PyUnicode_WRITE(okind, odata, o++, '\\');
11270            PyUnicode_WRITE(okind, odata, o++, 'n');
11271        }
11272        else if (ch == '\r') {
11273            PyUnicode_WRITE(okind, odata, o++, '\\');
11274            PyUnicode_WRITE(okind, odata, o++, 'r');
11275        }
11276
11277        /* Map non-printable US ASCII to '\xhh' */
11278        else if (ch < ' ' || ch == 0x7F) {
11279            PyUnicode_WRITE(okind, odata, o++, '\\');
11280            PyUnicode_WRITE(okind, odata, o++, 'x');
11281            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11282            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11283        }
11284
11285        /* Copy ASCII characters as-is */
11286        else if (ch < 0x7F) {
11287            PyUnicode_WRITE(okind, odata, o++, ch);
11288        }
11289
11290        /* Non-ASCII characters */
11291        else {
11292            /* Map Unicode whitespace and control characters
11293               (categories Z* and C* except ASCII space)
11294            */
11295            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11296                /* Map 8-bit characters to '\xhh' */
11297                if (ch <= 0xff) {
11298                    PyUnicode_WRITE(okind, odata, o++, '\\');
11299                    PyUnicode_WRITE(okind, odata, o++, 'x');
11300                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11301                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11302                }
11303                /* Map 21-bit characters to '\U00xxxxxx' */
11304                else if (ch >= 0x10000) {
11305                    PyUnicode_WRITE(okind, odata, o++, '\\');
11306                    PyUnicode_WRITE(okind, odata, o++, 'U');
11307                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11308                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11309                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11310                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11311                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11312                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11313                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11314                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11315                }
11316                /* Map 16-bit characters to '\uxxxx' */
11317                else {
11318                    PyUnicode_WRITE(okind, odata, o++, '\\');
11319                    PyUnicode_WRITE(okind, odata, o++, 'u');
11320                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11321                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11322                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11323                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11324                }
11325            }
11326            /* Copy characters as-is */
11327            else {
11328                PyUnicode_WRITE(okind, odata, o++, ch);
11329            }
11330        }
11331    }
11332    /* Closing quote already added at the beginning */
11333    return repr;
11334}
11335
11336PyDoc_STRVAR(rfind__doc__,
11337             "S.rfind(sub[, start[, end]]) -> int\n\
11338\n\
11339Return the highest index in S where substring sub is found,\n\
11340such that sub is contained within S[start:end].  Optional\n\
11341arguments start and end are interpreted as in slice notation.\n\
11342\n\
11343Return -1 on failure.");
11344
11345static PyObject *
11346unicode_rfind(PyObject *self, PyObject *args)
11347{
11348    PyUnicodeObject *substring;
11349    Py_ssize_t start;
11350    Py_ssize_t end;
11351    Py_ssize_t result;
11352
11353    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11354                                            &start, &end))
11355        return NULL;
11356
11357    if (PyUnicode_READY(self) == -1)
11358        return NULL;
11359    if (PyUnicode_READY(substring) == -1)
11360        return NULL;
11361
11362    result = any_find_slice(
11363        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11364        self, (PyObject*)substring, start, end
11365        );
11366
11367    Py_DECREF(substring);
11368
11369    if (result == -2)
11370        return NULL;
11371
11372    return PyLong_FromSsize_t(result);
11373}
11374
11375PyDoc_STRVAR(rindex__doc__,
11376             "S.rindex(sub[, start[, end]]) -> int\n\
11377\n\
11378Like S.rfind() but raise ValueError when the substring is not found.");
11379
11380static PyObject *
11381unicode_rindex(PyObject *self, PyObject *args)
11382{
11383    PyUnicodeObject *substring;
11384    Py_ssize_t start;
11385    Py_ssize_t end;
11386    Py_ssize_t result;
11387
11388    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11389                                            &start, &end))
11390        return NULL;
11391
11392    if (PyUnicode_READY(self) == -1)
11393        return NULL;
11394    if (PyUnicode_READY(substring) == -1)
11395        return NULL;
11396
11397    result = any_find_slice(
11398        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11399        self, (PyObject*)substring, start, end
11400        );
11401
11402    Py_DECREF(substring);
11403
11404    if (result == -2)
11405        return NULL;
11406
11407    if (result < 0) {
11408        PyErr_SetString(PyExc_ValueError, "substring not found");
11409        return NULL;
11410    }
11411
11412    return PyLong_FromSsize_t(result);
11413}
11414
11415PyDoc_STRVAR(rjust__doc__,
11416             "S.rjust(width[, fillchar]) -> str\n\
11417\n\
11418Return S right-justified in a string of length width. Padding is\n\
11419done using the specified fill character (default is a space).");
11420
11421static PyObject *
11422unicode_rjust(PyUnicodeObject *self, PyObject *args)
11423{
11424    Py_ssize_t width;
11425    Py_UCS4 fillchar = ' ';
11426
11427    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11428        return NULL;
11429
11430    if (PyUnicode_READY(self) == -1)
11431        return NULL;
11432
11433    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11434        Py_INCREF(self);
11435        return (PyObject*) self;
11436    }
11437
11438    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11439}
11440
11441PyObject *
11442PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11443{
11444    PyObject *result;
11445
11446    s = PyUnicode_FromObject(s);
11447    if (s == NULL)
11448        return NULL;
11449    if (sep != NULL) {
11450        sep = PyUnicode_FromObject(sep);
11451        if (sep == NULL) {
11452            Py_DECREF(s);
11453            return NULL;
11454        }
11455    }
11456
11457    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11458
11459    Py_DECREF(s);
11460    Py_XDECREF(sep);
11461    return result;
11462}
11463
11464PyDoc_STRVAR(split__doc__,
11465             "S.split([sep[, maxsplit]]) -> list of strings\n\
11466\n\
11467Return a list of the words in S, using sep as the\n\
11468delimiter string.  If maxsplit is given, at most maxsplit\n\
11469splits are done. If sep is not specified or is None, any\n\
11470whitespace string is a separator and empty strings are\n\
11471removed from the result.");
11472
11473static PyObject*
11474unicode_split(PyUnicodeObject *self, PyObject *args)
11475{
11476    PyObject *substring = Py_None;
11477    Py_ssize_t maxcount = -1;
11478
11479    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11480        return NULL;
11481
11482    if (substring == Py_None)
11483        return split(self, NULL, maxcount);
11484    else if (PyUnicode_Check(substring))
11485        return split(self, (PyUnicodeObject *)substring, maxcount);
11486    else
11487        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11488}
11489
11490PyObject *
11491PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11492{
11493    PyObject* str_obj;
11494    PyObject* sep_obj;
11495    PyObject* out;
11496    int kind1, kind2, kind;
11497    void *buf1 = NULL, *buf2 = NULL;
11498    Py_ssize_t len1, len2;
11499
11500    str_obj = PyUnicode_FromObject(str_in);
11501    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11502        return NULL;
11503    sep_obj = PyUnicode_FromObject(sep_in);
11504    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11505        Py_DECREF(str_obj);
11506        return NULL;
11507    }
11508
11509    kind1 = PyUnicode_KIND(str_in);
11510    kind2 = PyUnicode_KIND(sep_obj);
11511    kind = kind1 > kind2 ? kind1 : kind2;
11512    buf1 = PyUnicode_DATA(str_in);
11513    if (kind1 != kind)
11514        buf1 = _PyUnicode_AsKind(str_in, kind);
11515    if (!buf1)
11516        goto onError;
11517    buf2 = PyUnicode_DATA(sep_obj);
11518    if (kind2 != kind)
11519        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11520    if (!buf2)
11521        goto onError;
11522    len1 = PyUnicode_GET_LENGTH(str_obj);
11523    len2 = PyUnicode_GET_LENGTH(sep_obj);
11524
11525    switch(PyUnicode_KIND(str_in)) {
11526    case PyUnicode_1BYTE_KIND:
11527        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11528        break;
11529    case PyUnicode_2BYTE_KIND:
11530        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11531        break;
11532    case PyUnicode_4BYTE_KIND:
11533        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11534        break;
11535    default:
11536        assert(0);
11537        out = 0;
11538    }
11539
11540    Py_DECREF(sep_obj);
11541    Py_DECREF(str_obj);
11542    if (kind1 != kind)
11543        PyMem_Free(buf1);
11544    if (kind2 != kind)
11545        PyMem_Free(buf2);
11546
11547    return out;
11548  onError:
11549    Py_DECREF(sep_obj);
11550    Py_DECREF(str_obj);
11551    if (kind1 != kind && buf1)
11552        PyMem_Free(buf1);
11553    if (kind2 != kind && buf2)
11554        PyMem_Free(buf2);
11555    return NULL;
11556}
11557
11558
11559PyObject *
11560PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11561{
11562    PyObject* str_obj;
11563    PyObject* sep_obj;
11564    PyObject* out;
11565    int kind1, kind2, kind;
11566    void *buf1 = NULL, *buf2 = NULL;
11567    Py_ssize_t len1, len2;
11568
11569    str_obj = PyUnicode_FromObject(str_in);
11570    if (!str_obj)
11571        return NULL;
11572    sep_obj = PyUnicode_FromObject(sep_in);
11573    if (!sep_obj) {
11574        Py_DECREF(str_obj);
11575        return NULL;
11576    }
11577
11578    kind1 = PyUnicode_KIND(str_in);
11579    kind2 = PyUnicode_KIND(sep_obj);
11580    kind = Py_MAX(kind1, kind2);
11581    buf1 = PyUnicode_DATA(str_in);
11582    if (kind1 != kind)
11583        buf1 = _PyUnicode_AsKind(str_in, kind);
11584    if (!buf1)
11585        goto onError;
11586    buf2 = PyUnicode_DATA(sep_obj);
11587    if (kind2 != kind)
11588        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11589    if (!buf2)
11590        goto onError;
11591    len1 = PyUnicode_GET_LENGTH(str_obj);
11592    len2 = PyUnicode_GET_LENGTH(sep_obj);
11593
11594    switch(PyUnicode_KIND(str_in)) {
11595    case PyUnicode_1BYTE_KIND:
11596        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11597        break;
11598    case PyUnicode_2BYTE_KIND:
11599        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11600        break;
11601    case PyUnicode_4BYTE_KIND:
11602        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11603        break;
11604    default:
11605        assert(0);
11606        out = 0;
11607    }
11608
11609    Py_DECREF(sep_obj);
11610    Py_DECREF(str_obj);
11611    if (kind1 != kind)
11612        PyMem_Free(buf1);
11613    if (kind2 != kind)
11614        PyMem_Free(buf2);
11615
11616    return out;
11617  onError:
11618    Py_DECREF(sep_obj);
11619    Py_DECREF(str_obj);
11620    if (kind1 != kind && buf1)
11621        PyMem_Free(buf1);
11622    if (kind2 != kind && buf2)
11623        PyMem_Free(buf2);
11624    return NULL;
11625}
11626
11627PyDoc_STRVAR(partition__doc__,
11628             "S.partition(sep) -> (head, sep, tail)\n\
11629\n\
11630Search for the separator sep in S, and return the part before it,\n\
11631the separator itself, and the part after it.  If the separator is not\n\
11632found, return S and two empty strings.");
11633
11634static PyObject*
11635unicode_partition(PyUnicodeObject *self, PyObject *separator)
11636{
11637    return PyUnicode_Partition((PyObject *)self, separator);
11638}
11639
11640PyDoc_STRVAR(rpartition__doc__,
11641             "S.rpartition(sep) -> (head, sep, tail)\n\
11642\n\
11643Search for the separator sep in S, starting at the end of S, and return\n\
11644the part before it, the separator itself, and the part after it.  If the\n\
11645separator is not found, return two empty strings and S.");
11646
11647static PyObject*
11648unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11649{
11650    return PyUnicode_RPartition((PyObject *)self, separator);
11651}
11652
11653PyObject *
11654PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11655{
11656    PyObject *result;
11657
11658    s = PyUnicode_FromObject(s);
11659    if (s == NULL)
11660        return NULL;
11661    if (sep != NULL) {
11662        sep = PyUnicode_FromObject(sep);
11663        if (sep == NULL) {
11664            Py_DECREF(s);
11665            return NULL;
11666        }
11667    }
11668
11669    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11670
11671    Py_DECREF(s);
11672    Py_XDECREF(sep);
11673    return result;
11674}
11675
11676PyDoc_STRVAR(rsplit__doc__,
11677             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11678\n\
11679Return a list of the words in S, using sep as the\n\
11680delimiter string, starting at the end of the string and\n\
11681working to the front.  If maxsplit is given, at most maxsplit\n\
11682splits are done. If sep is not specified, any whitespace string\n\
11683is a separator.");
11684
11685static PyObject*
11686unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11687{
11688    PyObject *substring = Py_None;
11689    Py_ssize_t maxcount = -1;
11690
11691    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11692        return NULL;
11693
11694    if (substring == Py_None)
11695        return rsplit(self, NULL, maxcount);
11696    else if (PyUnicode_Check(substring))
11697        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
11698    else
11699        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
11700}
11701
11702PyDoc_STRVAR(splitlines__doc__,
11703             "S.splitlines([keepends]) -> list of strings\n\
11704\n\
11705Return a list of the lines in S, breaking at line boundaries.\n\
11706Line breaks are not included in the resulting list unless keepends\n\
11707is given and true.");
11708
11709static PyObject*
11710unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11711{
11712    static char *kwlist[] = {"keepends", 0};
11713    int keepends = 0;
11714
11715    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11716                                     kwlist, &keepends))
11717        return NULL;
11718
11719    return PyUnicode_Splitlines((PyObject *)self, keepends);
11720}
11721
11722static
11723PyObject *unicode_str(PyObject *self)
11724{
11725    if (PyUnicode_CheckExact(self)) {
11726        Py_INCREF(self);
11727        return self;
11728    } else
11729        /* Subtype -- return genuine unicode string with the same value. */
11730        return PyUnicode_Copy(self);
11731}
11732
11733PyDoc_STRVAR(swapcase__doc__,
11734             "S.swapcase() -> str\n\
11735\n\
11736Return a copy of S with uppercase characters converted to lowercase\n\
11737and vice versa.");
11738
11739static PyObject*
11740unicode_swapcase(PyUnicodeObject *self)
11741{
11742    return fixup(self, fixswapcase);
11743}
11744
11745PyDoc_STRVAR(maketrans__doc__,
11746             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
11747\n\
11748Return a translation table usable for str.translate().\n\
11749If there is only one argument, it must be a dictionary mapping Unicode\n\
11750ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
11751Character keys will be then converted to ordinals.\n\
11752If there are two arguments, they must be strings of equal length, and\n\
11753in the resulting dictionary, each character in x will be mapped to the\n\
11754character at the same position in y. If there is a third argument, it\n\
11755must be a string, whose characters will be mapped to None in the result.");
11756
11757static PyObject*
11758unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11759{
11760    PyObject *x, *y = NULL, *z = NULL;
11761    PyObject *new = NULL, *key, *value;
11762    Py_ssize_t i = 0;
11763    int res;
11764
11765    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11766        return NULL;
11767    new = PyDict_New();
11768    if (!new)
11769        return NULL;
11770    if (y != NULL) {
11771        int x_kind, y_kind, z_kind;
11772        void *x_data, *y_data, *z_data;
11773
11774        /* x must be a string too, of equal length */
11775        if (!PyUnicode_Check(x)) {
11776            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11777                            "be a string if there is a second argument");
11778            goto err;
11779        }
11780        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
11781            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11782                            "arguments must have equal length");
11783            goto err;
11784        }
11785        /* create entries for translating chars in x to those in y */
11786        x_kind = PyUnicode_KIND(x);
11787        y_kind = PyUnicode_KIND(y);
11788        x_data = PyUnicode_DATA(x);
11789        y_data = PyUnicode_DATA(y);
11790        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11791            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11792            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
11793            if (!key || !value)
11794                goto err;
11795            res = PyDict_SetItem(new, key, value);
11796            Py_DECREF(key);
11797            Py_DECREF(value);
11798            if (res < 0)
11799                goto err;
11800        }
11801        /* create entries for deleting chars in z */
11802        if (z != NULL) {
11803            z_kind = PyUnicode_KIND(z);
11804            z_data = PyUnicode_DATA(z);
11805            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
11806                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
11807                if (!key)
11808                    goto err;
11809                res = PyDict_SetItem(new, key, Py_None);
11810                Py_DECREF(key);
11811                if (res < 0)
11812                    goto err;
11813            }
11814        }
11815    } else {
11816        int kind;
11817        void *data;
11818
11819        /* x must be a dict */
11820        if (!PyDict_CheckExact(x)) {
11821            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11822                            "to maketrans it must be a dict");
11823            goto err;
11824        }
11825        /* copy entries into the new dict, converting string keys to int keys */
11826        while (PyDict_Next(x, &i, &key, &value)) {
11827            if (PyUnicode_Check(key)) {
11828                /* convert string keys to integer keys */
11829                PyObject *newkey;
11830                if (PyUnicode_GET_SIZE(key) != 1) {
11831                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
11832                                    "table must be of length 1");
11833                    goto err;
11834                }
11835                kind = PyUnicode_KIND(key);
11836                data = PyUnicode_DATA(key);
11837                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
11838                if (!newkey)
11839                    goto err;
11840                res = PyDict_SetItem(new, newkey, value);
11841                Py_DECREF(newkey);
11842                if (res < 0)
11843                    goto err;
11844            } else if (PyLong_Check(key)) {
11845                /* just keep integer keys */
11846                if (PyDict_SetItem(new, key, value) < 0)
11847                    goto err;
11848            } else {
11849                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11850                                "be strings or integers");
11851                goto err;
11852            }
11853        }
11854    }
11855    return new;
11856  err:
11857    Py_DECREF(new);
11858    return NULL;
11859}
11860
11861PyDoc_STRVAR(translate__doc__,
11862             "S.translate(table) -> str\n\
11863\n\
11864Return a copy of the string S, where all characters have been mapped\n\
11865through the given translation table, which must be a mapping of\n\
11866Unicode ordinals to Unicode ordinals, strings, or None.\n\
11867Unmapped characters are left untouched. Characters mapped to None\n\
11868are deleted.");
11869
11870static PyObject*
11871unicode_translate(PyObject *self, PyObject *table)
11872{
11873    return _PyUnicode_TranslateCharmap(self, table, "ignore");
11874}
11875
11876PyDoc_STRVAR(upper__doc__,
11877             "S.upper() -> str\n\
11878\n\
11879Return a copy of S converted to uppercase.");
11880
11881static PyObject*
11882unicode_upper(PyUnicodeObject *self)
11883{
11884    return fixup(self, fixupper);
11885}
11886
11887PyDoc_STRVAR(zfill__doc__,
11888             "S.zfill(width) -> str\n\
11889\n\
11890Pad a numeric string S with zeros on the left, to fill a field\n\
11891of the specified width. The string S is never truncated.");
11892
11893static PyObject *
11894unicode_zfill(PyUnicodeObject *self, PyObject *args)
11895{
11896    Py_ssize_t fill;
11897    PyUnicodeObject *u;
11898    Py_ssize_t width;
11899    int kind;
11900    void *data;
11901    Py_UCS4 chr;
11902
11903    if (PyUnicode_READY(self) == -1)
11904        return NULL;
11905
11906    if (!PyArg_ParseTuple(args, "n:zfill", &width))
11907        return NULL;
11908
11909    if (PyUnicode_GET_LENGTH(self) >= width) {
11910        if (PyUnicode_CheckExact(self)) {
11911            Py_INCREF(self);
11912            return (PyObject*) self;
11913        }
11914        else
11915            return PyUnicode_Copy((PyObject*)self);
11916    }
11917
11918    fill = width - _PyUnicode_LENGTH(self);
11919
11920    u = pad(self, fill, 0, '0');
11921
11922    if (u == NULL)
11923        return NULL;
11924
11925    kind = PyUnicode_KIND(u);
11926    data = PyUnicode_DATA(u);
11927    chr = PyUnicode_READ(kind, data, fill);
11928
11929    if (chr == '+' || chr == '-') {
11930        /* move sign to beginning of string */
11931        PyUnicode_WRITE(kind, data, 0, chr);
11932        PyUnicode_WRITE(kind, data, fill, '0');
11933    }
11934
11935    return (PyObject*) u;
11936}
11937
#if 0
/* Compiled out by the enclosing #if 0.  Forwards the string to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
11945
11946PyDoc_STRVAR(startswith__doc__,
11947             "S.startswith(prefix[, start[, end]]) -> bool\n\
11948\n\
11949Return True if S starts with the specified prefix, False otherwise.\n\
11950With optional start, test S beginning at that position.\n\
11951With optional end, stop comparing S at that position.\n\
11952prefix can also be a tuple of strings to try.");
11953
11954static PyObject *
11955unicode_startswith(PyUnicodeObject *self,
11956                   PyObject *args)
11957{
11958    PyObject *subobj;
11959    PyUnicodeObject *substring;
11960    Py_ssize_t start = 0;
11961    Py_ssize_t end = PY_SSIZE_T_MAX;
11962    int result;
11963
11964    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
11965        return NULL;
11966    if (PyTuple_Check(subobj)) {
11967        Py_ssize_t i;
11968        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
11969            substring = (PyUnicodeObject *)PyUnicode_FromObject(
11970                PyTuple_GET_ITEM(subobj, i));
11971            if (substring == NULL)
11972                return NULL;
11973            result = tailmatch(self, substring, start, end, -1);
11974            Py_DECREF(substring);
11975            if (result) {
11976                Py_RETURN_TRUE;
11977            }
11978        }
11979        /* nothing matched */
11980        Py_RETURN_FALSE;
11981    }
11982    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
11983    if (substring == NULL) {
11984        if (PyErr_ExceptionMatches(PyExc_TypeError))
11985            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
11986                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
11987        return NULL;
11988    }
11989    result = tailmatch(self, substring, start, end, -1);
11990    Py_DECREF(substring);
11991    return PyBool_FromLong(result);
11992}
11993
11994
11995PyDoc_STRVAR(endswith__doc__,
11996             "S.endswith(suffix[, start[, end]]) -> bool\n\
11997\n\
11998Return True if S ends with the specified suffix, False otherwise.\n\
11999With optional start, test S beginning at that position.\n\
12000With optional end, stop comparing S at that position.\n\
12001suffix can also be a tuple of strings to try.");
12002
12003static PyObject *
12004unicode_endswith(PyUnicodeObject *self,
12005                 PyObject *args)
12006{
12007    PyObject *subobj;
12008    PyUnicodeObject *substring;
12009    Py_ssize_t start = 0;
12010    Py_ssize_t end = PY_SSIZE_T_MAX;
12011    int result;
12012
12013    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12014        return NULL;
12015    if (PyTuple_Check(subobj)) {
12016        Py_ssize_t i;
12017        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12018            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12019                PyTuple_GET_ITEM(subobj, i));
12020            if (substring == NULL)
12021                return NULL;
12022            result = tailmatch(self, substring, start, end, +1);
12023            Py_DECREF(substring);
12024            if (result) {
12025                Py_RETURN_TRUE;
12026            }
12027        }
12028        Py_RETURN_FALSE;
12029    }
12030    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12031    if (substring == NULL) {
12032        if (PyErr_ExceptionMatches(PyExc_TypeError))
12033            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12034                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12035        return NULL;
12036    }
12037    result = tailmatch(self, substring, start, end, +1);
12038    Py_DECREF(substring);
12039    return PyBool_FromLong(result);
12040}
12041
12042#include "stringlib/unicode_format.h"
12043
12044PyDoc_STRVAR(format__doc__,
12045             "S.format(*args, **kwargs) -> str\n\
12046\n\
12047Return a formatted version of S, using substitutions from args and kwargs.\n\
12048The substitutions are identified by braces ('{' and '}').");
12049
12050PyDoc_STRVAR(format_map__doc__,
12051             "S.format_map(mapping) -> str\n\
12052\n\
12053Return a formatted version of S, using substitutions from mapping.\n\
12054The substitutions are identified by braces ('{' and '}').");
12055
12056static PyObject *
12057unicode__format__(PyObject* self, PyObject* args)
12058{
12059    PyObject *format_spec;
12060
12061    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12062        return NULL;
12063
12064    return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12065                                     PyUnicode_GET_LENGTH(format_spec));
12066}
12067
12068PyDoc_STRVAR(p_format__doc__,
12069             "S.__format__(format_spec) -> str\n\
12070\n\
12071Return a formatted version of S as described by format_spec.");
12072
12073static PyObject *
12074unicode__sizeof__(PyUnicodeObject *v)
12075{
12076    Py_ssize_t size;
12077
12078    /* If it's a compact object, account for base structure +
12079       character data. */
12080    if (PyUnicode_IS_COMPACT_ASCII(v))
12081        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12082    else if (PyUnicode_IS_COMPACT(v))
12083        size = sizeof(PyCompactUnicodeObject) +
12084            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12085    else {
12086        /* If it is a two-block object, account for base object, and
12087           for character block if present. */
12088        size = sizeof(PyUnicodeObject);
12089        if (_PyUnicode_DATA_ANY(v))
12090            size += (PyUnicode_GET_LENGTH(v) + 1) *
12091                PyUnicode_CHARACTER_SIZE(v);
12092    }
12093    /* If the wstr pointer is present, account for it unless it is shared
12094       with the data pointer. Check if the data is not shared. */
12095    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12096        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12097    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12098        size += PyUnicode_UTF8_LENGTH(v) + 1;
12099
12100    return PyLong_FromSsize_t(size);
12101}
12102
12103PyDoc_STRVAR(sizeof__doc__,
12104             "S.__sizeof__() -> size of S in memory, in bytes");
12105
12106static PyObject *
12107unicode_getnewargs(PyObject *v)
12108{
12109    PyObject *copy = PyUnicode_Copy(v);
12110    if (!copy)
12111        return NULL;
12112    return Py_BuildValue("(N)", copy);
12113}
12114
12115static PyMethodDef unicode_methods[] = {
12116
12117    /* Order is according to common usage: often used methods should
12118       appear first, since lookup is done sequentially. */
12119
12120    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12121    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12122    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12123    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12124    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12125    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12126    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12127    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12128    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12129    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12130    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12131    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12132    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12133    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12134    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12135    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12136    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12137    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12138    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12139    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12140    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12141    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12142    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12143    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12144    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12145    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12146    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12147    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12148    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12149    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12150    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12151    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12152    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12153    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12154    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12155    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12156    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12157    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12158    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12159    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12160    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12161    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12162    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
12163    {"maketrans", (PyCFunction) unicode_maketrans,
12164     METH_VARARGS | METH_STATIC, maketrans__doc__},
12165    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12166#if 0
12167    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12168#endif
12169
12170#if 0
12171    /* These methods are just used for debugging the implementation. */
12172    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12173#endif
12174
12175    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
12176    {NULL, NULL}
12177};
12178
12179static PyObject *
12180unicode_mod(PyObject *v, PyObject *w)
12181{
12182    if (!PyUnicode_Check(v))
12183        Py_RETURN_NOTIMPLEMENTED;
12184    return PyUnicode_Format(v, w);
12185}
12186
12187static PyNumberMethods unicode_as_number = {
12188    0,              /*nb_add*/
12189    0,              /*nb_subtract*/
12190    0,              /*nb_multiply*/
12191    unicode_mod,            /*nb_remainder*/
12192};
12193
12194static PySequenceMethods unicode_as_sequence = {
12195    (lenfunc) unicode_length,       /* sq_length */
12196    PyUnicode_Concat,           /* sq_concat */
12197    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12198    (ssizeargfunc) unicode_getitem,     /* sq_item */
12199    0,                  /* sq_slice */
12200    0,                  /* sq_ass_item */
12201    0,                  /* sq_ass_slice */
12202    PyUnicode_Contains,         /* sq_contains */
12203};
12204
12205static PyObject*
12206unicode_subscript(PyUnicodeObject* self, PyObject* item)
12207{
12208    if (PyUnicode_READY(self) == -1)
12209        return NULL;
12210
12211    if (PyIndex_Check(item)) {
12212        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12213        if (i == -1 && PyErr_Occurred())
12214            return NULL;
12215        if (i < 0)
12216            i += PyUnicode_GET_LENGTH(self);
12217        return unicode_getitem((PyObject*)self, i);
12218    } else if (PySlice_Check(item)) {
12219        Py_ssize_t start, stop, step, slicelength, cur, i;
12220        const Py_UNICODE* source_buf;
12221        Py_UNICODE* result_buf;
12222        PyObject* result;
12223
12224        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12225                                 &start, &stop, &step, &slicelength) < 0) {
12226            return NULL;
12227        }
12228
12229        if (slicelength <= 0) {
12230            return PyUnicode_New(0, 0);
12231        } else if (start == 0 && step == 1 &&
12232                   slicelength == PyUnicode_GET_LENGTH(self) &&
12233                   PyUnicode_CheckExact(self)) {
12234            Py_INCREF(self);
12235            return (PyObject *)self;
12236        } else if (step == 1) {
12237            return PyUnicode_Substring((PyObject*)self,
12238                                       start, start + slicelength);
12239        } else {
12240            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
12241            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
12242                                                       sizeof(Py_UNICODE));
12243
12244            if (result_buf == NULL)
12245                return PyErr_NoMemory();
12246
12247            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12248                result_buf[i] = source_buf[cur];
12249            }
12250
12251            result = PyUnicode_FromUnicode(result_buf, slicelength);
12252            PyObject_FREE(result_buf);
12253            return result;
12254        }
12255    } else {
12256        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12257        return NULL;
12258    }
12259}
12260
12261static PyMappingMethods unicode_as_mapping = {
12262    (lenfunc)unicode_length,        /* mp_length */
12263    (binaryfunc)unicode_subscript,  /* mp_subscript */
12264    (objobjargproc)0,           /* mp_ass_subscript */
12265};
12266
12267
12268/* Helpers for PyUnicode_Format() */
12269
12270static PyObject *
12271getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12272{
12273    Py_ssize_t argidx = *p_argidx;
12274    if (argidx < arglen) {
12275        (*p_argidx)++;
12276        if (arglen < 0)
12277            return args;
12278        else
12279            return PyTuple_GetItem(args, argidx);
12280    }
12281    PyErr_SetString(PyExc_TypeError,
12282                    "not enough arguments for format string");
12283    return NULL;
12284}
12285
12286/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12287
12288static PyObject *
12289formatfloat(PyObject *v, int flags, int prec, int type)
12290{
12291    char *p;
12292    PyObject *result;
12293    double x;
12294
12295    x = PyFloat_AsDouble(v);
12296    if (x == -1.0 && PyErr_Occurred())
12297        return NULL;
12298
12299    if (prec < 0)
12300        prec = 6;
12301
12302    p = PyOS_double_to_string(x, type, prec,
12303                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12304    if (p == NULL)
12305        return NULL;
12306    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12307    PyMem_Free(p);
12308    return result;
12309}
12310
12311static PyObject*
12312formatlong(PyObject *val, int flags, int prec, int type)
12313{
12314    char *buf;
12315    int len;
12316    PyObject *str; /* temporary string object. */
12317    PyObject *result;
12318
12319    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12320    if (!str)
12321        return NULL;
12322    result = PyUnicode_DecodeASCII(buf, len, NULL);
12323    Py_DECREF(str);
12324    return result;
12325}
12326
12327static int
12328formatchar(Py_UCS4 *buf,
12329           size_t buflen,
12330           PyObject *v)
12331{
12332    /* presume that the buffer is at least 3 characters long */
12333    if (PyUnicode_Check(v)) {
12334        if (PyUnicode_GET_LENGTH(v) == 1) {
12335            buf[0] = PyUnicode_READ_CHAR(v, 0);
12336            buf[1] = '\0';
12337            return 1;
12338        }
12339        goto onError;
12340    }
12341    else {
12342        /* Integer input truncated to a character */
12343        long x;
12344        x = PyLong_AsLong(v);
12345        if (x == -1 && PyErr_Occurred())
12346            goto onError;
12347
12348        if (x < 0 || x > 0x10ffff) {
12349            PyErr_SetString(PyExc_OverflowError,
12350                            "%c arg not in range(0x110000)");
12351            return -1;
12352        }
12353
12354        buf[0] = (Py_UCS4) x;
12355        buf[1] = '\0';
12356        return 1;
12357    }
12358
12359  onError:
12360    PyErr_SetString(PyExc_TypeError,
12361                    "%c requires int or char");
12362    return -1;
12363}
12364
12365/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
12366   FORMATBUFLEN is the length of the buffer in which chars are formatted.
12367*/
12368#define FORMATBUFLEN (size_t)10
12369
12370PyObject *
12371PyUnicode_Format(PyObject *format, PyObject *args)
12372{
12373    void *fmt;
12374    int fmtkind;
12375    PyObject *result;
12376    Py_UCS4 *res, *res0;
12377    Py_UCS4 max;
12378    int kind;
12379    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12380    int args_owned = 0;
12381    PyObject *dict = NULL;
12382    PyUnicodeObject *uformat;
12383
12384    if (format == NULL || args == NULL) {
12385        PyErr_BadInternalCall();
12386        return NULL;
12387    }
12388    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12389    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12390        return NULL;
12391    fmt = PyUnicode_DATA(uformat);
12392    fmtkind = PyUnicode_KIND(uformat);
12393    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12394    fmtpos = 0;
12395
12396    reslen = rescnt = fmtcnt + 100;
12397    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12398    if (res0 == NULL) {
12399        PyErr_NoMemory();
12400        goto onError;
12401    }
12402
12403    if (PyTuple_Check(args)) {
12404        arglen = PyTuple_Size(args);
12405        argidx = 0;
12406    }
12407    else {
12408        arglen = -1;
12409        argidx = -2;
12410    }
12411    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12412        !PyUnicode_Check(args))
12413        dict = args;
12414
12415    while (--fmtcnt >= 0) {
12416        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12417            if (--rescnt < 0) {
12418                rescnt = fmtcnt + 100;
12419                reslen += rescnt;
12420                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12421                if (res0 == NULL){
12422                    PyErr_NoMemory();
12423                    goto onError;
12424                }
12425                res = res0 + reslen - rescnt;
12426                --rescnt;
12427            }
12428            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12429        }
12430        else {
12431            /* Got a format specifier */
12432            int flags = 0;
12433            Py_ssize_t width = -1;
12434            int prec = -1;
12435            Py_UCS4 c = '\0';
12436            Py_UCS4 fill;
12437            int isnumok;
12438            PyObject *v = NULL;
12439            PyObject *temp = NULL;
12440            void *pbuf;
12441            Py_ssize_t pindex;
12442            Py_UNICODE sign;
12443            Py_ssize_t len, len1;
12444            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12445
12446            fmtpos++;
12447            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12448                Py_ssize_t keystart;
12449                Py_ssize_t keylen;
12450                PyObject *key;
12451                int pcount = 1;
12452
12453                if (dict == NULL) {
12454                    PyErr_SetString(PyExc_TypeError,
12455                                    "format requires a mapping");
12456                    goto onError;
12457                }
12458                ++fmtpos;
12459                --fmtcnt;
12460                keystart = fmtpos;
12461                /* Skip over balanced parentheses */
12462                while (pcount > 0 && --fmtcnt >= 0) {
12463                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12464                        --pcount;
12465                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12466                        ++pcount;
12467                    fmtpos++;
12468                }
12469                keylen = fmtpos - keystart - 1;
12470                if (fmtcnt < 0 || pcount > 0) {
12471                    PyErr_SetString(PyExc_ValueError,
12472                                    "incomplete format key");
12473                    goto onError;
12474                }
12475                key = PyUnicode_Substring((PyObject*)uformat,
12476                                          keystart, keystart + keylen);
12477                if (key == NULL)
12478                    goto onError;
12479                if (args_owned) {
12480                    Py_DECREF(args);
12481                    args_owned = 0;
12482                }
12483                args = PyObject_GetItem(dict, key);
12484                Py_DECREF(key);
12485                if (args == NULL) {
12486                    goto onError;
12487                }
12488                args_owned = 1;
12489                arglen = -1;
12490                argidx = -2;
12491            }
12492            while (--fmtcnt >= 0) {
12493                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12494                case '-': flags |= F_LJUST; continue;
12495                case '+': flags |= F_SIGN; continue;
12496                case ' ': flags |= F_BLANK; continue;
12497                case '#': flags |= F_ALT; continue;
12498                case '0': flags |= F_ZERO; continue;
12499                }
12500                break;
12501            }
12502            if (c == '*') {
12503                v = getnextarg(args, arglen, &argidx);
12504                if (v == NULL)
12505                    goto onError;
12506                if (!PyLong_Check(v)) {
12507                    PyErr_SetString(PyExc_TypeError,
12508                                    "* wants int");
12509                    goto onError;
12510                }
12511                width = PyLong_AsLong(v);
12512                if (width == -1 && PyErr_Occurred())
12513                    goto onError;
12514                if (width < 0) {
12515                    flags |= F_LJUST;
12516                    width = -width;
12517                }
12518                if (--fmtcnt >= 0)
12519                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12520            }
12521            else if (c >= '0' && c <= '9') {
12522                width = c - '0';
12523                while (--fmtcnt >= 0) {
12524                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12525                    if (c < '0' || c > '9')
12526                        break;
12527                    if ((width*10) / 10 != width) {
12528                        PyErr_SetString(PyExc_ValueError,
12529                                        "width too big");
12530                        goto onError;
12531                    }
12532                    width = width*10 + (c - '0');
12533                }
12534            }
12535            if (c == '.') {
12536                prec = 0;
12537                if (--fmtcnt >= 0)
12538                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12539                if (c == '*') {
12540                    v = getnextarg(args, arglen, &argidx);
12541                    if (v == NULL)
12542                        goto onError;
12543                    if (!PyLong_Check(v)) {
12544                        PyErr_SetString(PyExc_TypeError,
12545                                        "* wants int");
12546                        goto onError;
12547                    }
12548                    prec = PyLong_AsLong(v);
12549                    if (prec == -1 && PyErr_Occurred())
12550                        goto onError;
12551                    if (prec < 0)
12552                        prec = 0;
12553                    if (--fmtcnt >= 0)
12554                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12555                }
12556                else if (c >= '0' && c <= '9') {
12557                    prec = c - '0';
12558                    while (--fmtcnt >= 0) {
12559                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12560                        if (c < '0' || c > '9')
12561                            break;
12562                        if ((prec*10) / 10 != prec) {
12563                            PyErr_SetString(PyExc_ValueError,
12564                                            "prec too big");
12565                            goto onError;
12566                        }
12567                        prec = prec*10 + (c - '0');
12568                    }
12569                }
12570            } /* prec */
12571            if (fmtcnt >= 0) {
12572                if (c == 'h' || c == 'l' || c == 'L') {
12573                    if (--fmtcnt >= 0)
12574                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12575                }
12576            }
12577            if (fmtcnt < 0) {
12578                PyErr_SetString(PyExc_ValueError,
12579                                "incomplete format");
12580                goto onError;
12581            }
12582            if (c != '%') {
12583                v = getnextarg(args, arglen, &argidx);
12584                if (v == NULL)
12585                    goto onError;
12586            }
12587            sign = 0;
12588            fill = ' ';
12589            switch (c) {
12590
12591            case '%':
12592                pbuf = formatbuf;
12593                kind = PyUnicode_4BYTE_KIND;
12594                /* presume that buffer length is at least 1 */
12595                PyUnicode_WRITE(kind, pbuf, 0, '%');
12596                len = 1;
12597                break;
12598
12599            case 's':
12600            case 'r':
12601            case 'a':
12602                if (PyUnicode_CheckExact(v) && c == 's') {
12603                    temp = v;
12604                    Py_INCREF(temp);
12605                }
12606                else {
12607                    if (c == 's')
12608                        temp = PyObject_Str(v);
12609                    else if (c == 'r')
12610                        temp = PyObject_Repr(v);
12611                    else
12612                        temp = PyObject_ASCII(v);
12613                    if (temp == NULL)
12614                        goto onError;
12615                    if (PyUnicode_Check(temp))
12616                        /* nothing to do */;
12617                    else {
12618                        Py_DECREF(temp);
12619                        PyErr_SetString(PyExc_TypeError,
12620                                        "%s argument has non-string str()");
12621                        goto onError;
12622                    }
12623                }
12624                if (PyUnicode_READY(temp) == -1) {
12625                    Py_CLEAR(temp);
12626                    goto onError;
12627                }
12628                pbuf = PyUnicode_DATA(temp);
12629                kind = PyUnicode_KIND(temp);
12630                len = PyUnicode_GET_LENGTH(temp);
12631                if (prec >= 0 && len > prec)
12632                    len = prec;
12633                break;
12634
12635            case 'i':
12636            case 'd':
12637            case 'u':
12638            case 'o':
12639            case 'x':
12640            case 'X':
12641                isnumok = 0;
12642                if (PyNumber_Check(v)) {
12643                    PyObject *iobj=NULL;
12644
12645                    if (PyLong_Check(v)) {
12646                        iobj = v;
12647                        Py_INCREF(iobj);
12648                    }
12649                    else {
12650                        iobj = PyNumber_Long(v);
12651                    }
12652                    if (iobj!=NULL) {
12653                        if (PyLong_Check(iobj)) {
12654                            isnumok = 1;
12655                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12656                            Py_DECREF(iobj);
12657                            if (!temp)
12658                                goto onError;
12659                            if (PyUnicode_READY(temp) == -1) {
12660                                Py_CLEAR(temp);
12661                                goto onError;
12662                            }
12663                            pbuf = PyUnicode_DATA(temp);
12664                            kind = PyUnicode_KIND(temp);
12665                            len = PyUnicode_GET_LENGTH(temp);
12666                            sign = 1;
12667                        }
12668                        else {
12669                            Py_DECREF(iobj);
12670                        }
12671                    }
12672                }
12673                if (!isnumok) {
12674                    PyErr_Format(PyExc_TypeError,
12675                                 "%%%c format: a number is required, "
12676                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12677                    goto onError;
12678                }
12679                if (flags & F_ZERO)
12680                    fill = '0';
12681                break;
12682
12683            case 'e':
12684            case 'E':
12685            case 'f':
12686            case 'F':
12687            case 'g':
12688            case 'G':
12689                temp = formatfloat(v, flags, prec, c);
12690                if (!temp)
12691                    goto onError;
12692                if (PyUnicode_READY(temp) == -1) {
12693                    Py_CLEAR(temp);
12694                    goto onError;
12695                }
12696                pbuf = PyUnicode_DATA(temp);
12697                kind = PyUnicode_KIND(temp);
12698                len = PyUnicode_GET_LENGTH(temp);
12699                sign = 1;
12700                if (flags & F_ZERO)
12701                    fill = '0';
12702                break;
12703
12704            case 'c':
12705                pbuf = formatbuf;
12706                kind = PyUnicode_4BYTE_KIND;
12707                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12708                if (len < 0)
12709                    goto onError;
12710                break;
12711
12712            default:
12713                PyErr_Format(PyExc_ValueError,
12714                             "unsupported format character '%c' (0x%x) "
12715                             "at index %zd",
12716                             (31<=c && c<=126) ? (char)c : '?',
12717                             (int)c,
12718                             fmtpos - 1);
12719                goto onError;
12720            }
12721            /* pbuf is initialized here. */
12722            pindex = 0;
12723            if (sign) {
12724                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12725                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
12726                    sign = PyUnicode_READ(kind, pbuf, pindex++);
12727                    len--;
12728                }
12729                else if (flags & F_SIGN)
12730                    sign = '+';
12731                else if (flags & F_BLANK)
12732                    sign = ' ';
12733                else
12734                    sign = 0;
12735            }
12736            if (width < len)
12737                width = len;
12738            if (rescnt - (sign != 0) < width) {
12739                reslen -= rescnt;
12740                rescnt = width + fmtcnt + 100;
12741                reslen += rescnt;
12742                if (reslen < 0) {
12743                    Py_XDECREF(temp);
12744                    PyErr_NoMemory();
12745                    goto onError;
12746                }
12747                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12748                if (res0 == 0) {
12749                    PyErr_NoMemory();
12750                    Py_XDECREF(temp);
12751                    goto onError;
12752                }
12753                res = res0 + reslen - rescnt;
12754            }
12755            if (sign) {
12756                if (fill != ' ')
12757                    *res++ = sign;
12758                rescnt--;
12759                if (width > len)
12760                    width--;
12761            }
12762            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12763                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12764                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12765                if (fill != ' ') {
12766                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12767                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12768                }
12769                rescnt -= 2;
12770                width -= 2;
12771                if (width < 0)
12772                    width = 0;
12773                len -= 2;
12774            }
12775            if (width > len && !(flags & F_LJUST)) {
12776                do {
12777                    --rescnt;
12778                    *res++ = fill;
12779                } while (--width > len);
12780            }
12781            if (fill == ' ') {
12782                if (sign)
12783                    *res++ = sign;
12784                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12785                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12786                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12787                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12788                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12789                }
12790            }
12791            /* Copy all characters, preserving len */
12792            len1 = len;
12793            while (len1--) {
12794                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12795                rescnt--;
12796            }
12797            while (--width >= len) {
12798                --rescnt;
12799                *res++ = ' ';
12800            }
12801            if (dict && (argidx < arglen) && c != '%') {
12802                PyErr_SetString(PyExc_TypeError,
12803                                "not all arguments converted during string formatting");
12804                Py_XDECREF(temp);
12805                goto onError;
12806            }
12807            Py_XDECREF(temp);
12808        } /* '%' */
12809    } /* until end */
12810    if (argidx < arglen && !dict) {
12811        PyErr_SetString(PyExc_TypeError,
12812                        "not all arguments converted during string formatting");
12813        goto onError;
12814    }
12815
12816
12817    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12818        if (*res > max)
12819            max = *res;
12820    result = PyUnicode_New(reslen - rescnt, max);
12821    if (!result)
12822        goto onError;
12823    kind = PyUnicode_KIND(result);
12824    for (res = res0; res < res0+reslen-rescnt; res++)
12825        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12826    PyMem_Free(res0);
12827    if (args_owned) {
12828        Py_DECREF(args);
12829    }
12830    Py_DECREF(uformat);
12831    return (PyObject *)result;
12832
12833  onError:
12834    PyMem_Free(res0);
12835    Py_DECREF(uformat);
12836    if (args_owned) {
12837        Py_DECREF(args);
12838    }
12839    return NULL;
12840}
12841
12842static PyObject *
12843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12844
12845static PyObject *
12846unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12847{
12848    PyObject *x = NULL;
12849    static char *kwlist[] = {"object", "encoding", "errors", 0};
12850    char *encoding = NULL;
12851    char *errors = NULL;
12852
12853    if (type != &PyUnicode_Type)
12854        return unicode_subtype_new(type, args, kwds);
12855    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
12856                                     kwlist, &x, &encoding, &errors))
12857        return NULL;
12858    if (x == NULL)
12859        return (PyObject *)PyUnicode_New(0, 0);
12860    if (encoding == NULL && errors == NULL)
12861        return PyObject_Str(x);
12862    else
12863        return PyUnicode_FromEncodedObject(x, encoding, errors);
12864}
12865
12866static PyObject *
12867unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12868{
12869    PyUnicodeObject *unicode, *self;
12870    Py_ssize_t length, char_size;
12871    int share_wstr, share_utf8;
12872    unsigned int kind;
12873    void *data;
12874
12875    assert(PyType_IsSubtype(type, &PyUnicode_Type));
12876
12877    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
12878    if (unicode == NULL)
12879        return NULL;
12880    assert(_PyUnicode_CHECK(unicode));
12881    if (_PyUnicode_READY_REPLACE(&unicode))
12882        return NULL;
12883
12884    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
12885    if (self == NULL) {
12886        Py_DECREF(unicode);
12887        return NULL;
12888    }
12889    kind = PyUnicode_KIND(unicode);
12890    length = PyUnicode_GET_LENGTH(unicode);
12891
12892    _PyUnicode_LENGTH(self) = length;
12893    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
12894    _PyUnicode_STATE(self).interned = 0;
12895    _PyUnicode_STATE(self).kind = kind;
12896    _PyUnicode_STATE(self).compact = 0;
12897    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
12898    _PyUnicode_STATE(self).ready = 1;
12899    _PyUnicode_WSTR(self) = NULL;
12900    _PyUnicode_UTF8_LENGTH(self) = 0;
12901    _PyUnicode_UTF8(self) = NULL;
12902    _PyUnicode_WSTR_LENGTH(self) = 0;
12903    _PyUnicode_DATA_ANY(self) = NULL;
12904
12905    share_utf8 = 0;
12906    share_wstr = 0;
12907    if (kind == PyUnicode_1BYTE_KIND) {
12908        char_size = 1;
12909        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
12910            share_utf8 = 1;
12911    }
12912    else if (kind == PyUnicode_2BYTE_KIND) {
12913        char_size = 2;
12914        if (sizeof(wchar_t) == 2)
12915            share_wstr = 1;
12916    }
12917    else {
12918        assert(kind == PyUnicode_4BYTE_KIND);
12919        char_size = 4;
12920        if (sizeof(wchar_t) == 4)
12921            share_wstr = 1;
12922    }
12923
12924    /* Ensure we won't overflow the length. */
12925    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
12926        PyErr_NoMemory();
12927        goto onError;
12928    }
12929    data = PyObject_MALLOC((length + 1) * char_size);
12930    if (data == NULL) {
12931        PyErr_NoMemory();
12932        goto onError;
12933    }
12934
12935    _PyUnicode_DATA_ANY(self) = data;
12936    if (share_utf8) {
12937        _PyUnicode_UTF8_LENGTH(self) = length;
12938        _PyUnicode_UTF8(self) = data;
12939    }
12940    if (share_wstr) {
12941        _PyUnicode_WSTR_LENGTH(self) = length;
12942        _PyUnicode_WSTR(self) = (wchar_t *)data;
12943    }
12944
12945    Py_MEMCPY(data, PyUnicode_DATA(unicode),
12946              PyUnicode_KIND_SIZE(kind, length + 1));
12947    Py_DECREF(unicode);
12948    return (PyObject *)self;
12949
12950onError:
12951    Py_DECREF(unicode);
12952    Py_DECREF(self);
12953    return NULL;
12954}
12955
/* Docstring of the str type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
12962
12963static PyObject *unicode_iter(PyObject *seq);
12964
/* Type object for str.  Instances are created through unicode_new() and
   released through unicode_dealloc(); the type can be subclassed
   (Py_TPFLAGS_BASETYPE). */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13008
13009/* Initialize the Unicode implementation */
13010
13011void _PyUnicode_Init(void)
13012{
13013    int i;
13014
13015    /* XXX - move this array to unicodectype.c ? */
13016    Py_UCS2 linebreak[] = {
13017        0x000A, /* LINE FEED */
13018        0x000D, /* CARRIAGE RETURN */
13019        0x001C, /* FILE SEPARATOR */
13020        0x001D, /* GROUP SEPARATOR */
13021        0x001E, /* RECORD SEPARATOR */
13022        0x0085, /* NEXT LINE */
13023        0x2028, /* LINE SEPARATOR */
13024        0x2029, /* PARAGRAPH SEPARATOR */
13025    };
13026
13027    /* Init the implementation */
13028    unicode_empty = PyUnicode_New(0, 0);
13029    if (!unicode_empty)
13030        Py_FatalError("Can't create empty string");
13031
13032    for (i = 0; i < 256; i++)
13033        unicode_latin1[i] = NULL;
13034    if (PyType_Ready(&PyUnicode_Type) < 0)
13035        Py_FatalError("Can't initialize 'unicode'");
13036
13037    /* initialize the linebreak bloom filter */
13038    bloom_linebreak = make_bloom_mask(
13039        PyUnicode_2BYTE_KIND, linebreak,
13040        Py_ARRAY_LENGTH(linebreak));
13041
13042    PyType_Ready(&EncodingMapType);
13043}
13044
13045/* Finalize the Unicode implementation */
13046
/* Nothing is released here: the function unconditionally reports that
   zero objects were freed. */
int
PyUnicode_ClearFreeList(void)
{
    const int freed = 0;

    return freed;
}
13052
13053void
13054_PyUnicode_Fini(void)
13055{
13056    int i;
13057
13058    Py_XDECREF(unicode_empty);
13059    unicode_empty = NULL;
13060
13061    for (i = 0; i < 256; i++) {
13062        if (unicode_latin1[i]) {
13063            Py_DECREF(unicode_latin1[i]);
13064            unicode_latin1[i] = NULL;
13065        }
13066    }
13067    (void)PyUnicode_ClearFreeList();
13068}
13069
13070void
13071PyUnicode_InternInPlace(PyObject **p)
13072{
13073    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13074    PyObject *t;
13075#ifdef Py_DEBUG
13076    assert(s != NULL);
13077    assert(_PyUnicode_CHECK(s));
13078#else
13079    if (s == NULL || !PyUnicode_Check(s))
13080        return;
13081#endif
13082    /* If it's a subclass, we don't really know what putting
13083       it in the interned dict might do. */
13084    if (!PyUnicode_CheckExact(s))
13085        return;
13086    if (PyUnicode_CHECK_INTERNED(s))
13087        return;
13088    if (_PyUnicode_READY_REPLACE(p)) {
13089        assert(0 && "PyUnicode_READY fail in PyUnicode_InternInPlace");
13090        return;
13091    }
13092    s = (PyUnicodeObject *)(*p);
13093    if (interned == NULL) {
13094        interned = PyDict_New();
13095        if (interned == NULL) {
13096            PyErr_Clear(); /* Don't leave an exception */
13097            return;
13098        }
13099    }
13100    /* It might be that the GetItem call fails even
13101       though the key is present in the dictionary,
13102       namely when this happens during a stack overflow. */
13103    Py_ALLOW_RECURSION
13104        t = PyDict_GetItem(interned, (PyObject *)s);
13105    Py_END_ALLOW_RECURSION
13106
13107        if (t) {
13108            Py_INCREF(t);
13109            Py_DECREF(*p);
13110            *p = t;
13111            return;
13112        }
13113
13114    PyThreadState_GET()->recursion_critical = 1;
13115    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13116        PyErr_Clear();
13117        PyThreadState_GET()->recursion_critical = 0;
13118        return;
13119    }
13120    PyThreadState_GET()->recursion_critical = 0;
13121    /* The two references in interned are not counted by refcnt.
13122       The deallocator will take care of this */
13123    Py_REFCNT(s) -= 2;
13124    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13125}
13126
13127void
13128PyUnicode_InternImmortal(PyObject **p)
13129{
13130    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13131
13132    PyUnicode_InternInPlace(p);
13133    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13134        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13135        Py_INCREF(*p);
13136    }
13137}
13138
13139PyObject *
13140PyUnicode_InternFromString(const char *cp)
13141{
13142    PyObject *s = PyUnicode_FromString(cp);
13143    if (s == NULL)
13144        return NULL;
13145    PyUnicode_InternInPlace(&s);
13146    return s;
13147}
13148
13149void
13150_Py_ReleaseInternedUnicodeStrings(void)
13151{
13152    PyObject *keys;
13153    PyUnicodeObject *s;
13154    Py_ssize_t i, n;
13155    Py_ssize_t immortal_size = 0, mortal_size = 0;
13156
13157    if (interned == NULL || !PyDict_Check(interned))
13158        return;
13159    keys = PyDict_Keys(interned);
13160    if (keys == NULL || !PyList_Check(keys)) {
13161        PyErr_Clear();
13162        return;
13163    }
13164
13165    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13166       detector, interned unicode strings are not forcibly deallocated;
13167       rather, we give them their stolen references back, and then clear
13168       and DECREF the interned dict. */
13169
13170    n = PyList_GET_SIZE(keys);
13171    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13172            n);
13173    for (i = 0; i < n; i++) {
13174        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13175        if (PyUnicode_READY(s) == -1)
13176            fprintf(stderr, "could not ready string\n");
13177        switch (PyUnicode_CHECK_INTERNED(s)) {
13178        case SSTATE_NOT_INTERNED:
13179            /* XXX Shouldn't happen */
13180            break;
13181        case SSTATE_INTERNED_IMMORTAL:
13182            Py_REFCNT(s) += 1;
13183            immortal_size += PyUnicode_GET_LENGTH(s);
13184            break;
13185        case SSTATE_INTERNED_MORTAL:
13186            Py_REFCNT(s) += 2;
13187            mortal_size += PyUnicode_GET_LENGTH(s);
13188            break;
13189        default:
13190            Py_FatalError("Inconsistent interned string state.");
13191        }
13192        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13193    }
13194    fprintf(stderr, "total size of all interned strings: "
13195            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13196            "mortal/immortal\n", mortal_size, immortal_size);
13197    Py_DECREF(keys);
13198    PyDict_Clear(interned);
13199    Py_DECREF(interned);
13200    interned = NULL;
13201}
13202
13203
13204/********************* Unicode Iterator **************************/
13205
/* State of an iterator over the characters of a str object. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* Index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13211
13212static void
13213unicodeiter_dealloc(unicodeiterobject *it)
13214{
13215    _PyObject_GC_UNTRACK(it);
13216    Py_XDECREF(it->it_seq);
13217    PyObject_GC_Del(it);
13218}
13219
13220static int
13221unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13222{
13223    Py_VISIT(it->it_seq);
13224    return 0;
13225}
13226
13227static PyObject *
13228unicodeiter_next(unicodeiterobject *it)
13229{
13230    PyUnicodeObject *seq;
13231    PyObject *item;
13232
13233    assert(it != NULL);
13234    seq = it->it_seq;
13235    if (seq == NULL)
13236        return NULL;
13237    assert(_PyUnicode_CHECK(seq));
13238
13239    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13240        int kind = PyUnicode_KIND(seq);
13241        void *data = PyUnicode_DATA(seq);
13242        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13243        item = PyUnicode_FromOrdinal(chr);
13244        if (item != NULL)
13245            ++it->it_index;
13246        return item;
13247    }
13248
13249    Py_DECREF(seq);
13250    it->it_seq = NULL;
13251    return NULL;
13252}
13253
13254static PyObject *
13255unicodeiter_len(unicodeiterobject *it)
13256{
13257    Py_ssize_t len = 0;
13258    if (it->it_seq)
13259        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13260    return PyLong_FromSsize_t(len);
13261}
13262
/* Docstring for the __length_hint__ method below. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; referenced from the tp_methods
   slot of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13270
/* Type object of the iterator returned by unicode_iter().  Instances take
   part in cyclic garbage collection (Py_TPFLAGS_HAVE_GC). */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
13303
13304static PyObject *
13305unicode_iter(PyObject *seq)
13306{
13307    unicodeiterobject *it;
13308
13309    if (!PyUnicode_Check(seq)) {
13310        PyErr_BadInternalCall();
13311        return NULL;
13312    }
13313    if (PyUnicode_READY(seq) == -1)
13314        return NULL;
13315    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13316    if (it == NULL)
13317        return NULL;
13318    it->it_index = 0;
13319    Py_INCREF(seq);
13320    it->it_seq = (PyUnicodeObject *)seq;
13321    _PyObject_GC_TRACK(it);
13322    return (PyObject *)it;
13323}
13324
/* Include uniops.h twice with different bindings of the UNIOP() name-pasting
   macro and the UNIOP_t element type: first for Py_UNICODE (identifiers
   prefixed Py_UNICODE_), then for Py_UCS4 (identifiers prefixed Py_UCS4_).
   Both macros are undefined again after each inclusion.
   NOTE(review): uniops.h is not visible from here; presumably it defines the
   per-type helper operations named through UNIOP() -- confirm against that
   header. */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
13335
13336Py_UNICODE*
13337PyUnicode_AsUnicodeCopy(PyObject *object)
13338{
13339    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13340    Py_UNICODE *copy;
13341    Py_ssize_t size;
13342
13343    if (!PyUnicode_Check(unicode)) {
13344        PyErr_BadArgument();
13345        return NULL;
13346    }
13347    /* Ensure we won't overflow the size. */
13348    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13349        PyErr_NoMemory();
13350        return NULL;
13351    }
13352    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13353    size *= sizeof(Py_UNICODE);
13354    copy = PyMem_Malloc(size);
13355    if (copy == NULL) {
13356        PyErr_NoMemory();
13357        return NULL;
13358    }
13359    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13360    return copy;
13361}
13362
13363/* A _string module, to export formatter_parser and formatter_field_name_split
13364   to the string.Formatter class implemented in Python. */
13365
/* Functions exported by the _string module.  Both are registered with
   METH_O, i.e. they are called with a single object argument. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13373
/* Module definition for _string, passed to PyModule_Create() in
   PyInit__string(). */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* presumably m_size -- confirm */
    _string_methods,                    /* method table */
    NULL,                               /* remaining optional fields unused */
    NULL,
    NULL,
    NULL
};
13385
13386PyMODINIT_FUNC
13387PyInit__string(void)
13388{
13389    return PyModule_Create(&_string_module);
13390}
13391
13392
13393#ifdef __cplusplus
13394}
13395#endif
13396