unicodeobject.c revision 7597addbd4f56e6a3a8a595db404824c5f825c3a
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Upper bound on the number of entries kept on the Unicode object
   free list. */

#define PyUnicode_MAXFREELIST       1024

/* Limit for the Unicode object free list stay alive optimization.

   The implementation will keep allocated Unicode memory intact for
   all objects on the free list having a size less than this
   limit. This reduces malloc() overhead for small Unicode objects.

   At worst this will result in PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Endianness switches: BYTEORDER_IS_BIG_ENDIAN is defined when
   WORDS_BIGENDIAN is, otherwise BYTEORDER_IS_LITTLE_ENDIAN is defined
   (i.e. little endian is the default). */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
80/* --- Globals ------------------------------------------------------------
81
82   The globals are initialized by the _PyUnicode_Init() API and should
83   not be used before calling that API.
84
85*/
86
87
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Type check used inside the assertions of the macros below.  In debug
   builds it runs the full _PyUnicode_CheckConsistency() invariant check;
   otherwise it is a plain PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* Macros starting with an underscore are raw field accessors: they cast
   the object and touch the field directly, so they can be used as
   lvalues.  The variants without the underscore assert that the object
   is a ready unicode object before reading. */

/* Raw access to the utf8 pointer field. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 buffer of a ready string: for a compact ASCII string this is the
   memory immediately following the PyASCIIObject header, otherwise the
   utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: for a compact ASCII string this is the
   character length, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw access to the wstr pointer, wstr_length, length, state and hash
   fields. */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* Kind and length readers that only assert the type check; unlike the
   PyUnicode_UTF8*() macros above they do not assert readiness. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)

/* Local override of PyUnicode_READY: evaluates to 0 if the string is
   already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY() but takes a pointer to the object pointer
   and delegates to _PyUnicode_ReadyReplace() when the string is not
   ready. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*p_obj)),                  \
     (PyUnicode_IS_READY(*p_obj) ?                      \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the utf8 pointer of op is the canonical data buffer itself,
   i.e. no separate UTF-8 block is allocated.  Asserts that op is not a
   compact ASCII string. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the canonical data buffer itself.
   The macro argument is used throughout; it must not rely on the caller
   having a variable that happens to be called "unicode". */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a separately allocated UTF-8 memory
   block: it is not a compact ASCII string, its utf8 pointer is set, and
   that pointer is not the canonical data buffer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a separately allocated wstr memory
   block: its wstr pointer is set and either the string is not ready yet
   or wstr is not the canonical data buffer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))

/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each source element in [begin, end) is cast to to_type and stored in
   consecutive slots of the destination; no terminator is written. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_; to_type *to_;           \
        for (iter_ = (begin), to_ = (to_type *)(to);    \
             iter_ < (end);                             \
             ++iter_, ++to_) {                          \
            *to_ = (to_type)*iter_;                     \
        }                                               \
    } while (0)

/* The Unicode string has been modified: reset the cached hash to -1. */
#define _PyUnicode_DIRTY(op) do { _PyUnicode_HASH(op) = -1; } while (0)
186
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned;

/* The empty Unicode object is shared to improve performance. */
static PyObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well; the array has one slot per Latin-1 code point. */
static PyObject *unicode_latin1[256];
203
/* Fast detection of the most frequent whitespace characters.

   Lookup table with 128 entries (8 per row); an entry of 1 marks the
   character with that code as whitespace.  The comments inside the
   initializer name the characters whose entries are set. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     0x0009: CHARACTER TABULATION */
/*     0x000A: LINE FEED */
/*     0x000B: LINE TABULATION */
/*     0x000C: FORM FEED */
/*     0x000D: CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     0x001C: FILE SEPARATOR */
/*     0x001D: GROUP SEPARATOR */
/*     0x001E: RECORD SEPARATOR */
/*     0x001F: UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     0x0020: SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x0040 - 0x007F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
234
/* Forward declarations of static helpers that are used before their
   definitions appear in this file. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);

/* Encoder error-handler dispatch; NOTE(review): the exact contract of the
   parameters is defined with the function body, which is not part of
   this section. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* Raises an encode error for the range [startpos, endpos) -- presumably a
   UnicodeEncodeError; confirm against the definition. */
static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       const Py_UNICODE *unicode, Py_ssize_t size,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
251
/* Fast detection of line break characters in the ASCII range.

   Lookup table with 128 entries (8 per row); an entry of 1 marks the
   character with that code as a line break.  The table is only ever
   read (see BLOOM_LINEBREAK), so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
279
280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281   This function is kept for backward compatibility with the old API. */
282Py_UNICODE
283PyUnicode_GetMax(void)
284{
285#ifdef Py_UNICODE_WIDE
286    return 0x10FFFF;
287#else
288    /* This is actually an illegal character, so it should
289       not be passed to unichr. */
290    return 0xFFFF;
291#endif
292}
293
#ifdef Py_DEBUG
/* Debug-build invariant checker for a unicode object.

   Asserts that the state flags (kind, compact, ascii, ready) agree with
   each other and with the data, utf8 and wstr pointers and their cached
   lengths.  Always returns 1 so that it can be used inside assert(). */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;
    /* the kind whose character width equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
    const unsigned int wchar_kind = PyUnicode_2BYTE_KIND;
#else
    const unsigned int wchar_kind = PyUnicode_4BYTE_KIND;
#endif

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header exists, so none of
       the compact-only fields may be inspected. */
    if (ascii->state.compact == 1 && ascii->state.ascii == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* Compact non-ASCII: characters follow the compact header. */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert(compact->utf8 != data);
    }
    else {
        data = ((PyUnicodeObject *)op)->data.any;
        if (kind != PyUnicode_WCHAR_KIND) {
            /* Legacy string that has been readied. */
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert(compact->utf8 == data);
                assert(compact->utf8_length == ascii->length);
            }
            else {
                assert(compact->utf8 != data);
            }
        }
        else {
            /* Legacy string that is not ready: only wstr is set. */
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
        }
    }

    /* wstr aliases the data buffer exactly when the kind has the width
       of wchar_t. */
    if (kind != PyUnicode_WCHAR_KIND) {
        if (kind == wchar_kind) {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* A missing cache must have a zero cached length. */
    assert(compact->utf8 != NULL || compact->utf8_length == 0);
    assert(ascii->wstr != NULL || compact->wstr_length == 0);
    return 1;
}
#else
/* Non-debug builds: no checking is done, the result is always 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
380/* --- Bloom Filters ----------------------------------------------------- */
381
382/* stuff to implement simple "bloom filters" for Unicode characters.
383   to keep things simple, we use a single bitmask, using the least 5
384   bits from each unicode characters as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
403#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
404
405#define BLOOM_LINEBREAK(ch)                                             \
406    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
407     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
408
409Py_LOCAL_INLINE(BLOOM_MASK)
410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
411{
412    /* calculate simple bloom-style bitmask for a given unicode string */
413
414    BLOOM_MASK mask;
415    Py_ssize_t i;
416
417    mask = 0;
418    for (i = 0; i < len; i++)
419        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
420
421    return mask;
422}
423
424#define BLOOM_MEMBER(mask, chr, str) \
425    (BLOOM(mask, chr) \
426     && (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
427
428/* --- Unicode Object ----------------------------------------------------- */
429
430static PyObject *
431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434                                 Py_ssize_t size, Py_UCS4 ch,
435                                 int direction)
436{
437    /* like wcschr, but doesn't stop at NULL characters */
438    Py_ssize_t i;
439    if (direction == 1) {
440        for(i = 0; i < size; i++)
441            if (PyUnicode_READ(kind, s, i) == ch)
442                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443    }
444    else {
445        for(i = size-1; i >= 0; i--)
446            if (PyUnicode_READ(kind, s, i) == ch)
447                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448    }
449    return NULL;
450}
451
452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455    Py_ssize_t char_size;
456    Py_ssize_t struct_size;
457    Py_ssize_t new_size;
458    int share_wstr;
459
460    assert(PyUnicode_IS_READY(unicode));
461    char_size = PyUnicode_CHARACTER_SIZE(unicode);
462    if (PyUnicode_IS_COMPACT_ASCII(unicode))
463        struct_size = sizeof(PyASCIIObject);
464    else
465        struct_size = sizeof(PyCompactUnicodeObject);
466    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
467
468    _Py_DEC_REFTOTAL;
469    _Py_ForgetReference(unicode);
470
471    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472        PyErr_NoMemory();
473        return NULL;
474    }
475    new_size = (struct_size + (length + 1) * char_size);
476
477    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478    if (unicode == NULL) {
479        PyObject_Del(unicode);
480        PyErr_NoMemory();
481        return NULL;
482    }
483    _Py_NewReference(unicode);
484    _PyUnicode_LENGTH(unicode) = length;
485    if (share_wstr) {
486        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
487        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488            _PyUnicode_WSTR_LENGTH(unicode) = length;
489    }
490    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491                    length, 0);
492    return unicode;
493}
494
/* Resize a non-compact unicode object in place to length characters.

   The object must not be compact and must have a reference count of 1
   (both are asserted).  The cached hash is invalidated first.

   Returns 0 on success and -1 with MemoryError set on failure. */
static int
resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    _PyUnicode_DIRTY(unicode);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        Py_ssize_t new_size;
        int share_wstr, share_utf8;
        void *data;

        data = _PyUnicode_DATA_ANY(unicode);
        assert(data != NULL);
        char_size = PyUnicode_CHARACTER_SIZE(unicode);
        /* Record which representations alias the data buffer before it
           is reallocated, so they can be re-pointed afterwards. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
        /* A separately allocated UTF-8 block is released and the utf8
           pointer and length are cleared. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* check for integer overflow of (length + 1) * char_size */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* The object keeps its old buffer pointer until the reallocation
           has succeeded; only the local variable is overwritten. */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
        /* Done unless there is a separate wstr buffer, which is resized
           below. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            _PyUnicode_CheckConsistency(unicode);
            return 0;
        }
    }
    /* Reached for strings that are not ready, and for ready strings with
       a wstr buffer distinct from the data buffer. */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    /* Reallocate through a local so the object's wstr pointer stays
       valid if the reallocation fails. */
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    _PyUnicode_CheckConsistency(unicode);
    return 0;
}
568
569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572    Py_ssize_t copy_length;
573    if (PyUnicode_IS_COMPACT(unicode)) {
574        PyObject *copy;
575        assert(PyUnicode_IS_READY(unicode));
576
577        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578        if (copy == NULL)
579            return NULL;
580
581        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582        if (PyUnicode_CopyCharacters(copy, 0,
583                                     unicode, 0,
584                                     copy_length) < 0)
585        {
586            Py_DECREF(copy);
587            return NULL;
588        }
589        return copy;
590    }
591    else {
592        PyUnicodeObject *w;
593        assert(_PyUnicode_WSTR(unicode) != NULL);
594        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
595        w = _PyUnicode_New(length);
596        if (w == NULL)
597            return NULL;
598        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599        copy_length = Py_MIN(copy_length, length);
600        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601                        copy_length);
602        return (PyObject*)w;
603    }
604}
605
606/* We allocate one more byte to make sure the string is
607   Ux0000 terminated; some code (e.g. new_identifier)
608   relies on that.
609
610   XXX This allocator could further be enhanced by assuring that the
611   free list never reduces its size below 1.
612
613*/
614
615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
621{
622    register PyUnicodeObject *unicode;
623    size_t new_size;
624
625    /* Optimization for empty strings */
626    if (length == 0 && unicode_empty != NULL) {
627        Py_INCREF(unicode_empty);
628        return (PyUnicodeObject*)unicode_empty;
629    }
630
631    /* Ensure we won't overflow the size. */
632    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633        return (PyUnicodeObject *)PyErr_NoMemory();
634    }
635    if (length < 0) {
636        PyErr_SetString(PyExc_SystemError,
637                        "Negative size passed to _PyUnicode_New");
638        return NULL;
639    }
640
641#ifdef Py_DEBUG
642    ++unicode_old_new_calls;
643#endif
644
645    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646    if (unicode == NULL)
647        return NULL;
648    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650    if (!_PyUnicode_WSTR(unicode)) {
651        PyErr_NoMemory();
652        goto onError;
653    }
654
655    /* Initialize the first element to guard against cases where
656     * the caller fails before initializing str -- unicode_resize()
657     * reads str[0], and the Keep-Alive optimization can keep memory
658     * allocated for str alive across a call to unicode_dealloc(unicode).
659     * We don't want unicode_resize to read uninitialized memory in
660     * that case.
661     */
662    _PyUnicode_WSTR(unicode)[0] = 0;
663    _PyUnicode_WSTR(unicode)[length] = 0;
664    _PyUnicode_WSTR_LENGTH(unicode) = length;
665    _PyUnicode_HASH(unicode) = -1;
666    _PyUnicode_STATE(unicode).interned = 0;
667    _PyUnicode_STATE(unicode).kind = 0;
668    _PyUnicode_STATE(unicode).compact = 0;
669    _PyUnicode_STATE(unicode).ready = 0;
670    _PyUnicode_STATE(unicode).ascii = 0;
671    _PyUnicode_DATA_ANY(unicode) = NULL;
672    _PyUnicode_LENGTH(unicode) = 0;
673    _PyUnicode_UTF8(unicode) = NULL;
674    _PyUnicode_UTF8_LENGTH(unicode) = 0;
675    return unicode;
676
677  onError:
678    /* XXX UNREF/NEWREF interface should be more symmetrical */
679    _Py_DEC_REFTOTAL;
680    _Py_ForgetReference((PyObject *)unicode);
681    PyObject_Del(unicode);
682    return NULL;
683}
684
685static const char*
686unicode_kind_name(PyObject *unicode)
687{
688    /* don't check consistency: unicode_kind_name() is called from
689       _PyUnicode_Dump() */
690    if (!PyUnicode_IS_COMPACT(unicode))
691    {
692        if (!PyUnicode_IS_READY(unicode))
693            return "wstr";
694        switch(PyUnicode_KIND(unicode))
695        {
696        case PyUnicode_1BYTE_KIND:
697            if (PyUnicode_IS_ASCII(unicode))
698                return "legacy ascii";
699            else
700                return "legacy latin1";
701        case PyUnicode_2BYTE_KIND:
702            return "legacy UCS2";
703        case PyUnicode_4BYTE_KIND:
704            return "legacy UCS4";
705        default:
706            return "<legacy invalid kind>";
707        }
708    }
709    assert(PyUnicode_IS_READY(unicode));
710    switch(PyUnicode_KIND(unicode))
711    {
712    case PyUnicode_1BYTE_KIND:
713        if (PyUnicode_IS_ASCII(unicode))
714            return "ascii";
715        else
716            return "latin1";
717    case PyUnicode_2BYTE_KIND:
718        return "UCS2";
719    case PyUnicode_4BYTE_KIND:
720        return "UCS4";
721    default:
722        return "<invalid compact kind>";
723    }
724}
725
#ifdef Py_DEBUG
int unicode_new_new_calls = 0;

/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode){
    return PyUnicode_UTF8(unicode);
}

void *_PyUnicode_compact_data(void *unicode) {
    return _PyUnicode_COMPACT_DATA(unicode);
}

/* Print the addresses relevant to locating the character data of a
   unicode object on stdout and return PyUnicode_DATA(unicode). */
void *_PyUnicode_data(void *unicode){
    printf("obj %p\n", unicode);
    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
    return PyUnicode_DATA(unicode);
}

/* Print a one-line summary of a unicode object on stdout: kind name,
   length, the wstr/utf8 pointers with their cached lengths (marked
   "shared" when they alias the data buffer) and the data pointer.

   The lengths are signed (Py_ssize_t) and therefore printed with %zd;
   pointers are converted to void * as required by %p. */
void
_PyUnicode_Dump(PyObject *op)
{
    PyASCIIObject *ascii = (PyASCIIObject *)op;
    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
    void *data;
    printf("%s: len=%zd, ",unicode_kind_name(op), ascii->length);
    if (ascii->state.compact)
        data = (compact + 1);
    else
        data = unicode->data.any;
    if (ascii->wstr == data)
        printf("shared ");
    printf("wstr=%p", (void *)ascii->wstr);
    /* the wstr_length/utf8 fields do not exist for compact ASCII */
    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
        printf(" (%zd), ", compact->wstr_length);
        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
            printf("shared ");
        printf("utf8=%p (%zd)", (void *)compact->utf8, compact->utf8_length);
    }
    printf(", data=%p\n", data);
}
#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775    PyObject *obj;
776    PyCompactUnicodeObject *unicode;
777    void *data;
778    int kind_state;
779    int is_sharing, is_ascii;
780    Py_ssize_t char_size;
781    Py_ssize_t struct_size;
782
783    /* Optimization for empty strings */
784    if (size == 0 && unicode_empty != NULL) {
785        Py_INCREF(unicode_empty);
786        return unicode_empty;
787    }
788
789#ifdef Py_DEBUG
790    ++unicode_new_new_calls;
791#endif
792
793    is_ascii = 0;
794    is_sharing = 0;
795    struct_size = sizeof(PyCompactUnicodeObject);
796    if (maxchar < 128) {
797        kind_state = PyUnicode_1BYTE_KIND;
798        char_size = 1;
799        is_ascii = 1;
800        struct_size = sizeof(PyASCIIObject);
801    }
802    else if (maxchar < 256) {
803        kind_state = PyUnicode_1BYTE_KIND;
804        char_size = 1;
805    }
806    else if (maxchar < 65536) {
807        kind_state = PyUnicode_2BYTE_KIND;
808        char_size = 2;
809        if (sizeof(wchar_t) == 2)
810            is_sharing = 1;
811    }
812    else {
813        kind_state = PyUnicode_4BYTE_KIND;
814        char_size = 4;
815        if (sizeof(wchar_t) == 4)
816            is_sharing = 1;
817    }
818
819    /* Ensure we won't overflow the size. */
820    if (size < 0) {
821        PyErr_SetString(PyExc_SystemError,
822                        "Negative size passed to PyUnicode_New");
823        return NULL;
824    }
825    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826        return PyErr_NoMemory();
827
828    /* Duplicated allocation code from _PyObject_New() instead of a call to
829     * PyObject_New() so we are able to allocate space for the object and
830     * it's data buffer.
831     */
832    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833    if (obj == NULL)
834        return PyErr_NoMemory();
835    obj = PyObject_INIT(obj, &PyUnicode_Type);
836    if (obj == NULL)
837        return NULL;
838
839    unicode = (PyCompactUnicodeObject *)obj;
840    if (is_ascii)
841        data = ((PyASCIIObject*)obj) + 1;
842    else
843        data = unicode + 1;
844    _PyUnicode_LENGTH(unicode) = size;
845    _PyUnicode_HASH(unicode) = -1;
846    _PyUnicode_STATE(unicode).interned = 0;
847    _PyUnicode_STATE(unicode).kind = kind_state;
848    _PyUnicode_STATE(unicode).compact = 1;
849    _PyUnicode_STATE(unicode).ready = 1;
850    _PyUnicode_STATE(unicode).ascii = is_ascii;
851    if (is_ascii) {
852        ((char*)data)[size] = 0;
853        _PyUnicode_WSTR(unicode) = NULL;
854    }
855    else if (kind_state == PyUnicode_1BYTE_KIND) {
856        ((char*)data)[size] = 0;
857        _PyUnicode_WSTR(unicode) = NULL;
858        _PyUnicode_WSTR_LENGTH(unicode) = 0;
859        unicode->utf8 = NULL;
860        unicode->utf8_length = 0;
861        }
862    else {
863        unicode->utf8 = NULL;
864        unicode->utf8_length = 0;
865        if (kind_state == PyUnicode_2BYTE_KIND)
866            ((Py_UCS2*)data)[size] = 0;
867        else /* kind_state == PyUnicode_4BYTE_KIND */
868            ((Py_UCS4*)data)[size] = 0;
869        if (is_sharing) {
870            _PyUnicode_WSTR_LENGTH(unicode) = size;
871            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872        }
873        else {
874            _PyUnicode_WSTR_LENGTH(unicode) = 0;
875            _PyUnicode_WSTR(unicode) = NULL;
876        }
877    }
878    return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper for builds with a 16-bit wchar_t: convert the wchar_t range
   [begin, end) to UCS4 in the 4-byte data buffer of unicode.  A high
   surrogate directly followed by a low surrogate is combined into one
   code point; every other unit (including a lone surrogate) is copied
   unchanged.  The remaining conversions are implemented as macros for
   efficiency.

   This function assumes that unicode can hold one more code point than
   wstr characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *src = begin;
    Py_UCS4 *dst;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    dst = PyUnicode_4BYTE_DATA(unicode);

    while (src < end) {
        int is_pair;

        assert(dst < (PyUnicode_4BYTE_DATA(unicode) +
                      _PyUnicode_GET_LENGTH(unicode)));

        /* src[1] is only read when it lies inside the input range */
        is_pair = (src[0] >= 0xD800 && src[0] <= 0xDBFF
                   && (src + 1) < end
                   && src[1] >= 0xDC00 && src[1] <= 0xDFFF);
        if (!is_pair) {
            *dst++ = *src++;
            continue;
        }
        *dst++ = (((src[0] & 0x3FF)<<10) | (src[1] & 0x3FF)) + 0x10000;
        src += 2;
    }
    /* the output must have been filled exactly */
    assert(dst == (PyUnicode_4BYTE_DATA(unicode) +
                   _PyUnicode_GET_LENGTH(unicode)));

}
#endif
919
920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
923    assert(_PyUnicode_CHECK(unicode));
924    if (Py_REFCNT(unicode) != 1) {
925        PyErr_SetString(PyExc_SystemError,
926                        "Cannot modify a string having more than 1 reference");
927        return -1;
928    }
929    _PyUnicode_DIRTY(unicode);
930    return 0;
931}
932
933Py_ssize_t
934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935                         PyObject *from, Py_ssize_t from_start,
936                         Py_ssize_t how_many)
937{
938    unsigned int from_kind, to_kind;
939    void *from_data, *to_data;
940
941    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942        PyErr_BadInternalCall();
943        return -1;
944    }
945
946    if (PyUnicode_READY(from))
947        return -1;
948    if (PyUnicode_READY(to))
949        return -1;
950
951    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
952    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
953        PyErr_Format(PyExc_SystemError,
954                     "Cannot write %zi characters at %zi "
955                     "in a string of %zi characters",
956                     how_many, to_start, PyUnicode_GET_LENGTH(to));
957        return -1;
958    }
959    if (how_many == 0)
960        return 0;
961
962    if (_PyUnicode_Dirty(to))
963        return -1;
964
965    from_kind = PyUnicode_KIND(from);
966    from_data = PyUnicode_DATA(from);
967    to_kind = PyUnicode_KIND(to);
968    to_data = PyUnicode_DATA(to);
969
970    if (from_kind == to_kind
971        /* deny latin1 => ascii */
972        && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
973    {
974        Py_MEMCPY((char*)to_data
975                      + PyUnicode_KIND_SIZE(to_kind, to_start),
976                  (char*)from_data
977                      + PyUnicode_KIND_SIZE(from_kind, from_start),
978                  PyUnicode_KIND_SIZE(to_kind, how_many));
979    }
980    else if (from_kind == PyUnicode_1BYTE_KIND
981             && to_kind == PyUnicode_2BYTE_KIND)
982    {
983        _PyUnicode_CONVERT_BYTES(
984            Py_UCS1, Py_UCS2,
985            PyUnicode_1BYTE_DATA(from) + from_start,
986            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987            PyUnicode_2BYTE_DATA(to) + to_start
988            );
989    }
990    else if (from_kind == PyUnicode_1BYTE_KIND
991             && to_kind == PyUnicode_4BYTE_KIND)
992    {
993        _PyUnicode_CONVERT_BYTES(
994            Py_UCS1, Py_UCS4,
995            PyUnicode_1BYTE_DATA(from) + from_start,
996            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997            PyUnicode_4BYTE_DATA(to) + to_start
998            );
999    }
1000    else if (from_kind == PyUnicode_2BYTE_KIND
1001             && to_kind == PyUnicode_4BYTE_KIND)
1002    {
1003        _PyUnicode_CONVERT_BYTES(
1004            Py_UCS2, Py_UCS4,
1005            PyUnicode_2BYTE_DATA(from) + from_start,
1006            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007            PyUnicode_4BYTE_DATA(to) + to_start
1008            );
1009    }
1010    else {
1011        int invalid_kinds;
1012
1013        /* check if max_char(from substring) <= max_char(to) */
1014        if (from_kind > to_kind
1015                /* latin1 => ascii */
1016            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1017        {
1018            /* slow path to check for character overflow */
1019            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1020            Py_UCS4 ch, maxchar;
1021            Py_ssize_t i;
1022
1023            maxchar = 0;
1024            invalid_kinds = 0;
1025            for (i=0; i < how_many; i++) {
1026                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1027                if (ch > maxchar) {
1028                    maxchar = ch;
1029                    if (maxchar > to_maxchar) {
1030                        invalid_kinds = 1;
1031                        break;
1032                    }
1033                }
1034                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1035            }
1036        }
1037        else
1038            invalid_kinds = 1;
1039        if (invalid_kinds) {
1040            PyErr_Format(PyExc_SystemError,
1041                         "Cannot copy %s characters "
1042                         "into a string of %s characters",
1043                         unicode_kind_name(from),
1044                         unicode_kind_name(to));
1045            return -1;
1046        }
1047    }
1048    return how_many;
1049}
1050
1051/* Find the maximum code point and count the number of surrogate pairs so a
1052   correct string length can be computed before converting a string to UCS4.
1053   This function counts single surrogates as a character and not as a pair.
1054
1055   Return 0 on success, or -1 on error. */
1056static int
1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1058                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1059{
1060    const wchar_t *iter;
1061
1062    assert(num_surrogates != NULL && maxchar != NULL);
1063    *num_surrogates = 0;
1064    *maxchar = 0;
1065
1066    for (iter = begin; iter < end; ) {
1067        if (*iter > *maxchar) {
1068            *maxchar = *iter;
1069#if SIZEOF_WCHAR_T != 2
1070            if (*maxchar >= 0x10000)
1071                return 0;
1072#endif
1073        }
1074#if SIZEOF_WCHAR_T == 2
1075        if (*iter >= 0xD800 && *iter <= 0xDBFF
1076            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1077        {
1078            Py_UCS4 surrogate_val;
1079            surrogate_val = (((iter[0] & 0x3FF)<<10)
1080                             | (iter[1] & 0x3FF)) + 0x10000;
1081            ++(*num_surrogates);
1082            if (surrogate_val > *maxchar)
1083                *maxchar = surrogate_val;
1084            iter += 2;
1085        }
1086        else
1087            iter++;
1088#else
1089        iter++;
1090#endif
1091    }
1092    return 0;
1093}
1094
#ifdef Py_DEBUG
/* Debug-build-only counter: incremented once per call to unicode_ready(). */
int unicode_ready_calls = 0;
#endif
1098
/* Turn a legacy string, which only has a wchar_t buffer, into a "ready"
   string with a canonical 1-, 2- or 4-byte representation.

   p_obj   -- address of the reference to the string to convert.
   replace -- if non-zero, an empty string or a single Latin-1 character is
              swapped for the shared unicode_empty / cached Latin-1 object:
              the reference in *p_obj is released and overwritten.  This is
              only done while the string has exactly one reference; debug
              builds assert it, other builds silently disable `replace`.

   The narrowest kind able to hold the largest character reported by
   find_maxchar_surrogates() is chosen.  Depending on sizeof(wchar_t), the
   wchar_t buffer is either reused as the canonical data or converted into
   a newly allocated buffer and then freed.

   Returns 0 on success, -1 with an exception set on failure. */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only possible while nobody else references
       it: debug builds insist on it, other builds fall back to converting
       the object in place. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single Latin-1 character: hand out the cached object instead */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    /* Find out which kind is needed and how many surrogate pairs collapse */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Every character fits in one byte: narrow into a new buffer and drop
       the wchar_t buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* pure ASCII: the data buffer doubles as the UTF-8 form */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair becomes a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
1256
1257int
1258_PyUnicode_ReadyReplace(PyObject **op)
1259{
1260    return unicode_ready(op, 1);
1261}
1262
1263int
1264_PyUnicode_Ready(PyObject *op)
1265{
1266    return unicode_ready(&op, 0);
1267}
1268
1269static void
1270unicode_dealloc(register PyUnicodeObject *unicode)
1271{
1272    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1273    case SSTATE_NOT_INTERNED:
1274        break;
1275
1276    case SSTATE_INTERNED_MORTAL:
1277        /* revive dead object temporarily for DelItem */
1278        Py_REFCNT(unicode) = 3;
1279        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1280            Py_FatalError(
1281                "deletion of interned string failed");
1282        break;
1283
1284    case SSTATE_INTERNED_IMMORTAL:
1285        Py_FatalError("Immortal interned string died.");
1286
1287    default:
1288        Py_FatalError("Inconsistent interned string state.");
1289    }
1290
1291    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1292        PyObject_DEL(_PyUnicode_WSTR(unicode));
1293    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1294        PyObject_DEL(_PyUnicode_UTF8(unicode));
1295
1296    if (PyUnicode_IS_COMPACT(unicode)) {
1297        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1298    }
1299    else {
1300        if (_PyUnicode_DATA_ANY(unicode))
1301            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1302        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1303    }
1304}
1305
1306static int
1307unicode_resizable(PyObject *unicode)
1308{
1309    if (Py_REFCNT(unicode) != 1)
1310        return 0;
1311    if (PyUnicode_CHECK_INTERNED(unicode))
1312        return 0;
1313    assert(unicode != unicode_empty);
1314#ifdef Py_DEBUG
1315    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1316        && PyUnicode_GET_LENGTH(unicode) == 1)
1317    {
1318        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1319        if (ch < 256 && unicode_latin1[ch] == unicode)
1320            return 0;
1321    }
1322#endif
1323    return 1;
1324}
1325
1326static int
1327unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1328{
1329    PyObject *unicode;
1330    Py_ssize_t old_length;
1331
1332    assert(p_unicode != NULL);
1333    unicode = *p_unicode;
1334
1335    assert(unicode != NULL);
1336    assert(PyUnicode_Check(unicode));
1337    assert(0 <= length);
1338
1339    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1340        old_length = PyUnicode_WSTR_LENGTH(unicode);
1341    else
1342        old_length = PyUnicode_GET_LENGTH(unicode);
1343    if (old_length == length)
1344        return 0;
1345
1346    if (!unicode_resizable(unicode)) {
1347        PyObject *copy = resize_copy(unicode, length);
1348        if (copy == NULL)
1349            return -1;
1350        Py_DECREF(*p_unicode);
1351        *p_unicode = copy;
1352        return 0;
1353    }
1354
1355    if (PyUnicode_IS_COMPACT(unicode)) {
1356        *p_unicode = resize_compact(unicode, length);
1357        if (*p_unicode == NULL)
1358            return -1;
1359        _PyUnicode_CheckConsistency(*p_unicode);
1360        return 0;
1361    }
1362    return resize_inplace((PyUnicodeObject*)unicode, length);
1363}
1364
1365int
1366PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1367{
1368    PyObject *unicode;
1369    if (p_unicode == NULL) {
1370        PyErr_BadInternalCall();
1371        return -1;
1372    }
1373    unicode = *p_unicode;
1374    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1375        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1376    {
1377        PyErr_BadInternalCall();
1378        return -1;
1379    }
1380    return unicode_resize(p_unicode, length);
1381}
1382
1383static PyObject*
1384get_latin1_char(unsigned char ch)
1385{
1386    PyObject *unicode = unicode_latin1[ch];
1387    if (!unicode) {
1388        unicode = PyUnicode_New(1, ch);
1389        if (!unicode)
1390            return NULL;
1391        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1392        unicode_latin1[ch] = unicode;
1393    }
1394    Py_INCREF(unicode);
1395    return unicode;
1396}
1397
1398PyObject *
1399PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1400{
1401    PyUnicodeObject *unicode;
1402    Py_UCS4 maxchar = 0;
1403    Py_ssize_t num_surrogates;
1404
1405    if (u == NULL)
1406        return (PyObject*)_PyUnicode_New(size);
1407
1408    /* If the Unicode data is known at construction time, we can apply
1409       some optimizations which share commonly used objects. */
1410
1411    /* Optimization for empty strings */
1412    if (size == 0 && unicode_empty != NULL) {
1413        Py_INCREF(unicode_empty);
1414        return unicode_empty;
1415    }
1416
1417    /* Single character Unicode objects in the Latin-1 range are
1418       shared when using this constructor */
1419    if (size == 1 && *u < 256)
1420        return get_latin1_char((unsigned char)*u);
1421
1422    /* If not empty and not single character, copy the Unicode data
1423       into the new object */
1424    if (find_maxchar_surrogates(u, u + size,
1425                                &maxchar, &num_surrogates) == -1)
1426        return NULL;
1427
1428    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1429                                                maxchar);
1430    if (!unicode)
1431        return NULL;
1432
1433    switch (PyUnicode_KIND(unicode)) {
1434    case PyUnicode_1BYTE_KIND:
1435        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1436                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1437        break;
1438    case PyUnicode_2BYTE_KIND:
1439#if Py_UNICODE_SIZE == 2
1440        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1441#else
1442        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1443                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1444#endif
1445        break;
1446    case PyUnicode_4BYTE_KIND:
1447#if SIZEOF_WCHAR_T == 2
1448        /* This is the only case which has to process surrogates, thus
1449           a simple copy loop is not enough and we need a function. */
1450        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1451#else
1452        assert(num_surrogates == 0);
1453        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1454#endif
1455        break;
1456    default:
1457        assert(0 && "Impossible state");
1458    }
1459
1460    return (PyObject *)unicode;
1461}
1462
1463PyObject *
1464PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1465{
1466    PyUnicodeObject *unicode;
1467
1468    if (size < 0) {
1469        PyErr_SetString(PyExc_SystemError,
1470                        "Negative size passed to PyUnicode_FromStringAndSize");
1471        return NULL;
1472    }
1473
1474    /* If the Unicode data is known at construction time, we can apply
1475       some optimizations which share commonly used objects.
1476       Also, this means the input must be UTF-8, so fall back to the
1477       UTF-8 decoder at the end. */
1478    if (u != NULL) {
1479
1480        /* Optimization for empty strings */
1481        if (size == 0 && unicode_empty != NULL) {
1482            Py_INCREF(unicode_empty);
1483            return unicode_empty;
1484        }
1485
1486        /* Single characters are shared when using this constructor.
1487           Restrict to ASCII, since the input must be UTF-8. */
1488        if (size == 1 && Py_CHARMASK(*u) < 128)
1489            return get_latin1_char(Py_CHARMASK(*u));
1490
1491        return PyUnicode_DecodeUTF8(u, size, NULL);
1492    }
1493
1494    unicode = _PyUnicode_New(size);
1495    if (!unicode)
1496        return NULL;
1497
1498    return (PyObject *)unicode;
1499}
1500
1501PyObject *
1502PyUnicode_FromString(const char *u)
1503{
1504    size_t size = strlen(u);
1505    if (size > PY_SSIZE_T_MAX) {
1506        PyErr_SetString(PyExc_OverflowError, "input too long");
1507        return NULL;
1508    }
1509
1510    return PyUnicode_FromStringAndSize(u, size);
1511}
1512
1513static PyObject*
1514unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1515{
1516    PyObject *res = PyUnicode_New(size, 127);
1517    if (!res)
1518        return NULL;
1519    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1520    return res;
1521}
1522
1523static Py_UCS4
1524kind_maxchar_limit(unsigned int kind)
1525{
1526    switch(kind) {
1527    case PyUnicode_1BYTE_KIND:
1528        return 0x80;
1529    case PyUnicode_2BYTE_KIND:
1530        return 0x100;
1531    case PyUnicode_4BYTE_KIND:
1532        return 0x10000;
1533    default:
1534        assert(0 && "invalid kind");
1535        return 0x10ffff;
1536    }
1537}
1538
1539static PyObject*
1540_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1541{
1542    PyObject *res;
1543    unsigned char max_char = 127;
1544    Py_ssize_t i;
1545
1546    assert(size >= 0);
1547    for (i = 0; i < size; i++) {
1548        if (u[i] & 0x80) {
1549            max_char = 255;
1550            break;
1551        }
1552    }
1553    res = PyUnicode_New(size, max_char);
1554    if (!res)
1555        return NULL;
1556    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1557    return res;
1558}
1559
1560static PyObject*
1561_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1562{
1563    PyObject *res;
1564    Py_UCS2 max_char = 0;
1565    Py_ssize_t i;
1566
1567    assert(size >= 0);
1568    for (i = 0; i < size; i++) {
1569        if (u[i] > max_char) {
1570            max_char = u[i];
1571            if (max_char >= 256)
1572                break;
1573        }
1574    }
1575    res = PyUnicode_New(size, max_char);
1576    if (!res)
1577        return NULL;
1578    if (max_char >= 256)
1579        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1580    else
1581        for (i = 0; i < size; i++)
1582            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1583    return res;
1584}
1585
1586static PyObject*
1587_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1588{
1589    PyObject *res;
1590    Py_UCS4 max_char = 0;
1591    Py_ssize_t i;
1592
1593    assert(size >= 0);
1594    for (i = 0; i < size; i++) {
1595        if (u[i] > max_char) {
1596            max_char = u[i];
1597            if (max_char >= 0x10000)
1598                break;
1599        }
1600    }
1601    res = PyUnicode_New(size, max_char);
1602    if (!res)
1603        return NULL;
1604    if (max_char >= 0x10000)
1605        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1606    else {
1607        int kind = PyUnicode_KIND(res);
1608        void *data = PyUnicode_DATA(res);
1609        for (i = 0; i < size; i++)
1610            PyUnicode_WRITE(kind, data, i, u[i]);
1611    }
1612    return res;
1613}
1614
1615PyObject*
1616PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1617{
1618    switch(kind) {
1619    case PyUnicode_1BYTE_KIND:
1620        return _PyUnicode_FromUCS1(buffer, size);
1621    case PyUnicode_2BYTE_KIND:
1622        return _PyUnicode_FromUCS2(buffer, size);
1623    case PyUnicode_4BYTE_KIND:
1624        return _PyUnicode_FromUCS4(buffer, size);
1625    default:
1626        assert(0 && "invalid kind");
1627        PyErr_SetString(PyExc_SystemError, "invalid kind");
1628        return NULL;
1629    }
1630}
1631
1632PyObject*
1633PyUnicode_Copy(PyObject *unicode)
1634{
1635    Py_ssize_t size;
1636    PyObject *copy;
1637    void *data;
1638
1639    if (!PyUnicode_Check(unicode)) {
1640        PyErr_BadInternalCall();
1641        return NULL;
1642    }
1643    if (PyUnicode_READY(unicode))
1644        return NULL;
1645
1646    size = PyUnicode_GET_LENGTH(unicode);
1647    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1648    if (!copy)
1649        return NULL;
1650    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1651
1652    data = PyUnicode_DATA(unicode);
1653    switch (PyUnicode_KIND(unicode))
1654    {
1655    case PyUnicode_1BYTE_KIND:
1656        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1657        break;
1658    case PyUnicode_2BYTE_KIND:
1659        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1660        break;
1661    case PyUnicode_4BYTE_KIND:
1662        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1663        break;
1664    default:
1665        assert(0);
1666        break;
1667    }
1668    return copy;
1669}
1670
1671
1672/* Widen Unicode objects to larger buffers. Don't write terminating null
1673   character. Return NULL on error. */
1674
1675void*
1676_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1677{
1678    Py_ssize_t len;
1679    void *result;
1680    unsigned int skind;
1681
1682    if (PyUnicode_READY(s))
1683        return NULL;
1684
1685    len = PyUnicode_GET_LENGTH(s);
1686    skind = PyUnicode_KIND(s);
1687    if (skind >= kind) {
1688        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1689        return NULL;
1690    }
1691    switch(kind) {
1692    case PyUnicode_2BYTE_KIND:
1693        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1694        if (!result)
1695            return PyErr_NoMemory();
1696        assert(skind == PyUnicode_1BYTE_KIND);
1697        _PyUnicode_CONVERT_BYTES(
1698            Py_UCS1, Py_UCS2,
1699            PyUnicode_1BYTE_DATA(s),
1700            PyUnicode_1BYTE_DATA(s) + len,
1701            result);
1702        return result;
1703    case PyUnicode_4BYTE_KIND:
1704        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1705        if (!result)
1706            return PyErr_NoMemory();
1707        if (skind == PyUnicode_2BYTE_KIND) {
1708            _PyUnicode_CONVERT_BYTES(
1709                Py_UCS2, Py_UCS4,
1710                PyUnicode_2BYTE_DATA(s),
1711                PyUnicode_2BYTE_DATA(s) + len,
1712                result);
1713        }
1714        else {
1715            assert(skind == PyUnicode_1BYTE_KIND);
1716            _PyUnicode_CONVERT_BYTES(
1717                Py_UCS1, Py_UCS4,
1718                PyUnicode_1BYTE_DATA(s),
1719                PyUnicode_1BYTE_DATA(s) + len,
1720                result);
1721        }
1722        return result;
1723    default:
1724        break;
1725    }
1726    PyErr_SetString(PyExc_SystemError, "invalid kind");
1727    return NULL;
1728}
1729
1730static Py_UCS4*
1731as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1732        int copy_null)
1733{
1734    int kind;
1735    void *data;
1736    Py_ssize_t len, targetlen;
1737    if (PyUnicode_READY(string) == -1)
1738        return NULL;
1739    kind = PyUnicode_KIND(string);
1740    data = PyUnicode_DATA(string);
1741    len = PyUnicode_GET_LENGTH(string);
1742    targetlen = len;
1743    if (copy_null)
1744        targetlen++;
1745    if (!target) {
1746        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1747            PyErr_NoMemory();
1748            return NULL;
1749        }
1750        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1751        if (!target) {
1752            PyErr_NoMemory();
1753            return NULL;
1754        }
1755    }
1756    else {
1757        if (targetsize < targetlen) {
1758            PyErr_Format(PyExc_SystemError,
1759                         "string is longer than the buffer");
1760            if (copy_null && 0 < targetsize)
1761                target[0] = 0;
1762            return NULL;
1763        }
1764    }
1765    if (kind != PyUnicode_4BYTE_KIND) {
1766        Py_ssize_t i;
1767        for (i = 0; i < len; i++)
1768            target[i] = PyUnicode_READ(kind, data, i);
1769    }
1770    else
1771        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1772    if (copy_null)
1773        target[len] = 0;
1774    return target;
1775}
1776
1777Py_UCS4*
1778PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1779                 int copy_null)
1780{
1781    if (target == NULL || targetsize < 1) {
1782        PyErr_BadInternalCall();
1783        return NULL;
1784    }
1785    return as_ucs4(string, target, targetsize, copy_null);
1786}
1787
1788Py_UCS4*
1789PyUnicode_AsUCS4Copy(PyObject *string)
1790{
1791    return as_ucs4(string, NULL, 0, 1);
1792}
1793
1794#ifdef HAVE_WCHAR_H
1795
1796PyObject *
1797PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1798{
1799    if (w == NULL) {
1800        if (size == 0)
1801            return PyUnicode_New(0, 0);
1802        PyErr_BadInternalCall();
1803        return NULL;
1804    }
1805
1806    if (size == -1) {
1807        size = wcslen(w);
1808    }
1809
1810    return PyUnicode_FromUnicode(w, size);
1811}
1812
1813#endif /* HAVE_WCHAR_H */
1814
1815static void
1816makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1817        int zeropad, int width, int precision, char c)
1818{
1819    *fmt++ = '%';
1820    if (width) {
1821        if (zeropad)
1822            *fmt++ = '0';
1823        fmt += sprintf(fmt, "%d", width);
1824    }
1825    if (precision)
1826        fmt += sprintf(fmt, ".%d", precision);
1827    if (longflag)
1828        *fmt++ = 'l';
1829    else if (longlongflag) {
1830        /* longlongflag should only ever be nonzero on machines with
1831           HAVE_LONG_LONG defined */
1832#ifdef HAVE_LONG_LONG
1833        char *f = PY_FORMAT_LONG_LONG;
1834        while (*f)
1835            *fmt++ = *f++;
1836#else
1837        /* we shouldn't ever get here */
1838        assert(0);
1839        *fmt++ = 'l';
1840#endif
1841    }
1842    else if (size_tflag) {
1843        char *f = PY_FORMAT_SIZE_T;
1844        while (*f)
1845            *fmt++ = *f++;
1846    }
1847    *fmt++ = c;
1848    *fmt = '\0';
1849}
1850
/* helper for PyUnicode_FromFormatV()

   Parse the width, precision and length-modifier part of one format
   directive.  `f` points at the first character of the directive, which
   is skipped without being examined.  Each p_* output pointer may be NULL
   if the caller does not need that value.

   Returns a pointer into the format string after the parsed part; for a
   truncated directive the pointer is moved back by one character so that
   it does not end up on the terminating null. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int longflag = 0;
    int longlongflag = 0;
    int size_tflag = 0;

    /* width.precision, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Length modifiers: %ld, %lu, %li, the "ll" variants where long long
       is available, and the size_t variants %zd, %zu, %zi.  A modifier is
       only consumed when an integer conversion follows it. */
    if (*f == 'z') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            size_tflag = 1;
            ++f;
        }
    }
    else if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }

    /* hand the results back through whichever pointers were supplied */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = longflag;
    if (p_longlongflag != NULL)
        *p_longlongflag = longlongflag;
    if (p_size_tflag != NULL)
        *p_size_tflag = size_tflag;
    return f;
}
1915
1916/* maximum number of characters required for output of %ld.  21 characters
1917   allows for 64-bit integers (in decimal) and an optional sign. */
1918#define MAX_LONG_CHARS 21
1919/* maximum number of characters required for output of %lld.
1920   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1921   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
1922#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1923
1924PyObject *
1925PyUnicode_FromFormatV(const char *format, va_list vargs)
1926{
1927    va_list count;
1928    Py_ssize_t callcount = 0;
1929    PyObject **callresults = NULL;
1930    PyObject **callresult = NULL;
1931    Py_ssize_t n = 0;
1932    int width = 0;
1933    int precision = 0;
1934    int zeropad;
1935    const char* f;
1936    PyUnicodeObject *string;
1937    /* used by sprintf */
1938    char fmt[61]; /* should be enough for %0width.precisionlld */
1939    Py_UCS4 maxchar = 127; /* result is ASCII by default */
1940    Py_UCS4 argmaxchar;
1941    Py_ssize_t numbersize = 0;
1942    char *numberresults = NULL;
1943    char *numberresult = NULL;
1944    Py_ssize_t i;
1945    int kind;
1946    void *data;
1947
1948    Py_VA_COPY(count, vargs);
1949    /* step 1: count the number of %S/%R/%A/%s format specifications
1950     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1951     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
1952     * result in an array)
1953     * also estimate a upper bound for all the number formats in the string,
1954     * numbers will be formatted in step 3 and be kept in a '\0'-separated
1955     * buffer before putting everything together. */
1956    for (f = format; *f; f++) {
1957        if (*f == '%') {
1958            int longlongflag;
1959            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1960            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1961            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1962                ++callcount;
1963
1964            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
1965#ifdef HAVE_LONG_LONG
1966                if (longlongflag) {
1967                    if (width < MAX_LONG_LONG_CHARS)
1968                        width = MAX_LONG_LONG_CHARS;
1969                }
1970                else
1971#endif
1972                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1973                       including sign.  Decimal takes the most space.  This
1974                       isn't enough for octal.  If a width is specified we
1975                       need more (which we allocate later). */
1976                    if (width < MAX_LONG_CHARS)
1977                        width = MAX_LONG_CHARS;
1978
1979                /* account for the size + '\0' to separate numbers
1980                   inside of the numberresults buffer */
1981                numbersize += (width + 1);
1982            }
1983        }
1984        else if ((unsigned char)*f > 127) {
1985            PyErr_Format(PyExc_ValueError,
1986                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1987                "string, got a non-ASCII byte: 0x%02x",
1988                (unsigned char)*f);
1989            return NULL;
1990        }
1991    }
1992    /* step 2: allocate memory for the results of
1993     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1994    if (callcount) {
1995        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1996        if (!callresults) {
1997            PyErr_NoMemory();
1998            return NULL;
1999        }
2000        callresult = callresults;
2001    }
2002    /* step 2.5: allocate memory for the results of formating numbers */
2003    if (numbersize) {
2004        numberresults = PyObject_Malloc(numbersize);
2005        if (!numberresults) {
2006            PyErr_NoMemory();
2007            goto fail;
2008        }
2009        numberresult = numberresults;
2010    }
2011
2012    /* step 3: format numbers and figure out how large a buffer we need */
2013    for (f = format; *f; f++) {
2014        if (*f == '%') {
2015            const char* p;
2016            int longflag;
2017            int longlongflag;
2018            int size_tflag;
2019            int numprinted;
2020
2021            p = f;
2022            zeropad = (f[1] == '0');
2023            f = parse_format_flags(f, &width, &precision,
2024                                   &longflag, &longlongflag, &size_tflag);
2025            switch (*f) {
2026            case 'c':
2027            {
2028                Py_UCS4 ordinal = va_arg(count, int);
2029                maxchar = Py_MAX(maxchar, ordinal);
2030                n++;
2031                break;
2032            }
2033            case '%':
2034                n++;
2035                break;
2036            case 'i':
2037            case 'd':
2038                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2039                        width, precision, *f);
2040                if (longflag)
2041                    numprinted = sprintf(numberresult, fmt,
2042                                         va_arg(count, long));
2043#ifdef HAVE_LONG_LONG
2044                else if (longlongflag)
2045                    numprinted = sprintf(numberresult, fmt,
2046                                         va_arg(count, PY_LONG_LONG));
2047#endif
2048                else if (size_tflag)
2049                    numprinted = sprintf(numberresult, fmt,
2050                                         va_arg(count, Py_ssize_t));
2051                else
2052                    numprinted = sprintf(numberresult, fmt,
2053                                         va_arg(count, int));
2054                n += numprinted;
2055                /* advance by +1 to skip over the '\0' */
2056                numberresult += (numprinted + 1);
2057                assert(*(numberresult - 1) == '\0');
2058                assert(*(numberresult - 2) != '\0');
2059                assert(numprinted >= 0);
2060                assert(numberresult <= numberresults + numbersize);
2061                break;
2062            case 'u':
2063                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2064                        width, precision, 'u');
2065                if (longflag)
2066                    numprinted = sprintf(numberresult, fmt,
2067                                         va_arg(count, unsigned long));
2068#ifdef HAVE_LONG_LONG
2069                else if (longlongflag)
2070                    numprinted = sprintf(numberresult, fmt,
2071                                         va_arg(count, unsigned PY_LONG_LONG));
2072#endif
2073                else if (size_tflag)
2074                    numprinted = sprintf(numberresult, fmt,
2075                                         va_arg(count, size_t));
2076                else
2077                    numprinted = sprintf(numberresult, fmt,
2078                                         va_arg(count, unsigned int));
2079                n += numprinted;
2080                numberresult += (numprinted + 1);
2081                assert(*(numberresult - 1) == '\0');
2082                assert(*(numberresult - 2) != '\0');
2083                assert(numprinted >= 0);
2084                assert(numberresult <= numberresults + numbersize);
2085                break;
2086            case 'x':
2087                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2088                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2089                n += numprinted;
2090                numberresult += (numprinted + 1);
2091                assert(*(numberresult - 1) == '\0');
2092                assert(*(numberresult - 2) != '\0');
2093                assert(numprinted >= 0);
2094                assert(numberresult <= numberresults + numbersize);
2095                break;
2096            case 'p':
2097                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2098                /* %p is ill-defined:  ensure leading 0x. */
2099                if (numberresult[1] == 'X')
2100                    numberresult[1] = 'x';
2101                else if (numberresult[1] != 'x') {
2102                    memmove(numberresult + 2, numberresult,
2103                            strlen(numberresult) + 1);
2104                    numberresult[0] = '0';
2105                    numberresult[1] = 'x';
2106                    numprinted += 2;
2107                }
2108                n += numprinted;
2109                numberresult += (numprinted + 1);
2110                assert(*(numberresult - 1) == '\0');
2111                assert(*(numberresult - 2) != '\0');
2112                assert(numprinted >= 0);
2113                assert(numberresult <= numberresults + numbersize);
2114                break;
2115            case 's':
2116            {
2117                /* UTF-8 */
2118                const char *s = va_arg(count, const char*);
2119                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2120                if (!str)
2121                    goto fail;
2122                /* since PyUnicode_DecodeUTF8 returns already flexible
2123                   unicode objects, there is no need to call ready on them */
2124                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2125                maxchar = Py_MAX(maxchar, argmaxchar);
2126                n += PyUnicode_GET_LENGTH(str);
2127                /* Remember the str and switch to the next slot */
2128                *callresult++ = str;
2129                break;
2130            }
2131            case 'U':
2132            {
2133                PyObject *obj = va_arg(count, PyObject *);
2134                assert(obj && _PyUnicode_CHECK(obj));
2135                if (PyUnicode_READY(obj) == -1)
2136                    goto fail;
2137                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2138                maxchar = Py_MAX(maxchar, argmaxchar);
2139                n += PyUnicode_GET_LENGTH(obj);
2140                break;
2141            }
2142            case 'V':
2143            {
2144                PyObject *obj = va_arg(count, PyObject *);
2145                const char *str = va_arg(count, const char *);
2146                PyObject *str_obj;
2147                assert(obj || str);
2148                assert(!obj || _PyUnicode_CHECK(obj));
2149                if (obj) {
2150                    if (PyUnicode_READY(obj) == -1)
2151                        goto fail;
2152                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2153                    maxchar = Py_MAX(maxchar, argmaxchar);
2154                    n += PyUnicode_GET_LENGTH(obj);
2155                    *callresult++ = NULL;
2156                }
2157                else {
2158                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2159                    if (!str_obj)
2160                        goto fail;
2161                    if (PyUnicode_READY(str_obj)) {
2162                        Py_DECREF(str_obj);
2163                        goto fail;
2164                    }
2165                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2166                    maxchar = Py_MAX(maxchar, argmaxchar);
2167                    n += PyUnicode_GET_LENGTH(str_obj);
2168                    *callresult++ = str_obj;
2169                }
2170                break;
2171            }
2172            case 'S':
2173            {
2174                PyObject *obj = va_arg(count, PyObject *);
2175                PyObject *str;
2176                assert(obj);
2177                str = PyObject_Str(obj);
2178                if (!str || PyUnicode_READY(str) == -1)
2179                    goto fail;
2180                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2181                maxchar = Py_MAX(maxchar, argmaxchar);
2182                n += PyUnicode_GET_LENGTH(str);
2183                /* Remember the str and switch to the next slot */
2184                *callresult++ = str;
2185                break;
2186            }
2187            case 'R':
2188            {
2189                PyObject *obj = va_arg(count, PyObject *);
2190                PyObject *repr;
2191                assert(obj);
2192                repr = PyObject_Repr(obj);
2193                if (!repr || PyUnicode_READY(repr) == -1)
2194                    goto fail;
2195                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2196                maxchar = Py_MAX(maxchar, argmaxchar);
2197                n += PyUnicode_GET_LENGTH(repr);
2198                /* Remember the repr and switch to the next slot */
2199                *callresult++ = repr;
2200                break;
2201            }
2202            case 'A':
2203            {
2204                PyObject *obj = va_arg(count, PyObject *);
2205                PyObject *ascii;
2206                assert(obj);
2207                ascii = PyObject_ASCII(obj);
2208                if (!ascii || PyUnicode_READY(ascii) == -1)
2209                    goto fail;
2210                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2211                maxchar = Py_MAX(maxchar, argmaxchar);
2212                n += PyUnicode_GET_LENGTH(ascii);
2213                /* Remember the repr and switch to the next slot */
2214                *callresult++ = ascii;
2215                break;
2216            }
2217            default:
2218                /* if we stumble upon an unknown
2219                   formatting code, copy the rest of
2220                   the format string to the output
2221                   string. (we cannot just skip the
2222                   code, since there's no way to know
2223                   what's in the argument list) */
2224                n += strlen(p);
2225                goto expand;
2226            }
2227        } else
2228            n++;
2229    }
2230  expand:
2231    /* step 4: fill the buffer */
2232    /* Since we've analyzed how much space we need,
2233       we don't have to resize the string.
2234       There can be no errors beyond this point. */
2235    string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
2236    if (!string)
2237        goto fail;
2238    kind = PyUnicode_KIND(string);
2239    data = PyUnicode_DATA(string);
2240    callresult = callresults;
2241    numberresult = numberresults;
2242
2243    for (i = 0, f = format; *f; f++) {
2244        if (*f == '%') {
2245            const char* p;
2246
2247            p = f;
2248            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2249            /* checking for == because the last argument could be a empty
2250               string, which causes i to point to end, the assert at the end of
2251               the loop */
2252            assert(i <= PyUnicode_GET_LENGTH(string));
2253
2254            switch (*f) {
2255            case 'c':
2256            {
2257                const int ordinal = va_arg(vargs, int);
2258                PyUnicode_WRITE(kind, data, i++, ordinal);
2259                break;
2260            }
2261            case 'i':
2262            case 'd':
2263            case 'u':
2264            case 'x':
2265            case 'p':
2266                /* unused, since we already have the result */
2267                if (*f == 'p')
2268                    (void) va_arg(vargs, void *);
2269                else
2270                    (void) va_arg(vargs, int);
2271                /* extract the result from numberresults and append. */
2272                for (; *numberresult; ++i, ++numberresult)
2273                    PyUnicode_WRITE(kind, data, i, *numberresult);
2274                /* skip over the separating '\0' */
2275                assert(*numberresult == '\0');
2276                numberresult++;
2277                assert(numberresult <= numberresults + numbersize);
2278                break;
2279            case 's':
2280            {
2281                /* unused, since we already have the result */
2282                Py_ssize_t size;
2283                (void) va_arg(vargs, char *);
2284                size = PyUnicode_GET_LENGTH(*callresult);
2285                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2286                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2287                                             *callresult, 0,
2288                                             size) < 0)
2289                    goto fail;
2290                i += size;
2291                /* We're done with the unicode()/repr() => forget it */
2292                Py_DECREF(*callresult);
2293                /* switch to next unicode()/repr() result */
2294                ++callresult;
2295                break;
2296            }
2297            case 'U':
2298            {
2299                PyObject *obj = va_arg(vargs, PyObject *);
2300                Py_ssize_t size;
2301                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2302                size = PyUnicode_GET_LENGTH(obj);
2303                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2304                                             obj, 0,
2305                                             size) < 0)
2306                    goto fail;
2307                i += size;
2308                break;
2309            }
2310            case 'V':
2311            {
2312                Py_ssize_t size;
2313                PyObject *obj = va_arg(vargs, PyObject *);
2314                va_arg(vargs, const char *);
2315                if (obj) {
2316                    size = PyUnicode_GET_LENGTH(obj);
2317                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2318                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2319                                                 obj, 0,
2320                                                 size) < 0)
2321                        goto fail;
2322                    i += size;
2323                } else {
2324                    size = PyUnicode_GET_LENGTH(*callresult);
2325                    assert(PyUnicode_KIND(*callresult) <=
2326                           PyUnicode_KIND(string));
2327                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2328                                                 *callresult,
2329                                                 0, size) < 0)
2330                        goto fail;
2331                    i += size;
2332                    Py_DECREF(*callresult);
2333                }
2334                ++callresult;
2335                break;
2336            }
2337            case 'S':
2338            case 'R':
2339            case 'A':
2340            {
2341                /* unused, since we already have the result */
2342                (void) va_arg(vargs, PyObject *);
2343                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2344                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2345                                             *callresult, 0,
2346                                             PyUnicode_GET_LENGTH(*callresult)) < 0)
2347                    goto fail;
2348                i += PyUnicode_GET_LENGTH(*callresult);
2349                /* We're done with the unicode()/repr() => forget it */
2350                Py_DECREF(*callresult);
2351                /* switch to next unicode()/repr() result */
2352                ++callresult;
2353                break;
2354            }
2355            case '%':
2356                PyUnicode_WRITE(kind, data, i++, '%');
2357                break;
2358            default:
2359                for (; *p; ++p, ++i)
2360                    PyUnicode_WRITE(kind, data, i, *p);
2361                assert(i == PyUnicode_GET_LENGTH(string));
2362                goto end;
2363            }
2364        }
2365        else {
2366            assert(i < PyUnicode_GET_LENGTH(string));
2367            PyUnicode_WRITE(kind, data, i++, *f);
2368        }
2369    }
2370    assert(i == PyUnicode_GET_LENGTH(string));
2371
2372  end:
2373    if (callresults)
2374        PyObject_Free(callresults);
2375    if (numberresults)
2376        PyObject_Free(numberresults);
2377    return (PyObject *)string;
2378  fail:
2379    if (callresults) {
2380        PyObject **callresult2 = callresults;
2381        while (callresult2 < callresult) {
2382            Py_XDECREF(*callresult2);
2383            ++callresult2;
2384        }
2385        PyObject_Free(callresults);
2386    }
2387    if (numberresults)
2388        PyObject_Free(numberresults);
2389    return NULL;
2390}
2391
2392PyObject *
2393PyUnicode_FromFormat(const char *format, ...)
2394{
2395    PyObject* ret;
2396    va_list vargs;
2397
2398#ifdef HAVE_STDARG_PROTOTYPES
2399    va_start(vargs, format);
2400#else
2401    va_start(vargs);
2402#endif
2403    ret = PyUnicode_FromFormatV(format, vargs);
2404    va_end(vargs);
2405    return ret;
2406}
2407
2408#ifdef HAVE_WCHAR_H
2409
2410/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2411   convert a Unicode object to a wide character string.
2412
2413   - If w is NULL: return the number of wide characters (including the null
2414     character) required to convert the unicode object. Ignore size argument.
2415
2416   - Otherwise: return the number of wide characters (excluding the null
2417     character) written into w. Write at most size wide characters (including
2418     the null character). */
2419static Py_ssize_t
2420unicode_aswidechar(PyUnicodeObject *unicode,
2421                   wchar_t *w,
2422                   Py_ssize_t size)
2423{
2424    Py_ssize_t res;
2425    const wchar_t *wstr;
2426
2427    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2428    if (wstr == NULL)
2429        return -1;
2430
2431    if (w != NULL) {
2432        if (size > res)
2433            size = res + 1;
2434        else
2435            res = size;
2436        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2437        return res;
2438    }
2439    else
2440        return res + 1;
2441}
2442
2443Py_ssize_t
2444PyUnicode_AsWideChar(PyObject *unicode,
2445                     wchar_t *w,
2446                     Py_ssize_t size)
2447{
2448    if (unicode == NULL) {
2449        PyErr_BadInternalCall();
2450        return -1;
2451    }
2452    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2453}
2454
2455wchar_t*
2456PyUnicode_AsWideCharString(PyObject *unicode,
2457                           Py_ssize_t *size)
2458{
2459    wchar_t* buffer;
2460    Py_ssize_t buflen;
2461
2462    if (unicode == NULL) {
2463        PyErr_BadInternalCall();
2464        return NULL;
2465    }
2466
2467    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2468    if (buflen == -1)
2469        return NULL;
2470    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2471        PyErr_NoMemory();
2472        return NULL;
2473    }
2474
2475    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2476    if (buffer == NULL) {
2477        PyErr_NoMemory();
2478        return NULL;
2479    }
2480    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2481    if (buflen == -1)
2482        return NULL;
2483    if (size != NULL)
2484        *size = buflen;
2485    return buffer;
2486}
2487
2488#endif /* HAVE_WCHAR_H */
2489
2490PyObject *
2491PyUnicode_FromOrdinal(int ordinal)
2492{
2493    PyObject *v;
2494    if (ordinal < 0 || ordinal > 0x10ffff) {
2495        PyErr_SetString(PyExc_ValueError,
2496                        "chr() arg not in range(0x110000)");
2497        return NULL;
2498    }
2499
2500    if (ordinal < 256)
2501        return get_latin1_char(ordinal);
2502
2503    v = PyUnicode_New(1, ordinal);
2504    if (v == NULL)
2505        return NULL;
2506    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2507    return v;
2508}
2509
2510PyObject *
2511PyUnicode_FromObject(register PyObject *obj)
2512{
2513    /* XXX Perhaps we should make this API an alias of
2514       PyObject_Str() instead ?! */
2515    if (PyUnicode_CheckExact(obj)) {
2516        if (PyUnicode_READY(obj))
2517            return NULL;
2518        Py_INCREF(obj);
2519        return obj;
2520    }
2521    if (PyUnicode_Check(obj)) {
2522        /* For a Unicode subtype that's not a Unicode object,
2523           return a true Unicode object with the same data. */
2524        return PyUnicode_Copy(obj);
2525    }
2526    PyErr_Format(PyExc_TypeError,
2527                 "Can't convert '%.100s' object to str implicitly",
2528                 Py_TYPE(obj)->tp_name);
2529    return NULL;
2530}
2531
2532PyObject *
2533PyUnicode_FromEncodedObject(register PyObject *obj,
2534                            const char *encoding,
2535                            const char *errors)
2536{
2537    Py_buffer buffer;
2538    PyObject *v;
2539
2540    if (obj == NULL) {
2541        PyErr_BadInternalCall();
2542        return NULL;
2543    }
2544
2545    /* Decoding bytes objects is the most common case and should be fast */
2546    if (PyBytes_Check(obj)) {
2547        if (PyBytes_GET_SIZE(obj) == 0) {
2548            Py_INCREF(unicode_empty);
2549            v = unicode_empty;
2550        }
2551        else {
2552            v = PyUnicode_Decode(
2553                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2554                    encoding, errors);
2555        }
2556        return v;
2557    }
2558
2559    if (PyUnicode_Check(obj)) {
2560        PyErr_SetString(PyExc_TypeError,
2561                        "decoding str is not supported");
2562        return NULL;
2563    }
2564
2565    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2566    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2567        PyErr_Format(PyExc_TypeError,
2568                     "coercing to str: need bytes, bytearray "
2569                     "or buffer-like object, %.80s found",
2570                     Py_TYPE(obj)->tp_name);
2571        return NULL;
2572    }
2573
2574    if (buffer.len == 0) {
2575        Py_INCREF(unicode_empty);
2576        v = unicode_empty;
2577    }
2578    else
2579        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2580
2581    PyBuffer_Release(&buffer);
2582    return v;
2583}
2584
/* Write a normalized copy of encoding into lower: upper case letters are
   converted to lower case and '_' is replaced with '-', so that e.g. UTF_8
   is recognised.  lower_len is the size of the lower buffer.

   Return 1 on success.  Return 0 if encoding does not fit, i.e. it is
   longer than lower_len-1 characters; lower is not null-terminated then. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last position of the buffer, reserved for the '\0' */
    char *const limit = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == limit)
            return 0;

        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2617
2618PyObject *
2619PyUnicode_Decode(const char *s,
2620                 Py_ssize_t size,
2621                 const char *encoding,
2622                 const char *errors)
2623{
2624    PyObject *buffer = NULL, *unicode;
2625    Py_buffer info;
2626    char lower[11];  /* Enough for any encoding shortcut */
2627
2628    if (encoding == NULL)
2629        return PyUnicode_DecodeUTF8(s, size, errors);
2630
2631    /* Shortcuts for common default encodings */
2632    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2633        if ((strcmp(lower, "utf-8") == 0) ||
2634            (strcmp(lower, "utf8") == 0))
2635            return PyUnicode_DecodeUTF8(s, size, errors);
2636        else if ((strcmp(lower, "latin-1") == 0) ||
2637                 (strcmp(lower, "latin1") == 0) ||
2638                 (strcmp(lower, "iso-8859-1") == 0))
2639            return PyUnicode_DecodeLatin1(s, size, errors);
2640#ifdef HAVE_MBCS
2641        else if (strcmp(lower, "mbcs") == 0)
2642            return PyUnicode_DecodeMBCS(s, size, errors);
2643#endif
2644        else if (strcmp(lower, "ascii") == 0)
2645            return PyUnicode_DecodeASCII(s, size, errors);
2646        else if (strcmp(lower, "utf-16") == 0)
2647            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2648        else if (strcmp(lower, "utf-32") == 0)
2649            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2650    }
2651
2652    /* Decode via the codec registry */
2653    buffer = NULL;
2654    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2655        goto onError;
2656    buffer = PyMemoryView_FromBuffer(&info);
2657    if (buffer == NULL)
2658        goto onError;
2659    unicode = PyCodec_Decode(buffer, encoding, errors);
2660    if (unicode == NULL)
2661        goto onError;
2662    if (!PyUnicode_Check(unicode)) {
2663        PyErr_Format(PyExc_TypeError,
2664                     "decoder did not return a str object (type=%.400s)",
2665                     Py_TYPE(unicode)->tp_name);
2666        Py_DECREF(unicode);
2667        goto onError;
2668    }
2669    Py_DECREF(buffer);
2670#ifndef DONT_MAKE_RESULT_READY
2671    if (_PyUnicode_READY_REPLACE(&unicode)) {
2672        Py_DECREF(unicode);
2673        return NULL;
2674    }
2675#endif
2676    return unicode;
2677
2678  onError:
2679    Py_XDECREF(buffer);
2680    return NULL;
2681}
2682
2683PyObject *
2684PyUnicode_AsDecodedObject(PyObject *unicode,
2685                          const char *encoding,
2686                          const char *errors)
2687{
2688    PyObject *v;
2689
2690    if (!PyUnicode_Check(unicode)) {
2691        PyErr_BadArgument();
2692        goto onError;
2693    }
2694
2695    if (encoding == NULL)
2696        encoding = PyUnicode_GetDefaultEncoding();
2697
2698    /* Decode via the codec registry */
2699    v = PyCodec_Decode(unicode, encoding, errors);
2700    if (v == NULL)
2701        goto onError;
2702    return v;
2703
2704  onError:
2705    return NULL;
2706}
2707
2708PyObject *
2709PyUnicode_AsDecodedUnicode(PyObject *unicode,
2710                           const char *encoding,
2711                           const char *errors)
2712{
2713    PyObject *v;
2714
2715    if (!PyUnicode_Check(unicode)) {
2716        PyErr_BadArgument();
2717        goto onError;
2718    }
2719
2720    if (encoding == NULL)
2721        encoding = PyUnicode_GetDefaultEncoding();
2722
2723    /* Decode via the codec registry */
2724    v = PyCodec_Decode(unicode, encoding, errors);
2725    if (v == NULL)
2726        goto onError;
2727    if (!PyUnicode_Check(v)) {
2728        PyErr_Format(PyExc_TypeError,
2729                     "decoder did not return a str object (type=%.400s)",
2730                     Py_TYPE(v)->tp_name);
2731        Py_DECREF(v);
2732        goto onError;
2733    }
2734    return v;
2735
2736  onError:
2737    return NULL;
2738}
2739
2740PyObject *
2741PyUnicode_Encode(const Py_UNICODE *s,
2742                 Py_ssize_t size,
2743                 const char *encoding,
2744                 const char *errors)
2745{
2746    PyObject *v, *unicode;
2747
2748    unicode = PyUnicode_FromUnicode(s, size);
2749    if (unicode == NULL)
2750        return NULL;
2751    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2752    Py_DECREF(unicode);
2753    return v;
2754}
2755
2756PyObject *
2757PyUnicode_AsEncodedObject(PyObject *unicode,
2758                          const char *encoding,
2759                          const char *errors)
2760{
2761    PyObject *v;
2762
2763    if (!PyUnicode_Check(unicode)) {
2764        PyErr_BadArgument();
2765        goto onError;
2766    }
2767
2768    if (encoding == NULL)
2769        encoding = PyUnicode_GetDefaultEncoding();
2770
2771    /* Encode via the codec registry */
2772    v = PyCodec_Encode(unicode, encoding, errors);
2773    if (v == NULL)
2774        goto onError;
2775    return v;
2776
2777  onError:
2778    return NULL;
2779}
2780
2781PyObject *
2782PyUnicode_EncodeFSDefault(PyObject *unicode)
2783{
2784#ifdef HAVE_MBCS
2785    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2786                                PyUnicode_GET_SIZE(unicode),
2787                                NULL);
2788#elif defined(__APPLE__)
2789    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2790#else
2791    PyInterpreterState *interp = PyThreadState_GET()->interp;
2792    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2793       cannot use it to encode and decode filenames before it is loaded. Load
2794       the Python codec requires to encode at least its own filename. Use the C
2795       version of the locale codec until the codec registry is initialized and
2796       the Python codec is loaded.
2797
2798       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2799       cannot only rely on it: check also interp->fscodec_initialized for
2800       subinterpreters. */
2801    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2802        return PyUnicode_AsEncodedString(unicode,
2803                                         Py_FileSystemDefaultEncoding,
2804                                         "surrogateescape");
2805    }
2806    else {
2807        /* locale encoding with surrogateescape */
2808        wchar_t *wchar;
2809        char *bytes;
2810        PyObject *bytes_obj;
2811        size_t error_pos;
2812
2813        wchar = PyUnicode_AsWideCharString(unicode, NULL);
2814        if (wchar == NULL)
2815            return NULL;
2816        bytes = _Py_wchar2char(wchar, &error_pos);
2817        if (bytes == NULL) {
2818            if (error_pos != (size_t)-1) {
2819                char *errmsg = strerror(errno);
2820                PyObject *exc = NULL;
2821                if (errmsg == NULL)
2822                    errmsg = "Py_wchar2char() failed";
2823                raise_encode_exception(&exc,
2824                    "filesystemencoding",
2825                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2826                    error_pos, error_pos+1,
2827                    errmsg);
2828                Py_XDECREF(exc);
2829            }
2830            else
2831                PyErr_NoMemory();
2832            PyMem_Free(wchar);
2833            return NULL;
2834        }
2835        PyMem_Free(wchar);
2836
2837        bytes_obj = PyBytes_FromString(bytes);
2838        PyMem_Free(bytes);
2839        return bytes_obj;
2840    }
2841#endif
2842}
2843
2844PyObject *
2845PyUnicode_AsEncodedString(PyObject *unicode,
2846                          const char *encoding,
2847                          const char *errors)
2848{
2849    PyObject *v;
2850    char lower[11];  /* Enough for any encoding shortcut */
2851
2852    if (!PyUnicode_Check(unicode)) {
2853        PyErr_BadArgument();
2854        return NULL;
2855    }
2856
2857    if (encoding == NULL) {
2858        if (errors == NULL || strcmp(errors, "strict") == 0)
2859            return _PyUnicode_AsUTF8String(unicode, NULL);
2860        else
2861            return _PyUnicode_AsUTF8String(unicode, errors);
2862    }
2863
2864    /* Shortcuts for common default encodings */
2865    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2866        if ((strcmp(lower, "utf-8") == 0) ||
2867            (strcmp(lower, "utf8") == 0))
2868        {
2869            if (errors == NULL || strcmp(errors, "strict") == 0)
2870                return _PyUnicode_AsUTF8String(unicode, NULL);
2871            else
2872                return _PyUnicode_AsUTF8String(unicode, errors);
2873        }
2874        else if ((strcmp(lower, "latin-1") == 0) ||
2875                 (strcmp(lower, "latin1") == 0) ||
2876                 (strcmp(lower, "iso-8859-1") == 0))
2877            return _PyUnicode_AsLatin1String(unicode, errors);
2878#ifdef HAVE_MBCS
2879        else if (strcmp(lower, "mbcs") == 0)
2880            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2881                                        PyUnicode_GET_SIZE(unicode),
2882                                        errors);
2883#endif
2884        else if (strcmp(lower, "ascii") == 0)
2885            return _PyUnicode_AsASCIIString(unicode, errors);
2886    }
2887
2888    /* Encode via the codec registry */
2889    v = PyCodec_Encode(unicode, encoding, errors);
2890    if (v == NULL)
2891        return NULL;
2892
2893    /* The normal path */
2894    if (PyBytes_Check(v))
2895        return v;
2896
2897    /* If the codec returns a buffer, raise a warning and convert to bytes */
2898    if (PyByteArray_Check(v)) {
2899        int error;
2900        PyObject *b;
2901
2902        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2903            "encoder %s returned bytearray instead of bytes",
2904            encoding);
2905        if (error) {
2906            Py_DECREF(v);
2907            return NULL;
2908        }
2909
2910        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2911        Py_DECREF(v);
2912        return b;
2913    }
2914
2915    PyErr_Format(PyExc_TypeError,
2916                 "encoder did not return a bytes object (type=%.400s)",
2917                 Py_TYPE(v)->tp_name);
2918    Py_DECREF(v);
2919    return NULL;
2920}
2921
2922PyObject *
2923PyUnicode_AsEncodedUnicode(PyObject *unicode,
2924                           const char *encoding,
2925                           const char *errors)
2926{
2927    PyObject *v;
2928
2929    if (!PyUnicode_Check(unicode)) {
2930        PyErr_BadArgument();
2931        goto onError;
2932    }
2933
2934    if (encoding == NULL)
2935        encoding = PyUnicode_GetDefaultEncoding();
2936
2937    /* Encode via the codec registry */
2938    v = PyCodec_Encode(unicode, encoding, errors);
2939    if (v == NULL)
2940        goto onError;
2941    if (!PyUnicode_Check(v)) {
2942        PyErr_Format(PyExc_TypeError,
2943                     "encoder did not return an str object (type=%.400s)",
2944                     Py_TYPE(v)->tp_name);
2945        Py_DECREF(v);
2946        goto onError;
2947    }
2948    return v;
2949
2950  onError:
2951    return NULL;
2952}
2953
2954PyObject*
2955PyUnicode_DecodeFSDefault(const char *s) {
2956    Py_ssize_t size = (Py_ssize_t)strlen(s);
2957    return PyUnicode_DecodeFSDefaultAndSize(s, size);
2958}
2959
2960PyObject*
2961PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2962{
2963#ifdef HAVE_MBCS
2964    return PyUnicode_DecodeMBCS(s, size, NULL);
2965#elif defined(__APPLE__)
2966    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2967#else
2968    PyInterpreterState *interp = PyThreadState_GET()->interp;
2969    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2970       cannot use it to encode and decode filenames before it is loaded. Load
2971       the Python codec requires to encode at least its own filename. Use the C
2972       version of the locale codec until the codec registry is initialized and
2973       the Python codec is loaded.
2974
2975       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2976       cannot only rely on it: check also interp->fscodec_initialized for
2977       subinterpreters. */
2978    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2979        return PyUnicode_Decode(s, size,
2980                                Py_FileSystemDefaultEncoding,
2981                                "surrogateescape");
2982    }
2983    else {
2984        /* locale encoding with surrogateescape */
2985        wchar_t *wchar;
2986        PyObject *unicode;
2987        size_t len;
2988
2989        if (s[size] != '\0' || size != strlen(s)) {
2990            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2991            return NULL;
2992        }
2993
2994        wchar = _Py_char2wchar(s, &len);
2995        if (wchar == NULL)
2996            return PyErr_NoMemory();
2997
2998        unicode = PyUnicode_FromWideChar(wchar, len);
2999        PyMem_Free(wchar);
3000        return unicode;
3001    }
3002#endif
3003}
3004
3005
3006int
3007PyUnicode_FSConverter(PyObject* arg, void* addr)
3008{
3009    PyObject *output = NULL;
3010    Py_ssize_t size;
3011    void *data;
3012    if (arg == NULL) {
3013        Py_DECREF(*(PyObject**)addr);
3014        return 1;
3015    }
3016    if (PyBytes_Check(arg)) {
3017        output = arg;
3018        Py_INCREF(output);
3019    }
3020    else {
3021        arg = PyUnicode_FromObject(arg);
3022        if (!arg)
3023            return 0;
3024        output = PyUnicode_EncodeFSDefault(arg);
3025        Py_DECREF(arg);
3026        if (!output)
3027            return 0;
3028        if (!PyBytes_Check(output)) {
3029            Py_DECREF(output);
3030            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3031            return 0;
3032        }
3033    }
3034    size = PyBytes_GET_SIZE(output);
3035    data = PyBytes_AS_STRING(output);
3036    if (size != strlen(data)) {
3037        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3038        Py_DECREF(output);
3039        return 0;
3040    }
3041    *(PyObject**)addr = output;
3042    return Py_CLEANUP_SUPPORTED;
3043}
3044
3045
3046int
3047PyUnicode_FSDecoder(PyObject* arg, void* addr)
3048{
3049    PyObject *output = NULL;
3050    if (arg == NULL) {
3051        Py_DECREF(*(PyObject**)addr);
3052        return 1;
3053    }
3054    if (PyUnicode_Check(arg)) {
3055        if (PyUnicode_READY(arg))
3056            return 0;
3057        output = arg;
3058        Py_INCREF(output);
3059    }
3060    else {
3061        arg = PyBytes_FromObject(arg);
3062        if (!arg)
3063            return 0;
3064        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3065                                                  PyBytes_GET_SIZE(arg));
3066        Py_DECREF(arg);
3067        if (!output)
3068            return 0;
3069        if (!PyUnicode_Check(output)) {
3070            Py_DECREF(output);
3071            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3072            return 0;
3073        }
3074    }
3075    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3076                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3077        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3078        Py_DECREF(output);
3079        return 0;
3080    }
3081    *(PyObject**)addr = output;
3082    return Py_CLEANUP_SUPPORTED;
3083}
3084
3085
3086char*
3087PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3088{
3089    PyObject *bytes;
3090    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3091
3092    if (!PyUnicode_Check(unicode)) {
3093        PyErr_BadArgument();
3094        return NULL;
3095    }
3096    if (PyUnicode_READY(u) == -1)
3097        return NULL;
3098
3099    if (PyUnicode_UTF8(unicode) == NULL) {
3100        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3101        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3102        if (bytes == NULL)
3103            return NULL;
3104        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3105        if (_PyUnicode_UTF8(u) == NULL) {
3106            Py_DECREF(bytes);
3107            return NULL;
3108        }
3109        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3110        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3111        Py_DECREF(bytes);
3112    }
3113
3114    if (psize)
3115        *psize = PyUnicode_UTF8_LENGTH(unicode);
3116    return PyUnicode_UTF8(unicode);
3117}
3118
3119char*
3120PyUnicode_AsUTF8(PyObject *unicode)
3121{
3122    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3123}
3124
/* Debug-build counter: incremented by PyUnicode_AsUnicodeAndSize() each
   time it finds no wchar_t representation on the object and has to build
   one. */
#ifdef Py_DEBUG
int unicode_as_unicode_calls = 0;
#endif
3128
3129
3130Py_UNICODE *
3131PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3132{
3133    PyUnicodeObject *u;
3134    const unsigned char *one_byte;
3135#if SIZEOF_WCHAR_T == 4
3136    const Py_UCS2 *two_bytes;
3137#else
3138    const Py_UCS4 *four_bytes;
3139    const Py_UCS4 *ucs4_end;
3140    Py_ssize_t num_surrogates;
3141#endif
3142    wchar_t *w;
3143    wchar_t *wchar_end;
3144
3145    if (!PyUnicode_Check(unicode)) {
3146        PyErr_BadArgument();
3147        return NULL;
3148    }
3149    u = (PyUnicodeObject*)unicode;
3150    if (_PyUnicode_WSTR(u) == NULL) {
3151        /* Non-ASCII compact unicode object */
3152        assert(_PyUnicode_KIND(u) != 0);
3153        assert(PyUnicode_IS_READY(u));
3154
3155#ifdef Py_DEBUG
3156        ++unicode_as_unicode_calls;
3157#endif
3158
3159        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3160#if SIZEOF_WCHAR_T == 2
3161            four_bytes = PyUnicode_4BYTE_DATA(u);
3162            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3163            num_surrogates = 0;
3164
3165            for (; four_bytes < ucs4_end; ++four_bytes) {
3166                if (*four_bytes > 0xFFFF)
3167                    ++num_surrogates;
3168            }
3169
3170            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3171                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3172            if (!_PyUnicode_WSTR(u)) {
3173                PyErr_NoMemory();
3174                return NULL;
3175            }
3176            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3177
3178            w = _PyUnicode_WSTR(u);
3179            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3180            four_bytes = PyUnicode_4BYTE_DATA(u);
3181            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3182                if (*four_bytes > 0xFFFF) {
3183                    /* encode surrogate pair in this case */
3184                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3185                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3186                }
3187                else
3188                    *w = *four_bytes;
3189
3190                if (w > wchar_end) {
3191                    assert(0 && "Miscalculated string end");
3192                }
3193            }
3194            *w = 0;
3195#else
3196            /* sizeof(wchar_t) == 4 */
3197            Py_FatalError("Impossible unicode object state, wstr and str "
3198                          "should share memory already.");
3199            return NULL;
3200#endif
3201        }
3202        else {
3203            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3204                                                  (_PyUnicode_LENGTH(u) + 1));
3205            if (!_PyUnicode_WSTR(u)) {
3206                PyErr_NoMemory();
3207                return NULL;
3208            }
3209            if (!PyUnicode_IS_COMPACT_ASCII(u))
3210                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3211            w = _PyUnicode_WSTR(u);
3212            wchar_end = w + _PyUnicode_LENGTH(u);
3213
3214            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3215                one_byte = PyUnicode_1BYTE_DATA(u);
3216                for (; w < wchar_end; ++one_byte, ++w)
3217                    *w = *one_byte;
3218                /* null-terminate the wstr */
3219                *w = 0;
3220            }
3221            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3222#if SIZEOF_WCHAR_T == 4
3223                two_bytes = PyUnicode_2BYTE_DATA(u);
3224                for (; w < wchar_end; ++two_bytes, ++w)
3225                    *w = *two_bytes;
3226                /* null-terminate the wstr */
3227                *w = 0;
3228#else
3229                /* sizeof(wchar_t) == 2 */
3230                PyObject_FREE(_PyUnicode_WSTR(u));
3231                _PyUnicode_WSTR(u) = NULL;
3232                Py_FatalError("Impossible unicode object state, wstr "
3233                              "and str should share memory already.");
3234                return NULL;
3235#endif
3236            }
3237            else {
3238                assert(0 && "This should never happen.");
3239            }
3240        }
3241    }
3242    if (size != NULL)
3243        *size = PyUnicode_WSTR_LENGTH(u);
3244    return _PyUnicode_WSTR(u);
3245}
3246
3247Py_UNICODE *
3248PyUnicode_AsUnicode(PyObject *unicode)
3249{
3250    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3251}
3252
3253
3254Py_ssize_t
3255PyUnicode_GetSize(PyObject *unicode)
3256{
3257    if (!PyUnicode_Check(unicode)) {
3258        PyErr_BadArgument();
3259        goto onError;
3260    }
3261    return PyUnicode_GET_SIZE(unicode);
3262
3263  onError:
3264    return -1;
3265}
3266
3267Py_ssize_t
3268PyUnicode_GetLength(PyObject *unicode)
3269{
3270    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3271        PyErr_BadArgument();
3272        return -1;
3273    }
3274
3275    return PyUnicode_GET_LENGTH(unicode);
3276}
3277
3278Py_UCS4
3279PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3280{
3281    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3282        PyErr_BadArgument();
3283        return (Py_UCS4)-1;
3284    }
3285    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3286        PyErr_SetString(PyExc_IndexError, "string index out of range");
3287        return (Py_UCS4)-1;
3288    }
3289    return PyUnicode_READ_CHAR(unicode, index);
3290}
3291
3292int
3293PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3294{
3295    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3296        PyErr_BadArgument();
3297        return -1;
3298    }
3299    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3300        PyErr_SetString(PyExc_IndexError, "string index out of range");
3301        return -1;
3302    }
3303    if (_PyUnicode_Dirty(unicode))
3304        return -1;
3305    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3306                    index, ch);
3307    return 0;
3308}
3309
/* Return the name of the default encoding, which is always "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3315
3316/* create or adjust a UnicodeDecodeError */
3317static void
3318make_decode_exception(PyObject **exceptionObject,
3319                      const char *encoding,
3320                      const char *input, Py_ssize_t length,
3321                      Py_ssize_t startpos, Py_ssize_t endpos,
3322                      const char *reason)
3323{
3324    if (*exceptionObject == NULL) {
3325        *exceptionObject = PyUnicodeDecodeError_Create(
3326            encoding, input, length, startpos, endpos, reason);
3327    }
3328    else {
3329        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3330            goto onError;
3331        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3332            goto onError;
3333        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3334            goto onError;
3335    }
3336    return;
3337
3338onError:
3339    Py_DECREF(*exceptionObject);
3340    *exceptionObject = NULL;
3341}
3342
3343/* error handling callback helper:
3344   build arguments, call the callback and check the arguments,
3345   if no exception occurred, copy the replacement to the output
3346   and adjust various state variables.
3347   return 0 on success, -1 on error
3348*/
3349
3350static int
3351unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3352                                 const char *encoding, const char *reason,
3353                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3354                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3355                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3356{
3357    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3358
3359    PyObject *restuple = NULL;
3360    PyObject *repunicode = NULL;
3361    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3362    Py_ssize_t insize;
3363    Py_ssize_t requiredsize;
3364    Py_ssize_t newpos;
3365    const Py_UNICODE *repptr;
3366    PyObject *inputobj = NULL;
3367    Py_ssize_t repsize;
3368    int res = -1;
3369
3370    if (*errorHandler == NULL) {
3371        *errorHandler = PyCodec_LookupError(errors);
3372        if (*errorHandler == NULL)
3373            goto onError;
3374    }
3375
3376    make_decode_exception(exceptionObject,
3377        encoding,
3378        *input, *inend - *input,
3379        *startinpos, *endinpos,
3380        reason);
3381    if (*exceptionObject == NULL)
3382        goto onError;
3383
3384    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3385    if (restuple == NULL)
3386        goto onError;
3387    if (!PyTuple_Check(restuple)) {
3388        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3389        goto onError;
3390    }
3391    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3392        goto onError;
3393
3394    /* Copy back the bytes variables, which might have been modified by the
3395       callback */
3396    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3397    if (!inputobj)
3398        goto onError;
3399    if (!PyBytes_Check(inputobj)) {
3400        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3401    }
3402    *input = PyBytes_AS_STRING(inputobj);
3403    insize = PyBytes_GET_SIZE(inputobj);
3404    *inend = *input + insize;
3405    /* we can DECREF safely, as the exception has another reference,
3406       so the object won't go away. */
3407    Py_DECREF(inputobj);
3408
3409    if (newpos<0)
3410        newpos = insize+newpos;
3411    if (newpos<0 || newpos>insize) {
3412        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3413        goto onError;
3414    }
3415
3416    /* need more space? (at least enough for what we
3417       have+the replacement+the rest of the string (starting
3418       at the new input position), so we won't have to check space
3419       when there are no errors in the rest of the string) */
3420    repptr = PyUnicode_AS_UNICODE(repunicode);
3421    repsize = PyUnicode_GET_SIZE(repunicode);
3422    requiredsize = *outpos + repsize + insize-newpos;
3423    if (requiredsize > outsize) {
3424        if (requiredsize<2*outsize)
3425            requiredsize = 2*outsize;
3426        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3427            goto onError;
3428        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3429    }
3430    *endinpos = newpos;
3431    *inptr = *input + newpos;
3432    Py_UNICODE_COPY(*outptr, repptr, repsize);
3433    *outptr += repsize;
3434    *outpos += repsize;
3435
3436    /* we made it! */
3437    res = 0;
3438
3439  onError:
3440    Py_XDECREF(restuple);
3441    return res;
3442}
3443
3444/* --- UTF-7 Codec -------------------------------------------------------- */
3445
3446/* See RFC2152 for details.  We encode conservatively and decode liberally. */
3447
/* Three simple macros defining base-64.  IS_BASE64 and FROM_BASE64 expand
   their argument several times, so it must be free of side effects. */

/* Is c a base-64 character?  (A-Z, a-z, 0-9, '+' or '/') */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' give 0-25, 'a'-'z' give 26-51, '0'-'9' give 52-61 and '+' gives
   62; every other input (only '/' under the stated precondition) gives
   63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?
   (n is masked with 0x3f before indexing the 64-character alphabet.) */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  The argument is expanded twice. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
3478
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table has one entry per ASCII code (0..127), laid out in rows of 16
 * with the characters of each row named in the comment above it.  It is
 * read by the ENCODE_DIRECT macro.
 */

static
char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
3512
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 *   c        -- the character code to classify; expanded several times,
 *               so it must be free of side effects
 *   directO  -- non-zero: Set O characters (category 1) are direct
 *   directWS -- non-zero: whitespace characters (category 2) are direct
 *
 * Set D characters (category 0) are always direct.  Codes outside 1..127
 * are never direct: the range test comes first, so utf7_category is only
 * indexed with values inside the table. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3524
3525PyObject *
3526PyUnicode_DecodeUTF7(const char *s,
3527                     Py_ssize_t size,
3528                     const char *errors)
3529{
3530    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3531}
3532
3533/* The decoder.  The only state we preserve is our read position,
3534 * i.e. how many characters we have consumed.  So if we end in the
3535 * middle of a shift sequence we have to back off the read position
3536 * and the output to the beginning of the sequence, otherwise we lose
3537 * all the shift state (seen bits, number of bits seen, high
3538 * surrogate). */
3539
3540PyObject *
3541PyUnicode_DecodeUTF7Stateful(const char *s,
3542                             Py_ssize_t size,
3543                             const char *errors,
3544                             Py_ssize_t *consumed)
3545{
3546    const char *starts = s;
3547    Py_ssize_t startinpos;
3548    Py_ssize_t endinpos;
3549    Py_ssize_t outpos;
3550    const char *e;
3551    PyUnicodeObject *unicode;
3552    Py_UNICODE *p;
3553    const char *errmsg = "";
3554    int inShift = 0;
3555    Py_UNICODE *shiftOutStart;
3556    unsigned int base64bits = 0;
3557    unsigned long base64buffer = 0;
3558    Py_UNICODE surrogate = 0;
3559    PyObject *errorHandler = NULL;
3560    PyObject *exc = NULL;
3561
3562    unicode = _PyUnicode_New(size);
3563    if (!unicode)
3564        return NULL;
3565    if (size == 0) {
3566        if (consumed)
3567            *consumed = 0;
3568        return (PyObject *)unicode;
3569    }
3570
3571    p = PyUnicode_AS_UNICODE(unicode);
3572    shiftOutStart = p;
3573    e = s + size;
3574
3575    while (s < e) {
3576        Py_UNICODE ch;
3577      restart:
3578        ch = (unsigned char) *s;
3579
3580        if (inShift) { /* in a base-64 section */
3581            if (IS_BASE64(ch)) { /* consume a base-64 character */
3582                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3583                base64bits += 6;
3584                s++;
3585                if (base64bits >= 16) {
3586                    /* we have enough bits for a UTF-16 value */
3587                    Py_UNICODE outCh = (Py_UNICODE)
3588                                       (base64buffer >> (base64bits-16));
3589                    base64bits -= 16;
3590                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3591                    if (surrogate) {
3592                        /* expecting a second surrogate */
3593                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3594#ifdef Py_UNICODE_WIDE
3595                            *p++ = (((surrogate & 0x3FF)<<10)
3596                                    | (outCh & 0x3FF)) + 0x10000;
3597#else
3598                            *p++ = surrogate;
3599                            *p++ = outCh;
3600#endif
3601                            surrogate = 0;
3602                        }
3603                        else {
3604                            surrogate = 0;
3605                            errmsg = "second surrogate missing";
3606                            goto utf7Error;
3607                        }
3608                    }
3609                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3610                        /* first surrogate */
3611                        surrogate = outCh;
3612                    }
3613                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3614                        errmsg = "unexpected second surrogate";
3615                        goto utf7Error;
3616                    }
3617                    else {
3618                        *p++ = outCh;
3619                    }
3620                }
3621            }
3622            else { /* now leaving a base-64 section */
3623                inShift = 0;
3624                s++;
3625                if (surrogate) {
3626                    errmsg = "second surrogate missing at end of shift sequence";
3627                    goto utf7Error;
3628                }
3629                if (base64bits > 0) { /* left-over bits */
3630                    if (base64bits >= 6) {
3631                        /* We've seen at least one base-64 character */
3632                        errmsg = "partial character in shift sequence";
3633                        goto utf7Error;
3634                    }
3635                    else {
3636                        /* Some bits remain; they should be zero */
3637                        if (base64buffer != 0) {
3638                            errmsg = "non-zero padding bits in shift sequence";
3639                            goto utf7Error;
3640                        }
3641                    }
3642                }
3643                if (ch != '-') {
3644                    /* '-' is absorbed; other terminating
3645                       characters are preserved */
3646                    *p++ = ch;
3647                }
3648            }
3649        }
3650        else if ( ch == '+' ) {
3651            startinpos = s-starts;
3652            s++; /* consume '+' */
3653            if (s < e && *s == '-') { /* '+-' encodes '+' */
3654                s++;
3655                *p++ = '+';
3656            }
3657            else { /* begin base64-encoded section */
3658                inShift = 1;
3659                shiftOutStart = p;
3660                base64bits = 0;
3661            }
3662        }
3663        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3664            *p++ = ch;
3665            s++;
3666        }
3667        else {
3668            startinpos = s-starts;
3669            s++;
3670            errmsg = "unexpected special character";
3671            goto utf7Error;
3672        }
3673        continue;
3674utf7Error:
3675        outpos = p-PyUnicode_AS_UNICODE(unicode);
3676        endinpos = s-starts;
3677        if (unicode_decode_call_errorhandler(
3678                errors, &errorHandler,
3679                "utf7", errmsg,
3680                &starts, &e, &startinpos, &endinpos, &exc, &s,
3681                &unicode, &outpos, &p))
3682            goto onError;
3683    }
3684
3685    /* end of string */
3686
3687    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3688        /* if we're in an inconsistent state, that's an error */
3689        if (surrogate ||
3690                (base64bits >= 6) ||
3691                (base64bits > 0 && base64buffer != 0)) {
3692            outpos = p-PyUnicode_AS_UNICODE(unicode);
3693            endinpos = size;
3694            if (unicode_decode_call_errorhandler(
3695                    errors, &errorHandler,
3696                    "utf7", "unterminated shift sequence",
3697                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3698                    &unicode, &outpos, &p))
3699                goto onError;
3700            if (s < e)
3701                goto restart;
3702        }
3703    }
3704
3705    /* return state */
3706    if (consumed) {
3707        if (inShift) {
3708            p = shiftOutStart; /* back off output */
3709            *consumed = startinpos;
3710        }
3711        else {
3712            *consumed = s-starts;
3713        }
3714    }
3715
3716    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3717        goto onError;
3718
3719    Py_XDECREF(errorHandler);
3720    Py_XDECREF(exc);
3721#ifndef DONT_MAKE_RESULT_READY
3722    if (_PyUnicode_READY_REPLACE(&unicode)) {
3723        Py_DECREF(unicode);
3724        return NULL;
3725    }
3726#endif
3727    return (PyObject *)unicode;
3728
3729  onError:
3730    Py_XDECREF(errorHandler);
3731    Py_XDECREF(exc);
3732    Py_DECREF(unicode);
3733    return NULL;
3734}
3735
3736
3737PyObject *
3738PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3739                     Py_ssize_t size,
3740                     int base64SetO,
3741                     int base64WhiteSpace,
3742                     const char *errors)
3743{
3744    PyObject *v;
3745    /* It might be possible to tighten this worst case */
3746    Py_ssize_t allocated = 8 * size;
3747    int inShift = 0;
3748    Py_ssize_t i = 0;
3749    unsigned int base64bits = 0;
3750    unsigned long base64buffer = 0;
3751    char * out;
3752    char * start;
3753
3754    if (size == 0)
3755        return PyBytes_FromStringAndSize(NULL, 0);
3756
3757    if (allocated / 8 != size)
3758        return PyErr_NoMemory();
3759
3760    v = PyBytes_FromStringAndSize(NULL, allocated);
3761    if (v == NULL)
3762        return NULL;
3763
3764    start = out = PyBytes_AS_STRING(v);
3765    for (;i < size; ++i) {
3766        Py_UNICODE ch = s[i];
3767
3768        if (inShift) {
3769            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3770                /* shifting out */
3771                if (base64bits) { /* output remaining bits */
3772                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3773                    base64buffer = 0;
3774                    base64bits = 0;
3775                }
3776                inShift = 0;
3777                /* Characters not in the BASE64 set implicitly unshift the sequence
3778                   so no '-' is required, except if the character is itself a '-' */
3779                if (IS_BASE64(ch) || ch == '-') {
3780                    *out++ = '-';
3781                }
3782                *out++ = (char) ch;
3783            }
3784            else {
3785                goto encode_char;
3786            }
3787        }
3788        else { /* not in a shift sequence */
3789            if (ch == '+') {
3790                *out++ = '+';
3791                        *out++ = '-';
3792            }
3793            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3794                *out++ = (char) ch;
3795            }
3796            else {
3797                *out++ = '+';
3798                inShift = 1;
3799                goto encode_char;
3800            }
3801        }
3802        continue;
3803encode_char:
3804#ifdef Py_UNICODE_WIDE
3805        if (ch >= 0x10000) {
3806            /* code first surrogate */
3807            base64bits += 16;
3808            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3809            while (base64bits >= 6) {
3810                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3811                base64bits -= 6;
3812            }
3813            /* prepare second surrogate */
3814            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3815        }
3816#endif
3817        base64bits += 16;
3818        base64buffer = (base64buffer << 16) | ch;
3819        while (base64bits >= 6) {
3820            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3821            base64bits -= 6;
3822        }
3823    }
3824    if (base64bits)
3825        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3826    if (inShift)
3827        *out++ = '-';
3828    if (_PyBytes_Resize(&v, out - start) < 0)
3829        return NULL;
3830    return v;
3831}
3832
3833#undef IS_BASE64
3834#undef FROM_BASE64
3835#undef TO_BASE64
3836#undef DECODE_DIRECT
3837#undef ENCODE_DIRECT
3838
3839/* --- UTF-8 Codec -------------------------------------------------------- */
3840
/* Map a UTF-8 encoded prefix (lead) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix: continuation bytes
   80-BF, the overlong prefixes C0-C1 and F5-FF.  See RFC 3629 for details.
   The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3862
3863PyObject *
3864PyUnicode_DecodeUTF8(const char *s,
3865                     Py_ssize_t size,
3866                     const char *errors)
3867{
3868    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3869}
3870
3871/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3872#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3873
3874/* Mask to quickly check whether a C 'long' contains a
3875   non-ASCII, UTF8-encoded char. */
3876#if (SIZEOF_LONG == 8)
3877# define ASCII_CHAR_MASK 0x8080808080808080L
3878#elif (SIZEOF_LONG == 4)
3879# define ASCII_CHAR_MASK 0x80808080L
3880#else
3881# error C 'long' size should be either 4 or 8!
3882#endif
3883
3884/* Scans a UTF-8 string and returns the maximum character to be expected,
3885   the size of the decoded unicode string and if any major errors were
3886   encountered.
3887
3888   This function does check basic UTF-8 sanity, it does however NOT CHECK
3889   if the string contains surrogates, and if all continuation bytes are
3890   within the correct ranges, these checks are performed in
3891   PyUnicode_DecodeUTF8Stateful.
3892
3893   If it sets has_errors to 1, it means the value of unicode_size and max_char
3894   will be bogus and you should not rely on useful information in them.
3895   */
3896static Py_UCS4
3897utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3898                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3899                                  int *has_errors)
3900{
3901    Py_ssize_t n;
3902    Py_ssize_t char_count = 0;
3903    Py_UCS4 max_char = 127, new_max;
3904    Py_UCS4 upper_bound;
3905    const unsigned char *p = (const unsigned char *)s;
3906    const unsigned char *end = p + string_size;
3907    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3908    int err = 0;
3909
3910    for (; p < end && !err; ++p, ++char_count) {
3911        /* Only check value if it's not a ASCII char... */
3912        if (*p < 0x80) {
3913            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3914               an explanation. */
3915            if (!((size_t) p & LONG_PTR_MASK)) {
3916                /* Help register allocation */
3917                register const unsigned char *_p = p;
3918                while (_p < aligned_end) {
3919                    unsigned long value = *(unsigned long *) _p;
3920                    if (value & ASCII_CHAR_MASK)
3921                        break;
3922                    _p += SIZEOF_LONG;
3923                    char_count += SIZEOF_LONG;
3924                }
3925                p = _p;
3926                if (p == end)
3927                    break;
3928            }
3929        }
3930        if (*p >= 0x80) {
3931            n = utf8_code_length[*p];
3932            new_max = max_char;
3933            switch (n) {
3934            /* invalid start byte */
3935            case 0:
3936                err = 1;
3937                break;
3938            case 2:
3939                /* Code points between 0x00FF and 0x07FF inclusive.
3940                   Approximate the upper bound of the code point,
3941                   if this flips over 255 we can be sure it will be more
3942                   than 255 and the string will need 2 bytes per code coint,
3943                   if it stays under or equal to 255, we can be sure 1 byte
3944                   is enough.
3945                   ((*p & 0b00011111) << 6) | 0b00111111 */
3946                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3947                if (max_char < upper_bound)
3948                    new_max = upper_bound;
3949                /* Ensure we track at least that we left ASCII space. */
3950                if (new_max < 128)
3951                    new_max = 128;
3952                break;
3953            case 3:
3954                /* Between 0x0FFF and 0xFFFF inclusive, so values are
3955                   always > 255 and <= 65535 and will always need 2 bytes. */
3956                if (max_char < 65535)
3957                    new_max = 65535;
3958                break;
3959            case 4:
3960                /* Code point will be above 0xFFFF for sure in this case. */
3961                new_max = 65537;
3962                break;
3963            /* Internal error, this should be caught by the first if */
3964            case 1:
3965            default:
3966                assert(0 && "Impossible case in utf8_max_char_and_size");
3967                err = 1;
3968            }
3969            /* Instead of number of overall bytes for this code point,
3970               n contains the number of following bytes: */
3971            --n;
3972            /* Check if the follow up chars are all valid continuation bytes */
3973            if (n >= 1) {
3974                const unsigned char *cont;
3975                if ((p + n) >= end) {
3976                    if (consumed == 0)
3977                        /* incomplete data, non-incremental decoding */
3978                        err = 1;
3979                    break;
3980                }
3981                for (cont = p + 1; cont < (p + n); ++cont) {
3982                    if ((*cont & 0xc0) != 0x80) {
3983                        err = 1;
3984                        break;
3985                    }
3986                }
3987                p += n;
3988            }
3989            else
3990                err = 1;
3991            max_char = new_max;
3992        }
3993    }
3994
3995    if (unicode_size)
3996        *unicode_size = char_count;
3997    if (has_errors)
3998        *has_errors = err;
3999    return max_char;
4000}
4001
/* Store `value` at position `index` of the character buffer `buf`, choosing
   the element type from `kind`.  Similar to PyUnicode_WRITE, but it can also
   write into the wstr field of the legacy unicode representation
   (PyUnicode_WCHAR_KIND, elements of type Py_UNICODE).  A kind that is none
   of the WCHAR, 1BYTE or 2BYTE kinds is stored as Py_UCS4.  Only the
   selected store is executed, so each argument is evaluated once. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int k_ = (kind); \
        switch (k_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4016
4017PyObject *
4018PyUnicode_DecodeUTF8Stateful(const char *s,
4019                             Py_ssize_t size,
4020                             const char *errors,
4021                             Py_ssize_t *consumed)
4022{
4023    const char *starts = s;
4024    int n;
4025    int k;
4026    Py_ssize_t startinpos;
4027    Py_ssize_t endinpos;
4028    const char *e, *aligned_end;
4029    PyUnicodeObject *unicode;
4030    const char *errmsg = "";
4031    PyObject *errorHandler = NULL;
4032    PyObject *exc = NULL;
4033    Py_UCS4 maxchar = 0;
4034    Py_ssize_t unicode_size;
4035    Py_ssize_t i;
4036    int kind;
4037    void *data;
4038    int has_errors;
4039    Py_UNICODE *error_outptr;
4040#if SIZEOF_WCHAR_T == 2
4041    Py_ssize_t wchar_offset = 0;
4042#endif
4043
4044    if (size == 0) {
4045        if (consumed)
4046            *consumed = 0;
4047        return (PyObject *)PyUnicode_New(0, 0);
4048    }
4049    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4050                                                consumed, &has_errors);
4051    if (has_errors) {
4052        unicode = _PyUnicode_New(size);
4053        if (!unicode)
4054            return NULL;
4055        kind = PyUnicode_WCHAR_KIND;
4056        data = PyUnicode_AS_UNICODE(unicode);
4057        assert(data != NULL);
4058    }
4059    else {
4060        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4061        if (!unicode)
4062            return NULL;
4063        /* When the string is ASCII only, just use memcpy and return.
4064           unicode_size may be != size if there is an incomplete UTF-8
4065           sequence at the end of the ASCII block.  */
4066        if (maxchar < 128 && size == unicode_size) {
4067            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4068            return (PyObject *)unicode;
4069        }
4070        kind = PyUnicode_KIND(unicode);
4071        data = PyUnicode_DATA(unicode);
4072    }
4073    /* Unpack UTF-8 encoded data */
4074    i = 0;
4075    e = s + size;
4076    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4077
4078    while (s < e) {
4079        Py_UCS4 ch = (unsigned char)*s;
4080
4081        if (ch < 0x80) {
4082            /* Fast path for runs of ASCII characters. Given that common UTF-8
4083               input will consist of an overwhelming majority of ASCII
4084               characters, we try to optimize for this case by checking
4085               as many characters as a C 'long' can contain.
4086               First, check if we can do an aligned read, as most CPUs have
4087               a penalty for unaligned reads.
4088            */
4089            if (!((size_t) s & LONG_PTR_MASK)) {
4090                /* Help register allocation */
4091                register const char *_s = s;
4092                register Py_ssize_t _i = i;
4093                while (_s < aligned_end) {
4094                    /* Read a whole long at a time (either 4 or 8 bytes),
4095                       and do a fast unrolled copy if it only contains ASCII
4096                       characters. */
4097                    unsigned long value = *(unsigned long *) _s;
4098                    if (value & ASCII_CHAR_MASK)
4099                        break;
4100                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4101                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4102                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4103                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4104#if (SIZEOF_LONG == 8)
4105                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4106                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4107                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4108                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4109#endif
4110                    _s += SIZEOF_LONG;
4111                    _i += SIZEOF_LONG;
4112                }
4113                s = _s;
4114                i = _i;
4115                if (s == e)
4116                    break;
4117                ch = (unsigned char)*s;
4118            }
4119        }
4120
4121        if (ch < 0x80) {
4122            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4123            s++;
4124            continue;
4125        }
4126
4127        n = utf8_code_length[ch];
4128
4129        if (s + n > e) {
4130            if (consumed)
4131                break;
4132            else {
4133                errmsg = "unexpected end of data";
4134                startinpos = s-starts;
4135                endinpos = startinpos+1;
4136                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4137                    endinpos++;
4138                goto utf8Error;
4139            }
4140        }
4141
4142        switch (n) {
4143
4144        case 0:
4145            errmsg = "invalid start byte";
4146            startinpos = s-starts;
4147            endinpos = startinpos+1;
4148            goto utf8Error;
4149
4150        case 1:
4151            errmsg = "internal error";
4152            startinpos = s-starts;
4153            endinpos = startinpos+1;
4154            goto utf8Error;
4155
4156        case 2:
4157            if ((s[1] & 0xc0) != 0x80) {
4158                errmsg = "invalid continuation byte";
4159                startinpos = s-starts;
4160                endinpos = startinpos + 1;
4161                goto utf8Error;
4162            }
4163            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4164            assert ((ch > 0x007F) && (ch <= 0x07FF));
4165            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4166            break;
4167
4168        case 3:
4169            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4170               will result in surrogates in range d800-dfff. Surrogates are
4171               not valid UTF-8 so they are rejected.
4172               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4173               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4174            if ((s[1] & 0xc0) != 0x80 ||
4175                (s[2] & 0xc0) != 0x80 ||
4176                ((unsigned char)s[0] == 0xE0 &&
4177                 (unsigned char)s[1] < 0xA0) ||
4178                ((unsigned char)s[0] == 0xED &&
4179                 (unsigned char)s[1] > 0x9F)) {
4180                errmsg = "invalid continuation byte";
4181                startinpos = s-starts;
4182                endinpos = startinpos + 1;
4183
4184                /* if s[1] first two bits are 1 and 0, then the invalid
4185                   continuation byte is s[2], so increment endinpos by 1,
4186                   if not, s[1] is invalid and endinpos doesn't need to
4187                   be incremented. */
4188                if ((s[1] & 0xC0) == 0x80)
4189                    endinpos++;
4190                goto utf8Error;
4191            }
4192            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4193            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4194            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4195            break;
4196
4197        case 4:
4198            if ((s[1] & 0xc0) != 0x80 ||
4199                (s[2] & 0xc0) != 0x80 ||
4200                (s[3] & 0xc0) != 0x80 ||
4201                ((unsigned char)s[0] == 0xF0 &&
4202                 (unsigned char)s[1] < 0x90) ||
4203                ((unsigned char)s[0] == 0xF4 &&
4204                 (unsigned char)s[1] > 0x8F)) {
4205                errmsg = "invalid continuation byte";
4206                startinpos = s-starts;
4207                endinpos = startinpos + 1;
4208                if ((s[1] & 0xC0) == 0x80) {
4209                    endinpos++;
4210                    if ((s[2] & 0xC0) == 0x80)
4211                        endinpos++;
4212                }
4213                goto utf8Error;
4214            }
4215            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4216                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4217            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4218
4219            /* If the string is flexible or we have native UCS-4, write
4220               directly.. */
4221            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4222                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4223
4224            else {
4225                /* compute and append the two surrogates: */
4226
4227                /* translate from 10000..10FFFF to 0..FFFF */
4228                ch -= 0x10000;
4229
4230                /* high surrogate = top 10 bits added to D800 */
4231                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4232                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4233
4234                /* low surrogate = bottom 10 bits added to DC00 */
4235                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4236                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4237            }
4238#if SIZEOF_WCHAR_T == 2
4239            wchar_offset++;
4240#endif
4241            break;
4242        }
4243        s += n;
4244        continue;
4245
4246      utf8Error:
4247        /* If this is not yet a resizable string, make it one.. */
4248        if (kind != PyUnicode_WCHAR_KIND) {
4249            const Py_UNICODE *u;
4250            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4251            if (!new_unicode)
4252                goto onError;
4253            u = PyUnicode_AsUnicode((PyObject *)unicode);
4254            if (!u)
4255                goto onError;
4256#if SIZEOF_WCHAR_T == 2
4257            i += wchar_offset;
4258#endif
4259            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4260            Py_DECREF(unicode);
4261            unicode = new_unicode;
4262            kind = 0;
4263            data = PyUnicode_AS_UNICODE(new_unicode);
4264            assert(data != NULL);
4265        }
4266        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4267        if (unicode_decode_call_errorhandler(
4268                errors, &errorHandler,
4269                "utf8", errmsg,
4270                &starts, &e, &startinpos, &endinpos, &exc, &s,
4271                &unicode, &i, &error_outptr))
4272            goto onError;
4273        /* Update data because unicode_decode_call_errorhandler might have
4274           re-created or resized the unicode object. */
4275        data = PyUnicode_AS_UNICODE(unicode);
4276        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4277    }
4278    /* Ensure the unicode_size calculation above was correct: */
4279    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4280
4281    if (consumed)
4282        *consumed = s-starts;
4283
4284    /* Adjust length and ready string when it contained errors and
4285       is of the old resizable kind. */
4286    if (kind == PyUnicode_WCHAR_KIND) {
4287        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4288            goto onError;
4289    }
4290
4291    Py_XDECREF(errorHandler);
4292    Py_XDECREF(exc);
4293#ifndef DONT_MAKE_RESULT_READY
4294    if (_PyUnicode_READY_REPLACE(&unicode)) {
4295        Py_DECREF(unicode);
4296        return NULL;
4297    }
4298#endif
4299    return (PyObject *)unicode;
4300
4301  onError:
4302    Py_XDECREF(errorHandler);
4303    Py_XDECREF(exc);
4304    Py_DECREF(unicode);
4305    return NULL;
4306}
4307
4308#undef WRITE_FLEXIBLE_OR_WSTR
4309
4310#ifdef __APPLE__
4311
4312/* Simplified UTF-8 decoder using surrogateescape error handler,
4313   used to decode the command line arguments on Mac OS X. */
4314
4315wchar_t*
4316_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4317{
4318    int n;
4319    const char *e;
4320    wchar_t *unicode, *p;
4321
4322    /* Note: size will always be longer than the resulting Unicode
4323       character count */
4324    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
4325        PyErr_NoMemory();
4326        return NULL;
4327    }
4328    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4329    if (!unicode)
4330        return NULL;
4331
4332    /* Unpack UTF-8 encoded data */
4333    p = unicode;
4334    e = s + size;
4335    while (s < e) {
4336        Py_UCS4 ch = (unsigned char)*s;
4337
4338        if (ch < 0x80) {
4339            *p++ = (wchar_t)ch;
4340            s++;
4341            continue;
4342        }
4343
4344        n = utf8_code_length[ch];
4345        if (s + n > e) {
4346            goto surrogateescape;
4347        }
4348
4349        switch (n) {
4350        case 0:
4351        case 1:
4352            goto surrogateescape;
4353
4354        case 2:
4355            if ((s[1] & 0xc0) != 0x80)
4356                goto surrogateescape;
4357            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4358            assert ((ch > 0x007F) && (ch <= 0x07FF));
4359            *p++ = (wchar_t)ch;
4360            break;
4361
4362        case 3:
4363            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4364               will result in surrogates in range d800-dfff. Surrogates are
4365               not valid UTF-8 so they are rejected.
4366               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4367               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4368            if ((s[1] & 0xc0) != 0x80 ||
4369                (s[2] & 0xc0) != 0x80 ||
4370                ((unsigned char)s[0] == 0xE0 &&
4371                 (unsigned char)s[1] < 0xA0) ||
4372                ((unsigned char)s[0] == 0xED &&
4373                 (unsigned char)s[1] > 0x9F)) {
4374
4375                goto surrogateescape;
4376            }
4377            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4378            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4379            *p++ = (wchar_t)ch;
4380            break;
4381
4382        case 4:
4383            if ((s[1] & 0xc0) != 0x80 ||
4384                (s[2] & 0xc0) != 0x80 ||
4385                (s[3] & 0xc0) != 0x80 ||
4386                ((unsigned char)s[0] == 0xF0 &&
4387                 (unsigned char)s[1] < 0x90) ||
4388                ((unsigned char)s[0] == 0xF4 &&
4389                 (unsigned char)s[1] > 0x8F)) {
4390                goto surrogateescape;
4391            }
4392            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4393                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4394            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4395
4396#if SIZEOF_WCHAR_T == 4
4397            *p++ = (wchar_t)ch;
4398#else
4399            /*  compute and append the two surrogates: */
4400
4401            /*  translate from 10000..10FFFF to 0..FFFF */
4402            ch -= 0x10000;
4403
4404            /*  high surrogate = top 10 bits added to D800 */
4405            *p++ = (wchar_t)(0xD800 + (ch >> 10));
4406
4407            /*  low surrogate = bottom 10 bits added to DC00 */
4408            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
4409#endif
4410            break;
4411        }
4412        s += n;
4413        continue;
4414
4415      surrogateescape:
4416        *p++ = 0xDC00 + ch;
4417        s++;
4418    }
4419    *p = L'\0';
4420    return unicode;
4421}
4422
4423#endif /* __APPLE__ */
4424
4425/* Primary internal function which creates utf8 encoded bytes objects.
4426
4427   Allocation strategy:  if the string is short, convert into a stack buffer
4428   and allocate exactly as much space needed at the end.  Else allocate the
4429   maximum possible needed (4 result bytes per Unicode character), and return
4430   the excess memory at the end.
4431*/
4432PyObject *
4433_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4434{
4435#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4436
4437    Py_ssize_t i;                /* index into s of next input byte */
4438    PyObject *result;            /* result string object */
4439    char *p;                     /* next free byte in output buffer */
4440    Py_ssize_t nallocated;      /* number of result bytes allocated */
4441    Py_ssize_t nneeded;            /* number of result bytes needed */
4442    char stackbuf[MAX_SHORT_UNICHARS * 4];
4443    PyObject *errorHandler = NULL;
4444    PyObject *exc = NULL;
4445    int kind;
4446    void *data;
4447    Py_ssize_t size;
4448    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4449#if SIZEOF_WCHAR_T == 2
4450    Py_ssize_t wchar_offset = 0;
4451#endif
4452
4453    if (!PyUnicode_Check(unicode)) {
4454        PyErr_BadArgument();
4455        return NULL;
4456    }
4457
4458    if (PyUnicode_READY(unicode) == -1)
4459        return NULL;
4460
4461    if (PyUnicode_UTF8(unicode))
4462        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4463                                         PyUnicode_UTF8_LENGTH(unicode));
4464
4465    kind = PyUnicode_KIND(unicode);
4466    data = PyUnicode_DATA(unicode);
4467    size = PyUnicode_GET_LENGTH(unicode);
4468
4469    assert(size >= 0);
4470
4471    if (size <= MAX_SHORT_UNICHARS) {
4472        /* Write into the stack buffer; nallocated can't overflow.
4473         * At the end, we'll allocate exactly as much heap space as it
4474         * turns out we need.
4475         */
4476        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4477        result = NULL;   /* will allocate after we're done */
4478        p = stackbuf;
4479    }
4480    else {
4481        /* Overallocate on the heap, and give the excess back at the end. */
4482        nallocated = size * 4;
4483        if (nallocated / 4 != size)  /* overflow! */
4484            return PyErr_NoMemory();
4485        result = PyBytes_FromStringAndSize(NULL, nallocated);
4486        if (result == NULL)
4487            return NULL;
4488        p = PyBytes_AS_STRING(result);
4489    }
4490
4491    for (i = 0; i < size;) {
4492        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4493
4494        if (ch < 0x80)
4495            /* Encode ASCII */
4496            *p++ = (char) ch;
4497
4498        else if (ch < 0x0800) {
4499            /* Encode Latin-1 */
4500            *p++ = (char)(0xc0 | (ch >> 6));
4501            *p++ = (char)(0x80 | (ch & 0x3f));
4502        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4503            Py_ssize_t newpos;
4504            PyObject *rep;
4505            Py_ssize_t repsize, k, startpos;
4506            startpos = i-1;
4507#if SIZEOF_WCHAR_T == 2
4508            startpos += wchar_offset;
4509#endif
4510            rep = unicode_encode_call_errorhandler(
4511                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4512                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4513                  &exc, startpos, startpos+1, &newpos);
4514            if (!rep)
4515                goto error;
4516
4517            if (PyBytes_Check(rep))
4518                repsize = PyBytes_GET_SIZE(rep);
4519            else
4520                repsize = PyUnicode_GET_SIZE(rep);
4521
4522            if (repsize > 4) {
4523                Py_ssize_t offset;
4524
4525                if (result == NULL)
4526                    offset = p - stackbuf;
4527                else
4528                    offset = p - PyBytes_AS_STRING(result);
4529
4530                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4531                    /* integer overflow */
4532                    PyErr_NoMemory();
4533                    goto error;
4534                }
4535                nallocated += repsize - 4;
4536                if (result != NULL) {
4537                    if (_PyBytes_Resize(&result, nallocated) < 0)
4538                        goto error;
4539                } else {
4540                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4541                    if (result == NULL)
4542                        goto error;
4543                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4544                }
4545                p = PyBytes_AS_STRING(result) + offset;
4546            }
4547
4548            if (PyBytes_Check(rep)) {
4549                char *prep = PyBytes_AS_STRING(rep);
4550                for(k = repsize; k > 0; k--)
4551                    *p++ = *prep++;
4552            } else /* rep is unicode */ {
4553                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4554                Py_UNICODE c;
4555
4556                for(k=0; k<repsize; k++) {
4557                    c = prep[k];
4558                    if (0x80 <= c) {
4559                        raise_encode_exception(&exc, "utf-8",
4560                                               PyUnicode_AS_UNICODE(unicode),
4561                                               size, i-1, i,
4562                                               "surrogates not allowed");
4563                        goto error;
4564                    }
4565                    *p++ = (char)prep[k];
4566                }
4567            }
4568            Py_DECREF(rep);
4569        } else if (ch < 0x10000) {
4570            *p++ = (char)(0xe0 | (ch >> 12));
4571            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4572            *p++ = (char)(0x80 | (ch & 0x3f));
4573        } else /* ch >= 0x10000 */ {
4574            /* Encode UCS4 Unicode ordinals */
4575            *p++ = (char)(0xf0 | (ch >> 18));
4576            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4577            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4578            *p++ = (char)(0x80 | (ch & 0x3f));
4579#if SIZEOF_WCHAR_T == 2
4580            wchar_offset++;
4581#endif
4582        }
4583    }
4584
4585    if (result == NULL) {
4586        /* This was stack allocated. */
4587        nneeded = p - stackbuf;
4588        assert(nneeded <= nallocated);
4589        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4590    }
4591    else {
4592        /* Cut back to size actually needed. */
4593        nneeded = p - PyBytes_AS_STRING(result);
4594        assert(nneeded <= nallocated);
4595        _PyBytes_Resize(&result, nneeded);
4596    }
4597
4598    Py_XDECREF(errorHandler);
4599    Py_XDECREF(exc);
4600    return result;
4601 error:
4602    Py_XDECREF(errorHandler);
4603    Py_XDECREF(exc);
4604    Py_XDECREF(result);
4605    return NULL;
4606
4607#undef MAX_SHORT_UNICHARS
4608}
4609
4610PyObject *
4611PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4612                     Py_ssize_t size,
4613                     const char *errors)
4614{
4615    PyObject *v, *unicode;
4616
4617    unicode = PyUnicode_FromUnicode(s, size);
4618    if (unicode == NULL)
4619        return NULL;
4620    v = _PyUnicode_AsUTF8String(unicode, errors);
4621    Py_DECREF(unicode);
4622    return v;
4623}
4624
4625PyObject *
4626PyUnicode_AsUTF8String(PyObject *unicode)
4627{
4628    return _PyUnicode_AsUTF8String(unicode, NULL);
4629}
4630
4631/* --- UTF-32 Codec ------------------------------------------------------- */
4632
4633PyObject *
4634PyUnicode_DecodeUTF32(const char *s,
4635                      Py_ssize_t size,
4636                      const char *errors,
4637                      int *byteorder)
4638{
4639    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4640}
4641
4642PyObject *
4643PyUnicode_DecodeUTF32Stateful(const char *s,
4644                              Py_ssize_t size,
4645                              const char *errors,
4646                              int *byteorder,
4647                              Py_ssize_t *consumed)
4648{
4649    const char *starts = s;
4650    Py_ssize_t startinpos;
4651    Py_ssize_t endinpos;
4652    Py_ssize_t outpos;
4653    PyUnicodeObject *unicode;
4654    Py_UNICODE *p;
4655#ifndef Py_UNICODE_WIDE
4656    int pairs = 0;
4657    const unsigned char *qq;
4658#else
4659    const int pairs = 0;
4660#endif
4661    const unsigned char *q, *e;
4662    int bo = 0;       /* assume native ordering by default */
4663    const char *errmsg = "";
4664    /* Offsets from q for retrieving bytes in the right order. */
4665#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4666    int iorder[] = {0, 1, 2, 3};
4667#else
4668    int iorder[] = {3, 2, 1, 0};
4669#endif
4670    PyObject *errorHandler = NULL;
4671    PyObject *exc = NULL;
4672
4673    q = (unsigned char *)s;
4674    e = q + size;
4675
4676    if (byteorder)
4677        bo = *byteorder;
4678
4679    /* Check for BOM marks (U+FEFF) in the input and adjust current
4680       byte order setting accordingly. In native mode, the leading BOM
4681       mark is skipped, in all other modes, it is copied to the output
4682       stream as-is (giving a ZWNBSP character). */
4683    if (bo == 0) {
4684        if (size >= 4) {
4685            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4686                (q[iorder[1]] << 8) | q[iorder[0]];
4687#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4688            if (bom == 0x0000FEFF) {
4689                q += 4;
4690                bo = -1;
4691            }
4692            else if (bom == 0xFFFE0000) {
4693                q += 4;
4694                bo = 1;
4695            }
4696#else
4697            if (bom == 0x0000FEFF) {
4698                q += 4;
4699                bo = 1;
4700            }
4701            else if (bom == 0xFFFE0000) {
4702                q += 4;
4703                bo = -1;
4704            }
4705#endif
4706        }
4707    }
4708
4709    if (bo == -1) {
4710        /* force LE */
4711        iorder[0] = 0;
4712        iorder[1] = 1;
4713        iorder[2] = 2;
4714        iorder[3] = 3;
4715    }
4716    else if (bo == 1) {
4717        /* force BE */
4718        iorder[0] = 3;
4719        iorder[1] = 2;
4720        iorder[2] = 1;
4721        iorder[3] = 0;
4722    }
4723
4724    /* On narrow builds we split characters outside the BMP into two
4725       codepoints => count how much extra space we need. */
4726#ifndef Py_UNICODE_WIDE
4727    for (qq = q; qq < e; qq += 4)
4728        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4729            pairs++;
4730#endif
4731
4732    /* This might be one to much, because of a BOM */
4733    unicode = _PyUnicode_New((size+3)/4+pairs);
4734    if (!unicode)
4735        return NULL;
4736    if (size == 0)
4737        return (PyObject *)unicode;
4738
4739    /* Unpack UTF-32 encoded data */
4740    p = PyUnicode_AS_UNICODE(unicode);
4741
4742    while (q < e) {
4743        Py_UCS4 ch;
4744        /* remaining bytes at the end? (size should be divisible by 4) */
4745        if (e-q<4) {
4746            if (consumed)
4747                break;
4748            errmsg = "truncated data";
4749            startinpos = ((const char *)q)-starts;
4750            endinpos = ((const char *)e)-starts;
4751            goto utf32Error;
4752            /* The remaining input chars are ignored if the callback
4753               chooses to skip the input */
4754        }
4755        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4756            (q[iorder[1]] << 8) | q[iorder[0]];
4757
4758        if (ch >= 0x110000)
4759        {
4760            errmsg = "codepoint not in range(0x110000)";
4761            startinpos = ((const char *)q)-starts;
4762            endinpos = startinpos+4;
4763            goto utf32Error;
4764        }
4765#ifndef Py_UNICODE_WIDE
4766        if (ch >= 0x10000)
4767        {
4768            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4769            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4770        }
4771        else
4772#endif
4773            *p++ = ch;
4774        q += 4;
4775        continue;
4776      utf32Error:
4777        outpos = p-PyUnicode_AS_UNICODE(unicode);
4778        if (unicode_decode_call_errorhandler(
4779                errors, &errorHandler,
4780                "utf32", errmsg,
4781                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4782                &unicode, &outpos, &p))
4783            goto onError;
4784    }
4785
4786    if (byteorder)
4787        *byteorder = bo;
4788
4789    if (consumed)
4790        *consumed = (const char *)q-starts;
4791
4792    /* Adjust length */
4793    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4794        goto onError;
4795
4796    Py_XDECREF(errorHandler);
4797    Py_XDECREF(exc);
4798#ifndef DONT_MAKE_RESULT_READY
4799    if (_PyUnicode_READY_REPLACE(&unicode)) {
4800        Py_DECREF(unicode);
4801        return NULL;
4802    }
4803#endif
4804    return (PyObject *)unicode;
4805
4806  onError:
4807    Py_DECREF(unicode);
4808    Py_XDECREF(errorHandler);
4809    Py_XDECREF(exc);
4810    return NULL;
4811}
4812
4813PyObject *
4814PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4815                      Py_ssize_t size,
4816                      const char *errors,
4817                      int byteorder)
4818{
4819    PyObject *v;
4820    unsigned char *p;
4821    Py_ssize_t nsize, bytesize;
4822#ifndef Py_UNICODE_WIDE
4823    Py_ssize_t i, pairs;
4824#else
4825    const int pairs = 0;
4826#endif
4827    /* Offsets from p for storing byte pairs in the right order. */
4828#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4829    int iorder[] = {0, 1, 2, 3};
4830#else
4831    int iorder[] = {3, 2, 1, 0};
4832#endif
4833
4834#define STORECHAR(CH)                           \
4835    do {                                        \
4836        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4837        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4838        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4839        p[iorder[0]] = (CH) & 0xff;             \
4840        p += 4;                                 \
4841    } while(0)
4842
4843    /* In narrow builds we can output surrogate pairs as one codepoint,
4844       so we need less space. */
4845#ifndef Py_UNICODE_WIDE
4846    for (i = pairs = 0; i < size-1; i++)
4847        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4848            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4849            pairs++;
4850#endif
4851    nsize = (size - pairs + (byteorder == 0));
4852    bytesize = nsize * 4;
4853    if (bytesize / 4 != nsize)
4854        return PyErr_NoMemory();
4855    v = PyBytes_FromStringAndSize(NULL, bytesize);
4856    if (v == NULL)
4857        return NULL;
4858
4859    p = (unsigned char *)PyBytes_AS_STRING(v);
4860    if (byteorder == 0)
4861        STORECHAR(0xFEFF);
4862    if (size == 0)
4863        goto done;
4864
4865    if (byteorder == -1) {
4866        /* force LE */
4867        iorder[0] = 0;
4868        iorder[1] = 1;
4869        iorder[2] = 2;
4870        iorder[3] = 3;
4871    }
4872    else if (byteorder == 1) {
4873        /* force BE */
4874        iorder[0] = 3;
4875        iorder[1] = 2;
4876        iorder[2] = 1;
4877        iorder[3] = 0;
4878    }
4879
4880    while (size-- > 0) {
4881        Py_UCS4 ch = *s++;
4882#ifndef Py_UNICODE_WIDE
4883        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4884            Py_UCS4 ch2 = *s;
4885            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4886                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4887                s++;
4888                size--;
4889            }
4890        }
4891#endif
4892        STORECHAR(ch);
4893    }
4894
4895  done:
4896    return v;
4897#undef STORECHAR
4898}
4899
4900PyObject *
4901PyUnicode_AsUTF32String(PyObject *unicode)
4902{
4903    if (!PyUnicode_Check(unicode)) {
4904        PyErr_BadArgument();
4905        return NULL;
4906    }
4907    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
4908                                 PyUnicode_GET_SIZE(unicode),
4909                                 NULL,
4910                                 0);
4911}
4912
4913/* --- UTF-16 Codec ------------------------------------------------------- */
4914
4915PyObject *
4916PyUnicode_DecodeUTF16(const char *s,
4917                      Py_ssize_t size,
4918                      const char *errors,
4919                      int *byteorder)
4920{
4921    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4922}
4923
/* Two masks for fast checking of whether a C 'long' may contain
   UTF16-encoded surrogate characters. This is an efficient heuristic,
   assuming that non-surrogate characters with a code point >= 0x8000 are
   rare in most input.
   FAST_CHAR_MASK is used when the input is in native byte ordering,
   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.

   Each mask has exactly one bit set per 16-bit unit of the 'long':
   0x8000 (the top bit of the unit) for native order, and 0x0080 (the top
   bit of the byte that becomes the high byte once swapped) for swapped
   order.  The UTF-16 decoder ANDs a whole 'long' of input with the mask;
   a non-zero result makes it leave its fast copy loop for that word.
   Only 4-byte and 8-byte 'long' are supported; any other size is a
   compile-time error.
*/
#if (SIZEOF_LONG == 8)
# define FAST_CHAR_MASK         0x8000800080008000L
# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
#elif (SIZEOF_LONG == 4)
# define FAST_CHAR_MASK         0x80008000L
# define SWAPPED_FAST_CHAR_MASK 0x00800080L
#else
# error C 'long' size should be either 4 or 8!
#endif
4940
4941PyObject *
4942PyUnicode_DecodeUTF16Stateful(const char *s,
4943                              Py_ssize_t size,
4944                              const char *errors,
4945                              int *byteorder,
4946                              Py_ssize_t *consumed)
4947{
4948    const char *starts = s;
4949    Py_ssize_t startinpos;
4950    Py_ssize_t endinpos;
4951    Py_ssize_t outpos;
4952    PyUnicodeObject *unicode;
4953    Py_UNICODE *p;
4954    const unsigned char *q, *e, *aligned_end;
4955    int bo = 0;       /* assume native ordering by default */
4956    int native_ordering = 0;
4957    const char *errmsg = "";
4958    /* Offsets from q for retrieving byte pairs in the right order. */
4959#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4960    int ihi = 1, ilo = 0;
4961#else
4962    int ihi = 0, ilo = 1;
4963#endif
4964    PyObject *errorHandler = NULL;
4965    PyObject *exc = NULL;
4966
4967    /* Note: size will always be longer than the resulting Unicode
4968       character count */
4969    unicode = _PyUnicode_New(size);
4970    if (!unicode)
4971        return NULL;
4972    if (size == 0)
4973        return (PyObject *)unicode;
4974
4975    /* Unpack UTF-16 encoded data */
4976    p = PyUnicode_AS_UNICODE(unicode);
4977    q = (unsigned char *)s;
4978    e = q + size - 1;
4979
4980    if (byteorder)
4981        bo = *byteorder;
4982
4983    /* Check for BOM marks (U+FEFF) in the input and adjust current
4984       byte order setting accordingly. In native mode, the leading BOM
4985       mark is skipped, in all other modes, it is copied to the output
4986       stream as-is (giving a ZWNBSP character). */
4987    if (bo == 0) {
4988        if (size >= 2) {
4989            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
4990#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4991            if (bom == 0xFEFF) {
4992                q += 2;
4993                bo = -1;
4994            }
4995            else if (bom == 0xFFFE) {
4996                q += 2;
4997                bo = 1;
4998            }
4999#else
5000            if (bom == 0xFEFF) {
5001                q += 2;
5002                bo = 1;
5003            }
5004            else if (bom == 0xFFFE) {
5005                q += 2;
5006                bo = -1;
5007            }
5008#endif
5009        }
5010    }
5011
5012    if (bo == -1) {
5013        /* force LE */
5014        ihi = 1;
5015        ilo = 0;
5016    }
5017    else if (bo == 1) {
5018        /* force BE */
5019        ihi = 0;
5020        ilo = 1;
5021    }
5022#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5023    native_ordering = ilo < ihi;
5024#else
5025    native_ordering = ilo > ihi;
5026#endif
5027
5028    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5029    while (q < e) {
5030        Py_UNICODE ch;
5031        /* First check for possible aligned read of a C 'long'. Unaligned
5032           reads are more expensive, better to defer to another iteration. */
5033        if (!((size_t) q & LONG_PTR_MASK)) {
5034            /* Fast path for runs of non-surrogate chars. */
5035            register const unsigned char *_q = q;
5036            Py_UNICODE *_p = p;
5037            if (native_ordering) {
5038                /* Native ordering is simple: as long as the input cannot
5039                   possibly contain a surrogate char, do an unrolled copy
5040                   of several 16-bit code points to the target object.
5041                   The non-surrogate check is done on several input bytes
5042                   at a time (as many as a C 'long' can contain). */
5043                while (_q < aligned_end) {
5044                    unsigned long data = * (unsigned long *) _q;
5045                    if (data & FAST_CHAR_MASK)
5046                        break;
5047                    _p[0] = ((unsigned short *) _q)[0];
5048                    _p[1] = ((unsigned short *) _q)[1];
5049#if (SIZEOF_LONG == 8)
5050                    _p[2] = ((unsigned short *) _q)[2];
5051                    _p[3] = ((unsigned short *) _q)[3];
5052#endif
5053                    _q += SIZEOF_LONG;
5054                    _p += SIZEOF_LONG / 2;
5055                }
5056            }
5057            else {
5058                /* Byteswapped ordering is similar, but we must decompose
5059                   the copy bytewise, and take care of zero'ing out the
5060                   upper bytes if the target object is in 32-bit units
5061                   (that is, in UCS-4 builds). */
5062                while (_q < aligned_end) {
5063                    unsigned long data = * (unsigned long *) _q;
5064                    if (data & SWAPPED_FAST_CHAR_MASK)
5065                        break;
5066                    /* Zero upper bytes in UCS-4 builds */
5067#if (Py_UNICODE_SIZE > 2)
5068                    _p[0] = 0;
5069                    _p[1] = 0;
5070#if (SIZEOF_LONG == 8)
5071                    _p[2] = 0;
5072                    _p[3] = 0;
5073#endif
5074#endif
5075                    /* Issue #4916; UCS-4 builds on big endian machines must
5076                       fill the two last bytes of each 4-byte unit. */
5077#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5078# define OFF 2
5079#else
5080# define OFF 0
5081#endif
5082                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5083                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5084                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5085                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5086#if (SIZEOF_LONG == 8)
5087                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5088                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5089                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5090                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5091#endif
5092#undef OFF
5093                    _q += SIZEOF_LONG;
5094                    _p += SIZEOF_LONG / 2;
5095                }
5096            }
5097            p = _p;
5098            q = _q;
5099            if (q >= e)
5100                break;
5101        }
5102        ch = (q[ihi] << 8) | q[ilo];
5103
5104        q += 2;
5105
5106        if (ch < 0xD800 || ch > 0xDFFF) {
5107            *p++ = ch;
5108            continue;
5109        }
5110
5111        /* UTF-16 code pair: */
5112        if (q > e) {
5113            errmsg = "unexpected end of data";
5114            startinpos = (((const char *)q) - 2) - starts;
5115            endinpos = ((const char *)e) + 1 - starts;
5116            goto utf16Error;
5117        }
5118        if (0xD800 <= ch && ch <= 0xDBFF) {
5119            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5120            q += 2;
5121            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5122#ifndef Py_UNICODE_WIDE
5123                *p++ = ch;
5124                *p++ = ch2;
5125#else
5126                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5127#endif
5128                continue;
5129            }
5130            else {
5131                errmsg = "illegal UTF-16 surrogate";
5132                startinpos = (((const char *)q)-4)-starts;
5133                endinpos = startinpos+2;
5134                goto utf16Error;
5135            }
5136
5137        }
5138        errmsg = "illegal encoding";
5139        startinpos = (((const char *)q)-2)-starts;
5140        endinpos = startinpos+2;
5141        /* Fall through to report the error */
5142
5143      utf16Error:
5144        outpos = p - PyUnicode_AS_UNICODE(unicode);
5145        if (unicode_decode_call_errorhandler(
5146                errors,
5147                &errorHandler,
5148                "utf16", errmsg,
5149                &starts,
5150                (const char **)&e,
5151                &startinpos,
5152                &endinpos,
5153                &exc,
5154                (const char **)&q,
5155                &unicode,
5156                &outpos,
5157                &p))
5158            goto onError;
5159    }
5160    /* remaining byte at the end? (size should be even) */
5161    if (e == q) {
5162        if (!consumed) {
5163            errmsg = "truncated data";
5164            startinpos = ((const char *)q) - starts;
5165            endinpos = ((const char *)e) + 1 - starts;
5166            outpos = p - PyUnicode_AS_UNICODE(unicode);
5167            if (unicode_decode_call_errorhandler(
5168                    errors,
5169                    &errorHandler,
5170                    "utf16", errmsg,
5171                    &starts,
5172                    (const char **)&e,
5173                    &startinpos,
5174                    &endinpos,
5175                    &exc,
5176                    (const char **)&q,
5177                    &unicode,
5178                    &outpos,
5179                    &p))
5180                goto onError;
5181            /* The remaining input chars are ignored if the callback
5182               chooses to skip the input */
5183        }
5184    }
5185
5186    if (byteorder)
5187        *byteorder = bo;
5188
5189    if (consumed)
5190        *consumed = (const char *)q-starts;
5191
5192    /* Adjust length */
5193    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5194        goto onError;
5195
5196    Py_XDECREF(errorHandler);
5197    Py_XDECREF(exc);
5198#ifndef DONT_MAKE_RESULT_READY
5199    if (_PyUnicode_READY_REPLACE(&unicode)) {
5200        Py_DECREF(unicode);
5201        return NULL;
5202    }
5203#endif
5204    return (PyObject *)unicode;
5205
5206  onError:
5207    Py_DECREF(unicode);
5208    Py_XDECREF(errorHandler);
5209    Py_XDECREF(exc);
5210    return NULL;
5211}
5212
5213#undef FAST_CHAR_MASK
5214#undef SWAPPED_FAST_CHAR_MASK
5215
5216PyObject *
5217PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5218                      Py_ssize_t size,
5219                      const char *errors,
5220                      int byteorder)
5221{
5222    PyObject *v;
5223    unsigned char *p;
5224    Py_ssize_t nsize, bytesize;
5225#ifdef Py_UNICODE_WIDE
5226    Py_ssize_t i, pairs;
5227#else
5228    const int pairs = 0;
5229#endif
5230    /* Offsets from p for storing byte pairs in the right order. */
5231#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5232    int ihi = 1, ilo = 0;
5233#else
5234    int ihi = 0, ilo = 1;
5235#endif
5236
5237#define STORECHAR(CH)                           \
5238    do {                                        \
5239        p[ihi] = ((CH) >> 8) & 0xff;            \
5240        p[ilo] = (CH) & 0xff;                   \
5241        p += 2;                                 \
5242    } while(0)
5243
5244#ifdef Py_UNICODE_WIDE
5245    for (i = pairs = 0; i < size; i++)
5246        if (s[i] >= 0x10000)
5247            pairs++;
5248#endif
5249    /* 2 * (size + pairs + (byteorder == 0)) */
5250    if (size > PY_SSIZE_T_MAX ||
5251        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5252        return PyErr_NoMemory();
5253    nsize = size + pairs + (byteorder == 0);
5254    bytesize = nsize * 2;
5255    if (bytesize / 2 != nsize)
5256        return PyErr_NoMemory();
5257    v = PyBytes_FromStringAndSize(NULL, bytesize);
5258    if (v == NULL)
5259        return NULL;
5260
5261    p = (unsigned char *)PyBytes_AS_STRING(v);
5262    if (byteorder == 0)
5263        STORECHAR(0xFEFF);
5264    if (size == 0)
5265        goto done;
5266
5267    if (byteorder == -1) {
5268        /* force LE */
5269        ihi = 1;
5270        ilo = 0;
5271    }
5272    else if (byteorder == 1) {
5273        /* force BE */
5274        ihi = 0;
5275        ilo = 1;
5276    }
5277
5278    while (size-- > 0) {
5279        Py_UNICODE ch = *s++;
5280        Py_UNICODE ch2 = 0;
5281#ifdef Py_UNICODE_WIDE
5282        if (ch >= 0x10000) {
5283            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5284            ch  = 0xD800 | ((ch-0x10000) >> 10);
5285        }
5286#endif
5287        STORECHAR(ch);
5288        if (ch2)
5289            STORECHAR(ch2);
5290    }
5291
5292  done:
5293    return v;
5294#undef STORECHAR
5295}
5296
5297PyObject *
5298PyUnicode_AsUTF16String(PyObject *unicode)
5299{
5300    if (!PyUnicode_Check(unicode)) {
5301        PyErr_BadArgument();
5302        return NULL;
5303    }
5304    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5305                                 PyUnicode_GET_SIZE(unicode),
5306                                 NULL,
5307                                 0);
5308}
5309
5310/* --- Unicode Escape Codec ----------------------------------------------- */
5311
5312/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5313   if all the escapes in the string make it still a valid ASCII string.
5314   Returns -1 if any escapes were found which cause the string to
5315   pop out of ASCII range.  Otherwise returns the length of the
5316   required buffer to hold the string.
5317   */
5318Py_ssize_t
5319length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
5320{
5321    const unsigned char *p = (const unsigned char *)s;
5322    const unsigned char *end = p + size;
5323    Py_ssize_t length = 0;
5324
5325    if (size < 0)
5326        return -1;
5327
5328    for (; p < end; ++p) {
5329        if (*p > 127) {
5330            /* Non-ASCII */
5331            return -1;
5332        }
5333        else if (*p != '\\') {
5334            /* Normal character */
5335            ++length;
5336        }
5337        else {
5338            /* Backslash-escape, check next char */
5339            ++p;
5340            /* Escape sequence reaches till end of string or
5341               non-ASCII follow-up. */
5342            if (p >= end || *p > 127)
5343                return -1;
5344            switch (*p) {
5345            case '\n':
5346                /* backslash + \n result in zero characters */
5347                break;
5348            case '\\': case '\'': case '\"':
5349            case 'b': case 'f': case 't':
5350            case 'n': case 'r': case 'v': case 'a':
5351                ++length;
5352                break;
5353            case '0': case '1': case '2': case '3':
5354            case '4': case '5': case '6': case '7':
5355            case 'x': case 'u': case 'U': case 'N':
5356                /* these do not guarantee ASCII characters */
5357                return -1;
5358            default:
5359                /* count the backslash + the other character */
5360                length += 2;
5361            }
5362        }
5363    }
5364    return length;
5365}
5366
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   If kind is PyUnicode_WCHAR_KIND, buf is treated as a Py_UNICODE array
   and value is stored as a Py_UNICODE; for every other kind buf is
   treated as a byte array and value is stored converted to unsigned
   char.  Only one of the two branches runs, so each argument is
   evaluated at most once per use and an index such as "i++" is safe. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store value as a Py_UNICODE at buf[index].  The assertion reads a
   variable named `kind` from the scope where the macro is expanded; it is
   not a macro parameter.  The expansion is a bare comma expression (no
   enclosing parentheses or do/while), so it should be used as a statement
   of its own. */
#define WRITE_WSTR(buf, index, value) \
    assert(kind == PyUnicode_WCHAR_KIND), \
    ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value)
5381
5382static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5383
5384PyObject *
5385PyUnicode_DecodeUnicodeEscape(const char *s,
5386                              Py_ssize_t size,
5387                              const char *errors)
5388{
5389    const char *starts = s;
5390    Py_ssize_t startinpos;
5391    Py_ssize_t endinpos;
5392    int j;
5393    PyUnicodeObject *v;
5394    Py_UNICODE *p;
5395    const char *end;
5396    char* message;
5397    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5398    PyObject *errorHandler = NULL;
5399    PyObject *exc = NULL;
5400    Py_ssize_t ascii_length;
5401    Py_ssize_t i;
5402    int kind;
5403    void *data;
5404
5405    ascii_length = length_of_escaped_ascii_string(s, size);
5406
5407    /* After length_of_escaped_ascii_string() there are two alternatives,
5408       either the string is pure ASCII with named escapes like \n, etc.
5409       and we determined it's exact size (common case)
5410       or it contains \x, \u, ... escape sequences.  then we create a
5411       legacy wchar string and resize it at the end of this function. */
5412    if (ascii_length >= 0) {
5413        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5414        if (!v)
5415            goto onError;
5416        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5417        kind = PyUnicode_1BYTE_KIND;
5418        data = PyUnicode_DATA(v);
5419    }
5420    else {
5421        /* Escaped strings will always be longer than the resulting
5422           Unicode string, so we start with size here and then reduce the
5423           length after conversion to the true value.
5424           (but if the error callback returns a long replacement string
5425           we'll have to allocate more space) */
5426        v = _PyUnicode_New(size);
5427        if (!v)
5428            goto onError;
5429        kind = PyUnicode_WCHAR_KIND;
5430        data = PyUnicode_AS_UNICODE(v);
5431    }
5432
5433    if (size == 0)
5434        return (PyObject *)v;
5435    i = 0;
5436    end = s + size;
5437
5438    while (s < end) {
5439        unsigned char c;
5440        Py_UNICODE x;
5441        int digits;
5442
5443        if (kind == PyUnicode_WCHAR_KIND) {
5444            assert(i < _PyUnicode_WSTR_LENGTH(v));
5445        }
5446        else {
5447            /* The only case in which i == ascii_length is a backslash
5448               followed by a newline. */
5449            assert(i <= ascii_length);
5450        }
5451
5452        /* Non-escape characters are interpreted as Unicode ordinals */
5453        if (*s != '\\') {
5454            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5455            continue;
5456        }
5457
5458        startinpos = s-starts;
5459        /* \ - Escapes */
5460        s++;
5461        c = *s++;
5462        if (s > end)
5463            c = '\0'; /* Invalid after \ */
5464
5465        if (kind == PyUnicode_WCHAR_KIND) {
5466            assert(i < _PyUnicode_WSTR_LENGTH(v));
5467        }
5468        else {
5469            /* The only case in which i == ascii_length is a backslash
5470               followed by a newline. */
5471            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5472        }
5473
5474        switch (c) {
5475
5476            /* \x escapes */
5477        case '\n': break;
5478        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5479        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5480        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5481        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5482        /* FF */
5483        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5484        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5485        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5486        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5487        /* VT */
5488        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5489        /* BEL, not classic C */
5490        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5491
5492            /* \OOO (octal) escapes */
5493        case '0': case '1': case '2': case '3':
5494        case '4': case '5': case '6': case '7':
5495            x = s[-1] - '0';
5496            if (s < end && '0' <= *s && *s <= '7') {
5497                x = (x<<3) + *s++ - '0';
5498                if (s < end && '0' <= *s && *s <= '7')
5499                    x = (x<<3) + *s++ - '0';
5500            }
5501            WRITE_WSTR(data, i++, x);
5502            break;
5503
5504            /* hex escapes */
5505            /* \xXX */
5506        case 'x':
5507            digits = 2;
5508            message = "truncated \\xXX escape";
5509            goto hexescape;
5510
5511            /* \uXXXX */
5512        case 'u':
5513            digits = 4;
5514            message = "truncated \\uXXXX escape";
5515            goto hexescape;
5516
5517            /* \UXXXXXXXX */
5518        case 'U':
5519            digits = 8;
5520            message = "truncated \\UXXXXXXXX escape";
5521        hexescape:
5522            chr = 0;
5523            p = PyUnicode_AS_UNICODE(v) + i;
5524            if (s+digits>end) {
5525                endinpos = size;
5526                if (unicode_decode_call_errorhandler(
5527                        errors, &errorHandler,
5528                        "unicodeescape", "end of string in escape sequence",
5529                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5530                        &v, &i, &p))
5531                    goto onError;
5532                data = PyUnicode_AS_UNICODE(v);
5533                goto nextByte;
5534            }
5535            for (j = 0; j < digits; ++j) {
5536                c = (unsigned char) s[j];
5537                if (!Py_ISXDIGIT(c)) {
5538                    endinpos = (s+j+1)-starts;
5539                    p = PyUnicode_AS_UNICODE(v) + i;
5540                    if (unicode_decode_call_errorhandler(
5541                            errors, &errorHandler,
5542                            "unicodeescape", message,
5543                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5544                            &v, &i, &p))
5545                        goto onError;
5546                    data = PyUnicode_AS_UNICODE(v);
5547                    goto nextByte;
5548                }
5549                chr = (chr<<4) & ~0xF;
5550                if (c >= '0' && c <= '9')
5551                    chr += c - '0';
5552                else if (c >= 'a' && c <= 'f')
5553                    chr += 10 + c - 'a';
5554                else
5555                    chr += 10 + c - 'A';
5556            }
5557            s += j;
5558            if (chr == 0xffffffff && PyErr_Occurred())
5559                /* _decoding_error will have already written into the
5560                   target buffer. */
5561                break;
5562        store:
5563            /* when we get here, chr is a 32-bit unicode character */
5564            if (chr <= 0xffff)
5565                /* UCS-2 character */
5566                WRITE_WSTR(data, i++, chr);
5567            else if (chr <= 0x10ffff) {
5568                /* UCS-4 character. Either store directly, or as
5569                   surrogate pair. */
5570#ifdef Py_UNICODE_WIDE
5571                WRITE_WSTR(data, i++, chr);
5572#else
5573                chr -= 0x10000L;
5574                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5575                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5576#endif
5577            } else {
5578                endinpos = s-starts;
5579                p = PyUnicode_AS_UNICODE(v) + i;
5580                if (unicode_decode_call_errorhandler(
5581                        errors, &errorHandler,
5582                        "unicodeescape", "illegal Unicode character",
5583                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5584                        &v, &i, &p))
5585                    goto onError;
5586                data = PyUnicode_AS_UNICODE(v);
5587            }
5588            break;
5589
5590            /* \N{name} */
5591        case 'N':
5592            message = "malformed \\N character escape";
5593            if (ucnhash_CAPI == NULL) {
5594                /* load the unicode data module */
5595                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5596                                                PyUnicodeData_CAPSULE_NAME, 1);
5597                if (ucnhash_CAPI == NULL)
5598                    goto ucnhashError;
5599            }
5600            if (*s == '{') {
5601                const char *start = s+1;
5602                /* look for the closing brace */
5603                while (*s != '}' && s < end)
5604                    s++;
5605                if (s > start && s < end && *s == '}') {
5606                    /* found a name.  look it up in the unicode database */
5607                    message = "unknown Unicode character name";
5608                    s++;
5609                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5610                                              &chr))
5611                        goto store;
5612                }
5613            }
5614            endinpos = s-starts;
5615            p = PyUnicode_AS_UNICODE(v) + i;
5616            if (unicode_decode_call_errorhandler(
5617                    errors, &errorHandler,
5618                    "unicodeescape", message,
5619                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5620                    &v, &i, &p))
5621                goto onError;
5622            data = PyUnicode_AS_UNICODE(v);
5623            break;
5624
5625        default:
5626            if (s > end) {
5627                assert(kind == PyUnicode_WCHAR_KIND);
5628                message = "\\ at end of string";
5629                s--;
5630                endinpos = s-starts;
5631                p = PyUnicode_AS_UNICODE(v) + i;
5632                if (unicode_decode_call_errorhandler(
5633                        errors, &errorHandler,
5634                        "unicodeescape", message,
5635                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5636                        &v, &i, &p))
5637                    goto onError;
5638                data = PyUnicode_AS_UNICODE(v);
5639            }
5640            else {
5641                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5642                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5643            }
5644            break;
5645        }
5646      nextByte:
5647        ;
5648    }
5649    /* Ensure the length prediction worked in case of ASCII strings */
5650    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5651
5652    if (kind == PyUnicode_WCHAR_KIND)
5653    {
5654        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5655            goto onError;
5656    }
5657    Py_XDECREF(errorHandler);
5658    Py_XDECREF(exc);
5659#ifndef DONT_MAKE_RESULT_READY
5660    if (_PyUnicode_READY_REPLACE(&v)) {
5661        Py_DECREF(v);
5662        return NULL;
5663    }
5664#endif
5665    return (PyObject *)v;
5666
5667  ucnhashError:
5668    PyErr_SetString(
5669        PyExc_UnicodeError,
5670        "\\N escapes not supported (can't load unicodedata module)"
5671        );
5672    Py_XDECREF(v);
5673    Py_XDECREF(errorHandler);
5674    Py_XDECREF(exc);
5675    return NULL;
5676
5677  onError:
5678    Py_XDECREF(v);
5679    Py_XDECREF(errorHandler);
5680    Py_XDECREF(exc);
5681    return NULL;
5682}
5683
5684#undef WRITE_ASCII_OR_WSTR
5685#undef WRITE_WSTR
5686
/* Return a Unicode-Escape encoded bytes object for a Py_UNICODE buffer.

   The result contains only the escaped characters; no surrounding
   quotes are added.

*/
5693
5694static const char *hexdigits = "0123456789abcdef";
5695
5696PyObject *
5697PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5698                              Py_ssize_t size)
5699{
5700    PyObject *repr;
5701    char *p;
5702
5703#ifdef Py_UNICODE_WIDE
5704    const Py_ssize_t expandsize = 10;
5705#else
5706    const Py_ssize_t expandsize = 6;
5707#endif
5708
5709    /* XXX(nnorwitz): rather than over-allocating, it would be
5710       better to choose a different scheme.  Perhaps scan the
5711       first N-chars of the string and allocate based on that size.
5712    */
5713    /* Initial allocation is based on the longest-possible unichr
5714       escape.
5715
5716       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5717       unichr, so in this case it's the longest unichr escape. In
5718       narrow (UTF-16) builds this is five chars per source unichr
5719       since there are two unichrs in the surrogate pair, so in narrow
5720       (UTF-16) builds it's not the longest unichr escape.
5721
5722       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5723       so in the narrow (UTF-16) build case it's the longest unichr
5724       escape.
5725    */
5726
5727    if (size == 0)
5728        return PyBytes_FromStringAndSize(NULL, 0);
5729
5730    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5731        return PyErr_NoMemory();
5732
5733    repr = PyBytes_FromStringAndSize(NULL,
5734                                     2
5735                                     + expandsize*size
5736                                     + 1);
5737    if (repr == NULL)
5738        return NULL;
5739
5740    p = PyBytes_AS_STRING(repr);
5741
5742    while (size-- > 0) {
5743        Py_UNICODE ch = *s++;
5744
5745        /* Escape backslashes */
5746        if (ch == '\\') {
5747            *p++ = '\\';
5748            *p++ = (char) ch;
5749            continue;
5750        }
5751
5752#ifdef Py_UNICODE_WIDE
5753        /* Map 21-bit characters to '\U00xxxxxx' */
5754        else if (ch >= 0x10000) {
5755            *p++ = '\\';
5756            *p++ = 'U';
5757            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5758            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5759            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5760            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5761            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5762            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5763            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5764            *p++ = hexdigits[ch & 0x0000000F];
5765            continue;
5766        }
5767#else
5768        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5769        else if (ch >= 0xD800 && ch < 0xDC00) {
5770            Py_UNICODE ch2;
5771            Py_UCS4 ucs;
5772
5773            ch2 = *s++;
5774            size--;
5775            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5776                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5777                *p++ = '\\';
5778                *p++ = 'U';
5779                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5780                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5781                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5782                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5783                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5784                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5785                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5786                *p++ = hexdigits[ucs & 0x0000000F];
5787                continue;
5788            }
5789            /* Fall through: isolated surrogates are copied as-is */
5790            s--;
5791            size++;
5792        }
5793#endif
5794
5795        /* Map 16-bit characters to '\uxxxx' */
5796        if (ch >= 256) {
5797            *p++ = '\\';
5798            *p++ = 'u';
5799            *p++ = hexdigits[(ch >> 12) & 0x000F];
5800            *p++ = hexdigits[(ch >> 8) & 0x000F];
5801            *p++ = hexdigits[(ch >> 4) & 0x000F];
5802            *p++ = hexdigits[ch & 0x000F];
5803        }
5804
5805        /* Map special whitespace to '\t', \n', '\r' */
5806        else if (ch == '\t') {
5807            *p++ = '\\';
5808            *p++ = 't';
5809        }
5810        else if (ch == '\n') {
5811            *p++ = '\\';
5812            *p++ = 'n';
5813        }
5814        else if (ch == '\r') {
5815            *p++ = '\\';
5816            *p++ = 'r';
5817        }
5818
5819        /* Map non-printable US ASCII to '\xhh' */
5820        else if (ch < ' ' || ch >= 0x7F) {
5821            *p++ = '\\';
5822            *p++ = 'x';
5823            *p++ = hexdigits[(ch >> 4) & 0x000F];
5824            *p++ = hexdigits[ch & 0x000F];
5825        }
5826
5827        /* Copy everything else as-is */
5828        else
5829            *p++ = (char) ch;
5830    }
5831
5832    assert(p - PyBytes_AS_STRING(repr) > 0);
5833    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5834        return NULL;
5835    return repr;
5836}
5837
5838PyObject *
5839PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5840{
5841    PyObject *s;
5842    if (!PyUnicode_Check(unicode)) {
5843        PyErr_BadArgument();
5844        return NULL;
5845    }
5846    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5847                                      PyUnicode_GET_SIZE(unicode));
5848    return s;
5849}
5850
5851/* --- Raw Unicode Escape Codec ------------------------------------------- */
5852
5853PyObject *
5854PyUnicode_DecodeRawUnicodeEscape(const char *s,
5855                                 Py_ssize_t size,
5856                                 const char *errors)
5857{
5858    const char *starts = s;
5859    Py_ssize_t startinpos;
5860    Py_ssize_t endinpos;
5861    Py_ssize_t outpos;
5862    PyUnicodeObject *v;
5863    Py_UNICODE *p;
5864    const char *end;
5865    const char *bs;
5866    PyObject *errorHandler = NULL;
5867    PyObject *exc = NULL;
5868
5869    /* Escaped strings will always be longer than the resulting
5870       Unicode string, so we start with size here and then reduce the
5871       length after conversion to the true value. (But decoding error
5872       handler might have to resize the string) */
5873    v = _PyUnicode_New(size);
5874    if (v == NULL)
5875        goto onError;
5876    if (size == 0)
5877        return (PyObject *)v;
5878    p = PyUnicode_AS_UNICODE(v);
5879    end = s + size;
5880    while (s < end) {
5881        unsigned char c;
5882        Py_UCS4 x;
5883        int i;
5884        int count;
5885
5886        /* Non-escape characters are interpreted as Unicode ordinals */
5887        if (*s != '\\') {
5888            *p++ = (unsigned char)*s++;
5889            continue;
5890        }
5891        startinpos = s-starts;
5892
5893        /* \u-escapes are only interpreted iff the number of leading
5894           backslashes if odd */
5895        bs = s;
5896        for (;s < end;) {
5897            if (*s != '\\')
5898                break;
5899            *p++ = (unsigned char)*s++;
5900        }
5901        if (((s - bs) & 1) == 0 ||
5902            s >= end ||
5903            (*s != 'u' && *s != 'U')) {
5904            continue;
5905        }
5906        p--;
5907        count = *s=='u' ? 4 : 8;
5908        s++;
5909
5910        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5911        outpos = p-PyUnicode_AS_UNICODE(v);
5912        for (x = 0, i = 0; i < count; ++i, ++s) {
5913            c = (unsigned char)*s;
5914            if (!Py_ISXDIGIT(c)) {
5915                endinpos = s-starts;
5916                if (unicode_decode_call_errorhandler(
5917                        errors, &errorHandler,
5918                        "rawunicodeescape", "truncated \\uXXXX",
5919                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5920                        &v, &outpos, &p))
5921                    goto onError;
5922                goto nextByte;
5923            }
5924            x = (x<<4) & ~0xF;
5925            if (c >= '0' && c <= '9')
5926                x += c - '0';
5927            else if (c >= 'a' && c <= 'f')
5928                x += 10 + c - 'a';
5929            else
5930                x += 10 + c - 'A';
5931        }
5932        if (x <= 0xffff)
5933            /* UCS-2 character */
5934            *p++ = (Py_UNICODE) x;
5935        else if (x <= 0x10ffff) {
5936            /* UCS-4 character. Either store directly, or as
5937               surrogate pair. */
5938#ifdef Py_UNICODE_WIDE
5939            *p++ = (Py_UNICODE) x;
5940#else
5941            x -= 0x10000L;
5942            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5943            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
5944#endif
5945        } else {
5946            endinpos = s-starts;
5947            outpos = p-PyUnicode_AS_UNICODE(v);
5948            if (unicode_decode_call_errorhandler(
5949                    errors, &errorHandler,
5950                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5951                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5952                    &v, &outpos, &p))
5953                goto onError;
5954        }
5955      nextByte:
5956        ;
5957    }
5958    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5959        goto onError;
5960    Py_XDECREF(errorHandler);
5961    Py_XDECREF(exc);
5962#ifndef DONT_MAKE_RESULT_READY
5963    if (_PyUnicode_READY_REPLACE(&v)) {
5964        Py_DECREF(v);
5965        return NULL;
5966    }
5967#endif
5968    return (PyObject *)v;
5969
5970  onError:
5971    Py_XDECREF(v);
5972    Py_XDECREF(errorHandler);
5973    Py_XDECREF(exc);
5974    return NULL;
5975}
5976
5977PyObject *
5978PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5979                                 Py_ssize_t size)
5980{
5981    PyObject *repr;
5982    char *p;
5983    char *q;
5984
5985#ifdef Py_UNICODE_WIDE
5986    const Py_ssize_t expandsize = 10;
5987#else
5988    const Py_ssize_t expandsize = 6;
5989#endif
5990
5991    if (size > PY_SSIZE_T_MAX / expandsize)
5992        return PyErr_NoMemory();
5993
5994    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
5995    if (repr == NULL)
5996        return NULL;
5997    if (size == 0)
5998        return repr;
5999
6000    p = q = PyBytes_AS_STRING(repr);
6001    while (size-- > 0) {
6002        Py_UNICODE ch = *s++;
6003#ifdef Py_UNICODE_WIDE
6004        /* Map 32-bit characters to '\Uxxxxxxxx' */
6005        if (ch >= 0x10000) {
6006            *p++ = '\\';
6007            *p++ = 'U';
6008            *p++ = hexdigits[(ch >> 28) & 0xf];
6009            *p++ = hexdigits[(ch >> 24) & 0xf];
6010            *p++ = hexdigits[(ch >> 20) & 0xf];
6011            *p++ = hexdigits[(ch >> 16) & 0xf];
6012            *p++ = hexdigits[(ch >> 12) & 0xf];
6013            *p++ = hexdigits[(ch >> 8) & 0xf];
6014            *p++ = hexdigits[(ch >> 4) & 0xf];
6015            *p++ = hexdigits[ch & 15];
6016        }
6017        else
6018#else
6019            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6020            if (ch >= 0xD800 && ch < 0xDC00) {
6021                Py_UNICODE ch2;
6022                Py_UCS4 ucs;
6023
6024                ch2 = *s++;
6025                size--;
6026                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6027                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6028                    *p++ = '\\';
6029                    *p++ = 'U';
6030                    *p++ = hexdigits[(ucs >> 28) & 0xf];
6031                    *p++ = hexdigits[(ucs >> 24) & 0xf];
6032                    *p++ = hexdigits[(ucs >> 20) & 0xf];
6033                    *p++ = hexdigits[(ucs >> 16) & 0xf];
6034                    *p++ = hexdigits[(ucs >> 12) & 0xf];
6035                    *p++ = hexdigits[(ucs >> 8) & 0xf];
6036                    *p++ = hexdigits[(ucs >> 4) & 0xf];
6037                    *p++ = hexdigits[ucs & 0xf];
6038                    continue;
6039                }
6040                /* Fall through: isolated surrogates are copied as-is */
6041                s--;
6042                size++;
6043            }
6044#endif
6045        /* Map 16-bit characters to '\uxxxx' */
6046        if (ch >= 256) {
6047            *p++ = '\\';
6048            *p++ = 'u';
6049            *p++ = hexdigits[(ch >> 12) & 0xf];
6050            *p++ = hexdigits[(ch >> 8) & 0xf];
6051            *p++ = hexdigits[(ch >> 4) & 0xf];
6052            *p++ = hexdigits[ch & 15];
6053        }
6054        /* Copy everything else as-is */
6055        else
6056            *p++ = (char) ch;
6057    }
6058    size = p - q;
6059
6060    assert(size > 0);
6061    if (_PyBytes_Resize(&repr, size) < 0)
6062        return NULL;
6063    return repr;
6064}
6065
6066PyObject *
6067PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6068{
6069    PyObject *s;
6070    if (!PyUnicode_Check(unicode)) {
6071        PyErr_BadArgument();
6072        return NULL;
6073    }
6074    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6075                                         PyUnicode_GET_SIZE(unicode));
6076
6077    return s;
6078}
6079
6080/* --- Unicode Internal Codec ------------------------------------------- */
6081
6082PyObject *
6083_PyUnicode_DecodeUnicodeInternal(const char *s,
6084                                 Py_ssize_t size,
6085                                 const char *errors)
6086{
6087    const char *starts = s;
6088    Py_ssize_t startinpos;
6089    Py_ssize_t endinpos;
6090    Py_ssize_t outpos;
6091    PyUnicodeObject *v;
6092    Py_UNICODE *p;
6093    const char *end;
6094    const char *reason;
6095    PyObject *errorHandler = NULL;
6096    PyObject *exc = NULL;
6097
6098#ifdef Py_UNICODE_WIDE
6099    Py_UNICODE unimax = PyUnicode_GetMax();
6100#endif
6101
6102    /* XXX overflow detection missing */
6103    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6104    if (v == NULL)
6105        goto onError;
6106    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6107       as string was created with the old API. */
6108    if (PyUnicode_GET_SIZE(v) == 0)
6109        return (PyObject *)v;
6110    p = PyUnicode_AS_UNICODE(v);
6111    end = s + size;
6112
6113    while (s < end) {
6114        memcpy(p, s, sizeof(Py_UNICODE));
6115        /* We have to sanity check the raw data, otherwise doom looms for
6116           some malformed UCS-4 data. */
6117        if (
6118#ifdef Py_UNICODE_WIDE
6119            *p > unimax || *p < 0 ||
6120#endif
6121            end-s < Py_UNICODE_SIZE
6122            )
6123        {
6124            startinpos = s - starts;
6125            if (end-s < Py_UNICODE_SIZE) {
6126                endinpos = end-starts;
6127                reason = "truncated input";
6128            }
6129            else {
6130                endinpos = s - starts + Py_UNICODE_SIZE;
6131                reason = "illegal code point (> 0x10FFFF)";
6132            }
6133            outpos = p - PyUnicode_AS_UNICODE(v);
6134            if (unicode_decode_call_errorhandler(
6135                    errors, &errorHandler,
6136                    "unicode_internal", reason,
6137                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6138                    &v, &outpos, &p)) {
6139                goto onError;
6140            }
6141        }
6142        else {
6143            p++;
6144            s += Py_UNICODE_SIZE;
6145        }
6146    }
6147
6148    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6149        goto onError;
6150    Py_XDECREF(errorHandler);
6151    Py_XDECREF(exc);
6152#ifndef DONT_MAKE_RESULT_READY
6153    if (_PyUnicode_READY_REPLACE(&v)) {
6154        Py_DECREF(v);
6155        return NULL;
6156    }
6157#endif
6158    return (PyObject *)v;
6159
6160  onError:
6161    Py_XDECREF(v);
6162    Py_XDECREF(errorHandler);
6163    Py_XDECREF(exc);
6164    return NULL;
6165}
6166
6167/* --- Latin-1 Codec ------------------------------------------------------ */
6168
6169PyObject *
6170PyUnicode_DecodeLatin1(const char *s,
6171                       Py_ssize_t size,
6172                       const char *errors)
6173{
6174    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6175    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6176}
6177
6178/* create or adjust a UnicodeEncodeError */
6179static void
6180make_encode_exception(PyObject **exceptionObject,
6181                      const char *encoding,
6182                      const Py_UNICODE *unicode, Py_ssize_t size,
6183                      Py_ssize_t startpos, Py_ssize_t endpos,
6184                      const char *reason)
6185{
6186    if (*exceptionObject == NULL) {
6187        *exceptionObject = PyUnicodeEncodeError_Create(
6188            encoding, unicode, size, startpos, endpos, reason);
6189    }
6190    else {
6191        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6192            goto onError;
6193        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6194            goto onError;
6195        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6196            goto onError;
6197        return;
6198      onError:
6199        Py_DECREF(*exceptionObject);
6200        *exceptionObject = NULL;
6201    }
6202}
6203
6204/* raises a UnicodeEncodeError */
6205static void
6206raise_encode_exception(PyObject **exceptionObject,
6207                       const char *encoding,
6208                       const Py_UNICODE *unicode, Py_ssize_t size,
6209                       Py_ssize_t startpos, Py_ssize_t endpos,
6210                       const char *reason)
6211{
6212    make_encode_exception(exceptionObject,
6213                          encoding, unicode, size, startpos, endpos, reason);
6214    if (*exceptionObject != NULL)
6215        PyCodec_StrictErrors(*exceptionObject);
6216}
6217
6218/* error handling callback helper:
6219   build arguments, call the callback and check the arguments,
6220   put the result into newpos and return the replacement string, which
6221   has to be freed by the caller */
6222static PyObject *
6223unicode_encode_call_errorhandler(const char *errors,
6224                                 PyObject **errorHandler,
6225                                 const char *encoding, const char *reason,
6226                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6227                                 Py_ssize_t startpos, Py_ssize_t endpos,
6228                                 Py_ssize_t *newpos)
6229{
6230    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6231
6232    PyObject *restuple;
6233    PyObject *resunicode;
6234
6235    if (*errorHandler == NULL) {
6236        *errorHandler = PyCodec_LookupError(errors);
6237        if (*errorHandler == NULL)
6238            return NULL;
6239    }
6240
6241    make_encode_exception(exceptionObject,
6242                          encoding, unicode, size, startpos, endpos, reason);
6243    if (*exceptionObject == NULL)
6244        return NULL;
6245
6246    restuple = PyObject_CallFunctionObjArgs(
6247        *errorHandler, *exceptionObject, NULL);
6248    if (restuple == NULL)
6249        return NULL;
6250    if (!PyTuple_Check(restuple)) {
6251        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6252        Py_DECREF(restuple);
6253        return NULL;
6254    }
6255    if (!PyArg_ParseTuple(restuple, argparse,
6256                          &resunicode, newpos)) {
6257        Py_DECREF(restuple);
6258        return NULL;
6259    }
6260    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6261        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6262        Py_DECREF(restuple);
6263        return NULL;
6264    }
6265    if (*newpos<0)
6266        *newpos = size+*newpos;
6267    if (*newpos<0 || *newpos>size) {
6268        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6269        Py_DECREF(restuple);
6270        return NULL;
6271    }
6272    Py_INCREF(resunicode);
6273    Py_DECREF(restuple);
6274    return resunicode;
6275}
6276
6277static PyObject *
6278unicode_encode_ucs1(const Py_UNICODE *p,
6279                    Py_ssize_t size,
6280                    const char *errors,
6281                    int limit)
6282{
6283    /* output object */
6284    PyObject *res;
6285    /* pointers to the beginning and end+1 of input */
6286    const Py_UNICODE *startp = p;
6287    const Py_UNICODE *endp = p + size;
6288    /* pointer to the beginning of the unencodable characters */
6289    /* const Py_UNICODE *badp = NULL; */
6290    /* pointer into the output */
6291    char *str;
6292    /* current output position */
6293    Py_ssize_t ressize;
6294    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6295    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6296    PyObject *errorHandler = NULL;
6297    PyObject *exc = NULL;
6298    /* the following variable is used for caching string comparisons
6299     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6300    int known_errorHandler = -1;
6301
6302    /* allocate enough for a simple encoding without
6303       replacements, if we need more, we'll resize */
6304    if (size == 0)
6305        return PyBytes_FromStringAndSize(NULL, 0);
6306    res = PyBytes_FromStringAndSize(NULL, size);
6307    if (res == NULL)
6308        return NULL;
6309    str = PyBytes_AS_STRING(res);
6310    ressize = size;
6311
6312    while (p<endp) {
6313        Py_UNICODE c = *p;
6314
6315        /* can we encode this? */
6316        if (c<limit) {
6317            /* no overflow check, because we know that the space is enough */
6318            *str++ = (char)c;
6319            ++p;
6320        }
6321        else {
6322            Py_ssize_t unicodepos = p-startp;
6323            Py_ssize_t requiredsize;
6324            PyObject *repunicode;
6325            Py_ssize_t repsize;
6326            Py_ssize_t newpos;
6327            Py_ssize_t respos;
6328            Py_UNICODE *uni2;
6329            /* startpos for collecting unencodable chars */
6330            const Py_UNICODE *collstart = p;
6331            const Py_UNICODE *collend = p;
6332            /* find all unecodable characters */
6333            while ((collend < endp) && ((*collend)>=limit))
6334                ++collend;
6335            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6336            if (known_errorHandler==-1) {
6337                if ((errors==NULL) || (!strcmp(errors, "strict")))
6338                    known_errorHandler = 1;
6339                else if (!strcmp(errors, "replace"))
6340                    known_errorHandler = 2;
6341                else if (!strcmp(errors, "ignore"))
6342                    known_errorHandler = 3;
6343                else if (!strcmp(errors, "xmlcharrefreplace"))
6344                    known_errorHandler = 4;
6345                else
6346                    known_errorHandler = 0;
6347            }
6348            switch (known_errorHandler) {
6349            case 1: /* strict */
6350                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6351                goto onError;
6352            case 2: /* replace */
6353                while (collstart++<collend)
6354                    *str++ = '?'; /* fall through */
6355            case 3: /* ignore */
6356                p = collend;
6357                break;
6358            case 4: /* xmlcharrefreplace */
6359                respos = str - PyBytes_AS_STRING(res);
6360                /* determine replacement size (temporarily (mis)uses p) */
6361                for (p = collstart, repsize = 0; p < collend; ++p) {
6362                    if (*p<10)
6363                        repsize += 2+1+1;
6364                    else if (*p<100)
6365                        repsize += 2+2+1;
6366                    else if (*p<1000)
6367                        repsize += 2+3+1;
6368                    else if (*p<10000)
6369                        repsize += 2+4+1;
6370#ifndef Py_UNICODE_WIDE
6371                    else
6372                        repsize += 2+5+1;
6373#else
6374                    else if (*p<100000)
6375                        repsize += 2+5+1;
6376                    else if (*p<1000000)
6377                        repsize += 2+6+1;
6378                    else
6379                        repsize += 2+7+1;
6380#endif
6381                }
6382                requiredsize = respos+repsize+(endp-collend);
6383                if (requiredsize > ressize) {
6384                    if (requiredsize<2*ressize)
6385                        requiredsize = 2*ressize;
6386                    if (_PyBytes_Resize(&res, requiredsize))
6387                        goto onError;
6388                    str = PyBytes_AS_STRING(res) + respos;
6389                    ressize = requiredsize;
6390                }
6391                /* generate replacement (temporarily (mis)uses p) */
6392                for (p = collstart; p < collend; ++p) {
6393                    str += sprintf(str, "&#%d;", (int)*p);
6394                }
6395                p = collend;
6396                break;
6397            default:
6398                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6399                                                              encoding, reason, startp, size, &exc,
6400                                                              collstart-startp, collend-startp, &newpos);
6401                if (repunicode == NULL)
6402                    goto onError;
6403                if (PyBytes_Check(repunicode)) {
6404                    /* Directly copy bytes result to output. */
6405                    repsize = PyBytes_Size(repunicode);
6406                    if (repsize > 1) {
6407                        /* Make room for all additional bytes. */
6408                        respos = str - PyBytes_AS_STRING(res);
6409                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6410                            Py_DECREF(repunicode);
6411                            goto onError;
6412                        }
6413                        str = PyBytes_AS_STRING(res) + respos;
6414                        ressize += repsize-1;
6415                    }
6416                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6417                    str += repsize;
6418                    p = startp + newpos;
6419                    Py_DECREF(repunicode);
6420                    break;
6421                }
6422                /* need more space? (at least enough for what we
6423                   have+the replacement+the rest of the string, so
6424                   we won't have to check space for encodable characters) */
6425                respos = str - PyBytes_AS_STRING(res);
6426                repsize = PyUnicode_GET_SIZE(repunicode);
6427                requiredsize = respos+repsize+(endp-collend);
6428                if (requiredsize > ressize) {
6429                    if (requiredsize<2*ressize)
6430                        requiredsize = 2*ressize;
6431                    if (_PyBytes_Resize(&res, requiredsize)) {
6432                        Py_DECREF(repunicode);
6433                        goto onError;
6434                    }
6435                    str = PyBytes_AS_STRING(res) + respos;
6436                    ressize = requiredsize;
6437                }
6438                /* check if there is anything unencodable in the replacement
6439                   and copy it to the output */
6440                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6441                    c = *uni2;
6442                    if (c >= limit) {
6443                        raise_encode_exception(&exc, encoding, startp, size,
6444                                               unicodepos, unicodepos+1, reason);
6445                        Py_DECREF(repunicode);
6446                        goto onError;
6447                    }
6448                    *str = (char)c;
6449                }
6450                p = startp + newpos;
6451                Py_DECREF(repunicode);
6452            }
6453        }
6454    }
6455    /* Resize if we allocated to much */
6456    size = str - PyBytes_AS_STRING(res);
6457    if (size < ressize) { /* If this falls res will be NULL */
6458        assert(size >= 0);
6459        if (_PyBytes_Resize(&res, size) < 0)
6460            goto onError;
6461    }
6462
6463    Py_XDECREF(errorHandler);
6464    Py_XDECREF(exc);
6465    return res;
6466
6467  onError:
6468    Py_XDECREF(res);
6469    Py_XDECREF(errorHandler);
6470    Py_XDECREF(exc);
6471    return NULL;
6472}
6473
6474PyObject *
6475PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6476                       Py_ssize_t size,
6477                       const char *errors)
6478{
6479    return unicode_encode_ucs1(p, size, errors, 256);
6480}
6481
6482PyObject *
6483_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6484{
6485    if (!PyUnicode_Check(unicode)) {
6486        PyErr_BadArgument();
6487        return NULL;
6488    }
6489    if (PyUnicode_READY(unicode) == -1)
6490        return NULL;
6491    /* Fast path: if it is a one-byte string, construct
6492       bytes object directly. */
6493    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6494        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6495                                         PyUnicode_GET_LENGTH(unicode));
6496    /* Non-Latin-1 characters present. Defer to above function to
6497       raise the exception. */
6498    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6499                                  PyUnicode_GET_SIZE(unicode),
6500                                  errors);
6501}
6502
6503PyObject*
6504PyUnicode_AsLatin1String(PyObject *unicode)
6505{
6506    return _PyUnicode_AsLatin1String(unicode, NULL);
6507}
6508
6509/* --- 7-bit ASCII Codec -------------------------------------------------- */
6510
6511PyObject *
6512PyUnicode_DecodeASCII(const char *s,
6513                      Py_ssize_t size,
6514                      const char *errors)
6515{
6516    const char *starts = s;
6517    PyUnicodeObject *v;
6518    Py_UNICODE *u;
6519    Py_ssize_t startinpos;
6520    Py_ssize_t endinpos;
6521    Py_ssize_t outpos;
6522    const char *e;
6523    int has_error;
6524    const unsigned char *p = (const unsigned char *)s;
6525    const unsigned char *end = p + size;
6526    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6527    PyObject *errorHandler = NULL;
6528    PyObject *exc = NULL;
6529
6530    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6531    if (size == 1 && (unsigned char)s[0] < 128)
6532        return get_latin1_char((unsigned char)s[0]);
6533
6534    has_error = 0;
6535    while (p < end && !has_error) {
6536        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6537           an explanation. */
6538        if (!((size_t) p & LONG_PTR_MASK)) {
6539            /* Help register allocation */
6540            register const unsigned char *_p = p;
6541            while (_p < aligned_end) {
6542                unsigned long value = *(unsigned long *) _p;
6543                if (value & ASCII_CHAR_MASK) {
6544                    has_error = 1;
6545                    break;
6546                }
6547                _p += SIZEOF_LONG;
6548            }
6549            if (_p == end)
6550                break;
6551            if (has_error)
6552                break;
6553            p = _p;
6554        }
6555        if (*p & 0x80) {
6556            has_error = 1;
6557            break;
6558        }
6559        else {
6560            ++p;
6561        }
6562    }
6563    if (!has_error)
6564        return unicode_fromascii((const unsigned char *)s, size);
6565
6566    v = _PyUnicode_New(size);
6567    if (v == NULL)
6568        goto onError;
6569    if (size == 0)
6570        return (PyObject *)v;
6571    u = PyUnicode_AS_UNICODE(v);
6572    e = s + size;
6573    while (s < e) {
6574        register unsigned char c = (unsigned char)*s;
6575        if (c < 128) {
6576            *u++ = c;
6577            ++s;
6578        }
6579        else {
6580            startinpos = s-starts;
6581            endinpos = startinpos + 1;
6582            outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6583            if (unicode_decode_call_errorhandler(
6584                    errors, &errorHandler,
6585                    "ascii", "ordinal not in range(128)",
6586                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6587                    &v, &outpos, &u))
6588                goto onError;
6589        }
6590    }
6591    if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6592        if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
6593            goto onError;
6594    Py_XDECREF(errorHandler);
6595    Py_XDECREF(exc);
6596#ifndef DONT_MAKE_RESULT_READY
6597    if (_PyUnicode_READY_REPLACE(&v)) {
6598        Py_DECREF(v);
6599        return NULL;
6600    }
6601#endif
6602    return (PyObject *)v;
6603
6604  onError:
6605    Py_XDECREF(v);
6606    Py_XDECREF(errorHandler);
6607    Py_XDECREF(exc);
6608    return NULL;
6609}
6610
6611PyObject *
6612PyUnicode_EncodeASCII(const Py_UNICODE *p,
6613                      Py_ssize_t size,
6614                      const char *errors)
6615{
6616    return unicode_encode_ucs1(p, size, errors, 128);
6617}
6618
6619PyObject *
6620_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6621{
6622    if (!PyUnicode_Check(unicode)) {
6623        PyErr_BadArgument();
6624        return NULL;
6625    }
6626    if (PyUnicode_READY(unicode) == -1)
6627        return NULL;
6628    /* Fast path: if it is an ASCII-only string, construct bytes object
6629       directly. Else defer to above function to raise the exception. */
6630    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6631        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6632                                         PyUnicode_GET_LENGTH(unicode));
6633    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6634                                 PyUnicode_GET_SIZE(unicode),
6635                                 errors);
6636}
6637
6638PyObject *
6639PyUnicode_AsASCIIString(PyObject *unicode)
6640{
6641    return _PyUnicode_AsASCIIString(unicode, NULL);
6642}
6643
6644#ifdef HAVE_MBCS
6645
6646/* --- MBCS codecs for Windows -------------------------------------------- */
6647
6648#if SIZEOF_INT < SIZEOF_SIZE_T
6649#define NEED_RETRY
6650#endif
6651
6652/* XXX This code is limited to "true" double-byte encodings, as
6653   a) it assumes an incomplete character consists of a single byte, and
6654   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6655   encodings, see IsDBCSLeadByteEx documentation. */
6656
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character (rather than being the trail byte of the previous one),
   0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
6668
/*
 * Decode an MBCS string into a unicode object. If 'final' is set, a
 * trailing lead byte is converted too. Returns the number of bytes
 * consumed on success, -1 otherwise.
 */
6673static int
6674decode_mbcs(PyUnicodeObject **v,
6675            const char *s, /* MBCS string */
6676            int size, /* sizeof MBCS string */
6677            int final,
6678            const char *errors)
6679{
6680    Py_UNICODE *p;
6681    Py_ssize_t n;
6682    DWORD usize;
6683    DWORD flags;
6684
6685    assert(size >= 0);
6686
6687    /* check and handle 'errors' arg */
6688    if (errors==NULL || strcmp(errors, "strict")==0)
6689        flags = MB_ERR_INVALID_CHARS;
6690    else if (strcmp(errors, "ignore")==0)
6691        flags = 0;
6692    else {
6693        PyErr_Format(PyExc_ValueError,
6694                     "mbcs encoding does not support errors='%s'",
6695                     errors);
6696        return -1;
6697    }
6698
6699    /* Skip trailing lead-byte unless 'final' is set */
6700    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6701        --size;
6702
6703    /* First get the size of the result */
6704    if (size > 0) {
6705        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6706        if (usize==0)
6707            goto mbcs_decode_error;
6708    } else
6709        usize = 0;
6710
6711    if (*v == NULL) {
6712        /* Create unicode object */
6713        *v = _PyUnicode_New(usize);
6714        if (*v == NULL)
6715            return -1;
6716        n = 0;
6717    }
6718    else {
6719        /* Extend unicode object */
6720        n = PyUnicode_GET_SIZE(*v);
6721        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6722            return -1;
6723    }
6724
6725    /* Do the conversion */
6726    if (usize > 0) {
6727        p = PyUnicode_AS_UNICODE(*v) + n;
6728        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6729            goto mbcs_decode_error;
6730        }
6731    }
6732    return size;
6733
6734mbcs_decode_error:
6735    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6736       we raise a UnicodeDecodeError - else it is a 'generic'
6737       windows error
6738     */
6739    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6740        /* Ideally, we should get reason from FormatMessage - this
6741           is the Windows 2000 English version of the message
6742        */
6743        PyObject *exc = NULL;
6744        const char *reason = "No mapping for the Unicode character exists "
6745                             "in the target multi-byte code page.";
6746        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6747        if (exc != NULL) {
6748            PyCodec_StrictErrors(exc);
6749            Py_DECREF(exc);
6750        }
6751    } else {
6752        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6753    }
6754    return -1;
6755}
6756
6757PyObject *
6758PyUnicode_DecodeMBCSStateful(const char *s,
6759                             Py_ssize_t size,
6760                             const char *errors,
6761                             Py_ssize_t *consumed)
6762{
6763    PyUnicodeObject *v = NULL;
6764    int done;
6765
6766    if (consumed)
6767        *consumed = 0;
6768
6769#ifdef NEED_RETRY
6770  retry:
6771    if (size > INT_MAX)
6772        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6773    else
6774#endif
6775        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6776
6777    if (done < 0) {
6778        Py_XDECREF(v);
6779        return NULL;
6780    }
6781
6782    if (consumed)
6783        *consumed += done;
6784
6785#ifdef NEED_RETRY
6786    if (size > INT_MAX) {
6787        s += done;
6788        size -= done;
6789        goto retry;
6790    }
6791#endif
6792#ifndef DONT_MAKE_RESULT_READY
6793    if (_PyUnicode_READY_REPLACE(&v)) {
6794        Py_DECREF(v);
6795        return NULL;
6796    }
6797#endif
6798    return (PyObject *)v;
6799}
6800
6801PyObject *
6802PyUnicode_DecodeMBCS(const char *s,
6803                     Py_ssize_t size,
6804                     const char *errors)
6805{
6806    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6807}
6808
/*
 * Convert a unicode buffer into a bytes object (MBCS).
 * Returns 0 on success, -1 otherwise.
 */
6813static int
6814encode_mbcs(PyObject **repr,
6815            const Py_UNICODE *p, /* unicode */
6816            int size, /* size of unicode */
6817            const char* errors)
6818{
6819    BOOL usedDefaultChar = FALSE;
6820    BOOL *pusedDefaultChar;
6821    int mbcssize;
6822    Py_ssize_t n;
6823    PyObject *exc = NULL;
6824    DWORD flags;
6825
6826    assert(size >= 0);
6827
6828    /* check and handle 'errors' arg */
6829    if (errors==NULL || strcmp(errors, "strict")==0) {
6830        flags = WC_NO_BEST_FIT_CHARS;
6831        pusedDefaultChar = &usedDefaultChar;
6832    } else if (strcmp(errors, "replace")==0) {
6833        flags = 0;
6834        pusedDefaultChar = NULL;
6835    } else {
6836         PyErr_Format(PyExc_ValueError,
6837                      "mbcs encoding does not support errors='%s'",
6838                      errors);
6839         return -1;
6840    }
6841
6842    /* First get the size of the result */
6843    if (size > 0) {
6844        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6845                                       NULL, pusedDefaultChar);
6846        if (mbcssize == 0) {
6847            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6848            return -1;
6849        }
6850        /* If we used a default char, then we failed! */
6851        if (pusedDefaultChar && *pusedDefaultChar)
6852            goto mbcs_encode_error;
6853    } else {
6854        mbcssize = 0;
6855    }
6856
6857    if (*repr == NULL) {
6858        /* Create string object */
6859        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6860        if (*repr == NULL)
6861            return -1;
6862        n = 0;
6863    }
6864    else {
6865        /* Extend string object */
6866        n = PyBytes_Size(*repr);
6867        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6868            return -1;
6869    }
6870
6871    /* Do the conversion */
6872    if (size > 0) {
6873        char *s = PyBytes_AS_STRING(*repr) + n;
6874        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6875                                     NULL, pusedDefaultChar)) {
6876            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6877            return -1;
6878        }
6879        if (pusedDefaultChar && *pusedDefaultChar)
6880            goto mbcs_encode_error;
6881    }
6882    return 0;
6883
6884mbcs_encode_error:
6885    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6886    Py_XDECREF(exc);
6887    return -1;
6888}
6889
6890PyObject *
6891PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6892                     Py_ssize_t size,
6893                     const char *errors)
6894{
6895    PyObject *repr = NULL;
6896    int ret;
6897
6898#ifdef NEED_RETRY
6899  retry:
6900    if (size > INT_MAX)
6901        ret = encode_mbcs(&repr, p, INT_MAX, errors);
6902    else
6903#endif
6904        ret = encode_mbcs(&repr, p, (int)size, errors);
6905
6906    if (ret < 0) {
6907        Py_XDECREF(repr);
6908        return NULL;
6909    }
6910
6911#ifdef NEED_RETRY
6912    if (size > INT_MAX) {
6913        p += INT_MAX;
6914        size -= INT_MAX;
6915        goto retry;
6916    }
6917#endif
6918
6919    return repr;
6920}
6921
6922PyObject *
6923PyUnicode_AsMBCSString(PyObject *unicode)
6924{
6925    if (!PyUnicode_Check(unicode)) {
6926        PyErr_BadArgument();
6927        return NULL;
6928    }
6929    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
6930                                PyUnicode_GET_SIZE(unicode),
6931                                NULL);
6932}
6933
6934#undef NEED_RETRY
6935
6936#endif /* HAVE_MBCS */
6937
6938/* --- Character Mapping Codec -------------------------------------------- */
6939
6940PyObject *
6941PyUnicode_DecodeCharmap(const char *s,
6942                        Py_ssize_t size,
6943                        PyObject *mapping,
6944                        const char *errors)
6945{
6946    const char *starts = s;
6947    Py_ssize_t startinpos;
6948    Py_ssize_t endinpos;
6949    Py_ssize_t outpos;
6950    const char *e;
6951    PyUnicodeObject *v;
6952    Py_UNICODE *p;
6953    Py_ssize_t extrachars = 0;
6954    PyObject *errorHandler = NULL;
6955    PyObject *exc = NULL;
6956    Py_UNICODE *mapstring = NULL;
6957    Py_ssize_t maplen = 0;
6958
6959    /* Default to Latin-1 */
6960    if (mapping == NULL)
6961        return PyUnicode_DecodeLatin1(s, size, errors);
6962
6963    v = _PyUnicode_New(size);
6964    if (v == NULL)
6965        goto onError;
6966    if (size == 0)
6967        return (PyObject *)v;
6968    p = PyUnicode_AS_UNICODE(v);
6969    e = s + size;
6970    if (PyUnicode_CheckExact(mapping)) {
6971        mapstring = PyUnicode_AS_UNICODE(mapping);
6972        maplen = PyUnicode_GET_SIZE(mapping);
6973        while (s < e) {
6974            unsigned char ch = *s;
6975            Py_UNICODE x = 0xfffe; /* illegal value */
6976
6977            if (ch < maplen)
6978                x = mapstring[ch];
6979
6980            if (x == 0xfffe) {
6981                /* undefined mapping */
6982                outpos = p-PyUnicode_AS_UNICODE(v);
6983                startinpos = s-starts;
6984                endinpos = startinpos+1;
6985                if (unicode_decode_call_errorhandler(
6986                        errors, &errorHandler,
6987                        "charmap", "character maps to <undefined>",
6988                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6989                        &v, &outpos, &p)) {
6990                    goto onError;
6991                }
6992                continue;
6993            }
6994            *p++ = x;
6995            ++s;
6996        }
6997    }
6998    else {
6999        while (s < e) {
7000            unsigned char ch = *s;
7001            PyObject *w, *x;
7002
7003            /* Get mapping (char ordinal -> integer, Unicode char or None) */
7004            w = PyLong_FromLong((long)ch);
7005            if (w == NULL)
7006                goto onError;
7007            x = PyObject_GetItem(mapping, w);
7008            Py_DECREF(w);
7009            if (x == NULL) {
7010                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7011                    /* No mapping found means: mapping is undefined. */
7012                    PyErr_Clear();
7013                    x = Py_None;
7014                    Py_INCREF(x);
7015                } else
7016                    goto onError;
7017            }
7018
7019            /* Apply mapping */
7020            if (PyLong_Check(x)) {
7021                long value = PyLong_AS_LONG(x);
7022                if (value < 0 || value > 65535) {
7023                    PyErr_SetString(PyExc_TypeError,
7024                                    "character mapping must be in range(65536)");
7025                    Py_DECREF(x);
7026                    goto onError;
7027                }
7028                *p++ = (Py_UNICODE)value;
7029            }
7030            else if (x == Py_None) {
7031                /* undefined mapping */
7032                outpos = p-PyUnicode_AS_UNICODE(v);
7033                startinpos = s-starts;
7034                endinpos = startinpos+1;
7035                if (unicode_decode_call_errorhandler(
7036                        errors, &errorHandler,
7037                        "charmap", "character maps to <undefined>",
7038                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7039                        &v, &outpos, &p)) {
7040                    Py_DECREF(x);
7041                    goto onError;
7042                }
7043                Py_DECREF(x);
7044                continue;
7045            }
7046            else if (PyUnicode_Check(x)) {
7047                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
7048
7049                if (targetsize == 1)
7050                    /* 1-1 mapping */
7051                    *p++ = *PyUnicode_AS_UNICODE(x);
7052
7053                else if (targetsize > 1) {
7054                    /* 1-n mapping */
7055                    if (targetsize > extrachars) {
7056                        /* resize first */
7057                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7058                        Py_ssize_t needed = (targetsize - extrachars) + \
7059                            (targetsize << 2);
7060                        extrachars += needed;
7061                        /* XXX overflow detection missing */
7062                        if (PyUnicode_Resize((PyObject**)&v,
7063                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
7064                            Py_DECREF(x);
7065                            goto onError;
7066                        }
7067                        p = PyUnicode_AS_UNICODE(v) + oldpos;
7068                    }
7069                    Py_UNICODE_COPY(p,
7070                                    PyUnicode_AS_UNICODE(x),
7071                                    targetsize);
7072                    p += targetsize;
7073                    extrachars -= targetsize;
7074                }
7075                /* 1-0 mapping: skip the character */
7076            }
7077            else {
7078                /* wrong return value */
7079                PyErr_SetString(PyExc_TypeError,
7080                                "character mapping must return integer, None or str");
7081                Py_DECREF(x);
7082                goto onError;
7083            }
7084            Py_DECREF(x);
7085            ++s;
7086        }
7087    }
7088    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
7089        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
7090            goto onError;
7091    Py_XDECREF(errorHandler);
7092    Py_XDECREF(exc);
7093#ifndef DONT_MAKE_RESULT_READY
7094    if (_PyUnicode_READY_REPLACE(&v)) {
7095        Py_DECREF(v);
7096        return NULL;
7097    }
7098#endif
7099    return (PyObject *)v;
7100
7101  onError:
7102    Py_XDECREF(errorHandler);
7103    Py_XDECREF(exc);
7104    Py_XDECREF(v);
7105    return NULL;
7106}
7107
7108/* Charmap encoding: the lookup table */
7109
7110struct encoding_map {
7111    PyObject_HEAD
7112    unsigned char level1[32];
7113    int count2, count3;
7114    unsigned char level23[1];
7115};
7116
7117static PyObject*
7118encoding_map_size(PyObject *obj, PyObject* args)
7119{
7120    struct encoding_map *map = (struct encoding_map*)obj;
7121    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7122                           128*map->count3);
7123}
7124
7125static PyMethodDef encoding_map_methods[] = {
7126    {"size", encoding_map_size, METH_NOARGS,
7127     PyDoc_STR("Return the size (in bytes) of this object") },
7128    { 0 }
7129};
7130
7131static void
7132encoding_map_dealloc(PyObject* o)
7133{
7134    PyObject_FREE(o);
7135}
7136
7137static PyTypeObject EncodingMapType = {
7138    PyVarObject_HEAD_INIT(NULL, 0)
7139    "EncodingMap",          /*tp_name*/
7140    sizeof(struct encoding_map),   /*tp_basicsize*/
7141    0,                      /*tp_itemsize*/
7142    /* methods */
7143    encoding_map_dealloc,   /*tp_dealloc*/
7144    0,                      /*tp_print*/
7145    0,                      /*tp_getattr*/
7146    0,                      /*tp_setattr*/
7147    0,                      /*tp_reserved*/
7148    0,                      /*tp_repr*/
7149    0,                      /*tp_as_number*/
7150    0,                      /*tp_as_sequence*/
7151    0,                      /*tp_as_mapping*/
7152    0,                      /*tp_hash*/
7153    0,                      /*tp_call*/
7154    0,                      /*tp_str*/
7155    0,                      /*tp_getattro*/
7156    0,                      /*tp_setattro*/
7157    0,                      /*tp_as_buffer*/
7158    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7159    0,                      /*tp_doc*/
7160    0,                      /*tp_traverse*/
7161    0,                      /*tp_clear*/
7162    0,                      /*tp_richcompare*/
7163    0,                      /*tp_weaklistoffset*/
7164    0,                      /*tp_iter*/
7165    0,                      /*tp_iternext*/
7166    encoding_map_methods,   /*tp_methods*/
7167    0,                      /*tp_members*/
7168    0,                      /*tp_getset*/
7169    0,                      /*tp_base*/
7170    0,                      /*tp_dict*/
7171    0,                      /*tp_descr_get*/
7172    0,                      /*tp_descr_set*/
7173    0,                      /*tp_dictoffset*/
7174    0,                      /*tp_init*/
7175    0,                      /*tp_alloc*/
7176    0,                      /*tp_new*/
7177    0,                      /*tp_free*/
7178    0,                      /*tp_is_gc*/
7179};
7180
7181PyObject*
7182PyUnicode_BuildEncodingMap(PyObject* string)
7183{
7184    PyObject *result;
7185    struct encoding_map *mresult;
7186    int i;
7187    int need_dict = 0;
7188    unsigned char level1[32];
7189    unsigned char level2[512];
7190    unsigned char *mlevel1, *mlevel2, *mlevel3;
7191    int count2 = 0, count3 = 0;
7192    int kind;
7193    void *data;
7194    Py_UCS4 ch;
7195
7196    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7197        PyErr_BadArgument();
7198        return NULL;
7199    }
7200    kind = PyUnicode_KIND(string);
7201    data = PyUnicode_DATA(string);
7202    memset(level1, 0xFF, sizeof level1);
7203    memset(level2, 0xFF, sizeof level2);
7204
7205    /* If there isn't a one-to-one mapping of NULL to \0,
7206       or if there are non-BMP characters, we need to use
7207       a mapping dictionary. */
7208    if (PyUnicode_READ(kind, data, 0) != 0)
7209        need_dict = 1;
7210    for (i = 1; i < 256; i++) {
7211        int l1, l2;
7212        ch = PyUnicode_READ(kind, data, i);
7213        if (ch == 0 || ch > 0xFFFF) {
7214            need_dict = 1;
7215            break;
7216        }
7217        if (ch == 0xFFFE)
7218            /* unmapped character */
7219            continue;
7220        l1 = ch >> 11;
7221        l2 = ch >> 7;
7222        if (level1[l1] == 0xFF)
7223            level1[l1] = count2++;
7224        if (level2[l2] == 0xFF)
7225            level2[l2] = count3++;
7226    }
7227
7228    if (count2 >= 0xFF || count3 >= 0xFF)
7229        need_dict = 1;
7230
7231    if (need_dict) {
7232        PyObject *result = PyDict_New();
7233        PyObject *key, *value;
7234        if (!result)
7235            return NULL;
7236        for (i = 0; i < 256; i++) {
7237            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7238            value = PyLong_FromLong(i);
7239            if (!key || !value)
7240                goto failed1;
7241            if (PyDict_SetItem(result, key, value) == -1)
7242                goto failed1;
7243            Py_DECREF(key);
7244            Py_DECREF(value);
7245        }
7246        return result;
7247      failed1:
7248        Py_XDECREF(key);
7249        Py_XDECREF(value);
7250        Py_DECREF(result);
7251        return NULL;
7252    }
7253
7254    /* Create a three-level trie */
7255    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7256                             16*count2 + 128*count3 - 1);
7257    if (!result)
7258        return PyErr_NoMemory();
7259    PyObject_Init(result, &EncodingMapType);
7260    mresult = (struct encoding_map*)result;
7261    mresult->count2 = count2;
7262    mresult->count3 = count3;
7263    mlevel1 = mresult->level1;
7264    mlevel2 = mresult->level23;
7265    mlevel3 = mresult->level23 + 16*count2;
7266    memcpy(mlevel1, level1, 32);
7267    memset(mlevel2, 0xFF, 16*count2);
7268    memset(mlevel3, 0, 128*count3);
7269    count3 = 0;
7270    for (i = 1; i < 256; i++) {
7271        int o1, o2, o3, i2, i3;
7272        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7273            /* unmapped character */
7274            continue;
7275        o1 = PyUnicode_READ(kind, data, i)>>11;
7276        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7277        i2 = 16*mlevel1[o1] + o2;
7278        if (mlevel2[i2] == 0xFF)
7279            mlevel2[i2] = count3++;
7280        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7281        i3 = 128*mlevel2[i2] + o3;
7282        mlevel3[i3] = i;
7283    }
7284    return result;
7285}
7286
7287static int
7288encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7289{
7290    struct encoding_map *map = (struct encoding_map*)mapping;
7291    int l1 = c>>11;
7292    int l2 = (c>>7) & 0xF;
7293    int l3 = c & 0x7F;
7294    int i;
7295
7296#ifdef Py_UNICODE_WIDE
7297    if (c > 0xFFFF) {
7298        return -1;
7299    }
7300#endif
7301    if (c == 0)
7302        return 0;
7303    /* level 1*/
7304    i = map->level1[l1];
7305    if (i == 0xFF) {
7306        return -1;
7307    }
7308    /* level 2*/
7309    i = map->level23[16*i+l2];
7310    if (i == 0xFF) {
7311        return -1;
7312    }
7313    /* level 3 */
7314    i = map->level23[16*map->count2 + 128*i + l3];
7315    if (i == 0) {
7316        return -1;
7317    }
7318    return i;
7319}
7320
7321/* Lookup the character ch in the mapping. If the character
7322   can't be found, Py_None is returned (or NULL, if another
7323   error occurred). */
7324static PyObject *
7325charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7326{
7327    PyObject *w = PyLong_FromLong((long)c);
7328    PyObject *x;
7329
7330    if (w == NULL)
7331        return NULL;
7332    x = PyObject_GetItem(mapping, w);
7333    Py_DECREF(w);
7334    if (x == NULL) {
7335        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7336            /* No mapping found means: mapping is undefined. */
7337            PyErr_Clear();
7338            x = Py_None;
7339            Py_INCREF(x);
7340            return x;
7341        } else
7342            return NULL;
7343    }
7344    else if (x == Py_None)
7345        return x;
7346    else if (PyLong_Check(x)) {
7347        long value = PyLong_AS_LONG(x);
7348        if (value < 0 || value > 255) {
7349            PyErr_SetString(PyExc_TypeError,
7350                            "character mapping must be in range(256)");
7351            Py_DECREF(x);
7352            return NULL;
7353        }
7354        return x;
7355    }
7356    else if (PyBytes_Check(x))
7357        return x;
7358    else {
7359        /* wrong return value */
7360        PyErr_Format(PyExc_TypeError,
7361                     "character mapping must return integer, bytes or None, not %.400s",
7362                     x->ob_type->tp_name);
7363        Py_DECREF(x);
7364        return NULL;
7365    }
7366}
7367
7368static int
7369charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7370{
7371    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7372    /* exponentially overallocate to minimize reallocations */
7373    if (requiredsize < 2*outsize)
7374        requiredsize = 2*outsize;
7375    if (_PyBytes_Resize(outobj, requiredsize))
7376        return -1;
7377    return 0;
7378}
7379
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the encoded byte(s) were written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or an output resize failed */
} charmapencode_result;
7383/* lookup the character, put the result in the output string and adjust
7384   various state variables. Resize the output bytes object if not enough
7385   space is available. Return a new reference to the object that
7386   was put in the output buffer, or Py_None, if the mapping was undefined
7387   (in which case no character was written) or NULL, if a
7388   reallocation error occurred. The caller must decref the result */
7389static charmapencode_result
7390charmapencode_output(Py_UNICODE c, PyObject *mapping,
7391                     PyObject **outobj, Py_ssize_t *outpos)
7392{
7393    PyObject *rep;
7394    char *outstart;
7395    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7396
7397    if (Py_TYPE(mapping) == &EncodingMapType) {
7398        int res = encoding_map_lookup(c, mapping);
7399        Py_ssize_t requiredsize = *outpos+1;
7400        if (res == -1)
7401            return enc_FAILED;
7402        if (outsize<requiredsize)
7403            if (charmapencode_resize(outobj, outpos, requiredsize))
7404                return enc_EXCEPTION;
7405        outstart = PyBytes_AS_STRING(*outobj);
7406        outstart[(*outpos)++] = (char)res;
7407        return enc_SUCCESS;
7408    }
7409
7410    rep = charmapencode_lookup(c, mapping);
7411    if (rep==NULL)
7412        return enc_EXCEPTION;
7413    else if (rep==Py_None) {
7414        Py_DECREF(rep);
7415        return enc_FAILED;
7416    } else {
7417        if (PyLong_Check(rep)) {
7418            Py_ssize_t requiredsize = *outpos+1;
7419            if (outsize<requiredsize)
7420                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7421                    Py_DECREF(rep);
7422                    return enc_EXCEPTION;
7423                }
7424            outstart = PyBytes_AS_STRING(*outobj);
7425            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7426        }
7427        else {
7428            const char *repchars = PyBytes_AS_STRING(rep);
7429            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7430            Py_ssize_t requiredsize = *outpos+repsize;
7431            if (outsize<requiredsize)
7432                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7433                    Py_DECREF(rep);
7434                    return enc_EXCEPTION;
7435                }
7436            outstart = PyBytes_AS_STRING(*outobj);
7437            memcpy(outstart + *outpos, repchars, repsize);
7438            *outpos += repsize;
7439        }
7440    }
7441    Py_DECREF(rep);
7442    return enc_SUCCESS;
7443}
7444
7445/* handle an error in PyUnicode_EncodeCharmap
7446   Return 0 on success, -1 on error */
7447static int
7448charmap_encoding_error(
7449    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7450    PyObject **exceptionObject,
7451    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7452    PyObject **res, Py_ssize_t *respos)
7453{
7454    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7455    Py_ssize_t repsize;
7456    Py_ssize_t newpos;
7457    Py_UNICODE *uni2;
7458    /* startpos for collecting unencodable chars */
7459    Py_ssize_t collstartpos = *inpos;
7460    Py_ssize_t collendpos = *inpos+1;
7461    Py_ssize_t collpos;
7462    char *encoding = "charmap";
7463    char *reason = "character maps to <undefined>";
7464    charmapencode_result x;
7465
7466    /* find all unencodable characters */
7467    while (collendpos < size) {
7468        PyObject *rep;
7469        if (Py_TYPE(mapping) == &EncodingMapType) {
7470            int res = encoding_map_lookup(p[collendpos], mapping);
7471            if (res != -1)
7472                break;
7473            ++collendpos;
7474            continue;
7475        }
7476
7477        rep = charmapencode_lookup(p[collendpos], mapping);
7478        if (rep==NULL)
7479            return -1;
7480        else if (rep!=Py_None) {
7481            Py_DECREF(rep);
7482            break;
7483        }
7484        Py_DECREF(rep);
7485        ++collendpos;
7486    }
7487    /* cache callback name lookup
7488     * (if not done yet, i.e. it's the first error) */
7489    if (*known_errorHandler==-1) {
7490        if ((errors==NULL) || (!strcmp(errors, "strict")))
7491            *known_errorHandler = 1;
7492        else if (!strcmp(errors, "replace"))
7493            *known_errorHandler = 2;
7494        else if (!strcmp(errors, "ignore"))
7495            *known_errorHandler = 3;
7496        else if (!strcmp(errors, "xmlcharrefreplace"))
7497            *known_errorHandler = 4;
7498        else
7499            *known_errorHandler = 0;
7500    }
7501    switch (*known_errorHandler) {
7502    case 1: /* strict */
7503        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7504        return -1;
7505    case 2: /* replace */
7506        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7507            x = charmapencode_output('?', mapping, res, respos);
7508            if (x==enc_EXCEPTION) {
7509                return -1;
7510            }
7511            else if (x==enc_FAILED) {
7512                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7513                return -1;
7514            }
7515        }
7516        /* fall through */
7517    case 3: /* ignore */
7518        *inpos = collendpos;
7519        break;
7520    case 4: /* xmlcharrefreplace */
7521        /* generate replacement (temporarily (mis)uses p) */
7522        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7523            char buffer[2+29+1+1];
7524            char *cp;
7525            sprintf(buffer, "&#%d;", (int)p[collpos]);
7526            for (cp = buffer; *cp; ++cp) {
7527                x = charmapencode_output(*cp, mapping, res, respos);
7528                if (x==enc_EXCEPTION)
7529                    return -1;
7530                else if (x==enc_FAILED) {
7531                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7532                    return -1;
7533                }
7534            }
7535        }
7536        *inpos = collendpos;
7537        break;
7538    default:
7539        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7540                                                      encoding, reason, p, size, exceptionObject,
7541                                                      collstartpos, collendpos, &newpos);
7542        if (repunicode == NULL)
7543            return -1;
7544        if (PyBytes_Check(repunicode)) {
7545            /* Directly copy bytes result to output. */
7546            Py_ssize_t outsize = PyBytes_Size(*res);
7547            Py_ssize_t requiredsize;
7548            repsize = PyBytes_Size(repunicode);
7549            requiredsize = *respos + repsize;
7550            if (requiredsize > outsize)
7551                /* Make room for all additional bytes. */
7552                if (charmapencode_resize(res, respos, requiredsize)) {
7553                    Py_DECREF(repunicode);
7554                    return -1;
7555                }
7556            memcpy(PyBytes_AsString(*res) + *respos,
7557                   PyBytes_AsString(repunicode),  repsize);
7558            *respos += repsize;
7559            *inpos = newpos;
7560            Py_DECREF(repunicode);
7561            break;
7562        }
7563        /* generate replacement  */
7564        repsize = PyUnicode_GET_SIZE(repunicode);
7565        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7566            x = charmapencode_output(*uni2, mapping, res, respos);
7567            if (x==enc_EXCEPTION) {
7568                return -1;
7569            }
7570            else if (x==enc_FAILED) {
7571                Py_DECREF(repunicode);
7572                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7573                return -1;
7574            }
7575        }
7576        *inpos = newpos;
7577        Py_DECREF(repunicode);
7578    }
7579    return 0;
7580}
7581
7582PyObject *
7583PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7584                        Py_ssize_t size,
7585                        PyObject *mapping,
7586                        const char *errors)
7587{
7588    /* output object */
7589    PyObject *res = NULL;
7590    /* current input position */
7591    Py_ssize_t inpos = 0;
7592    /* current output position */
7593    Py_ssize_t respos = 0;
7594    PyObject *errorHandler = NULL;
7595    PyObject *exc = NULL;
7596    /* the following variable is used for caching string comparisons
7597     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7598     * 3=ignore, 4=xmlcharrefreplace */
7599    int known_errorHandler = -1;
7600
7601    /* Default to Latin-1 */
7602    if (mapping == NULL)
7603        return PyUnicode_EncodeLatin1(p, size, errors);
7604
7605    /* allocate enough for a simple encoding without
7606       replacements, if we need more, we'll resize */
7607    res = PyBytes_FromStringAndSize(NULL, size);
7608    if (res == NULL)
7609        goto onError;
7610    if (size == 0)
7611        return res;
7612
7613    while (inpos<size) {
7614        /* try to encode it */
7615        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7616        if (x==enc_EXCEPTION) /* error */
7617            goto onError;
7618        if (x==enc_FAILED) { /* unencodable character */
7619            if (charmap_encoding_error(p, size, &inpos, mapping,
7620                                       &exc,
7621                                       &known_errorHandler, &errorHandler, errors,
7622                                       &res, &respos)) {
7623                goto onError;
7624            }
7625        }
7626        else
7627            /* done with this character => adjust input position */
7628            ++inpos;
7629    }
7630
7631    /* Resize if we allocated to much */
7632    if (respos<PyBytes_GET_SIZE(res))
7633        if (_PyBytes_Resize(&res, respos) < 0)
7634            goto onError;
7635
7636    Py_XDECREF(exc);
7637    Py_XDECREF(errorHandler);
7638    return res;
7639
7640  onError:
7641    Py_XDECREF(res);
7642    Py_XDECREF(exc);
7643    Py_XDECREF(errorHandler);
7644    return NULL;
7645}
7646
7647PyObject *
7648PyUnicode_AsCharmapString(PyObject *unicode,
7649                          PyObject *mapping)
7650{
7651    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7652        PyErr_BadArgument();
7653        return NULL;
7654    }
7655    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7656                                   PyUnicode_GET_SIZE(unicode),
7657                                   mapping,
7658                                   NULL);
7659}
7660
7661/* create or adjust a UnicodeTranslateError */
7662static void
7663make_translate_exception(PyObject **exceptionObject,
7664                         PyObject *unicode,
7665                         Py_ssize_t startpos, Py_ssize_t endpos,
7666                         const char *reason)
7667{
7668    if (*exceptionObject == NULL) {
7669        *exceptionObject = _PyUnicodeTranslateError_Create(
7670            unicode, startpos, endpos, reason);
7671    }
7672    else {
7673        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7674            goto onError;
7675        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7676            goto onError;
7677        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7678            goto onError;
7679        return;
7680      onError:
7681        Py_DECREF(*exceptionObject);
7682        *exceptionObject = NULL;
7683    }
7684}
7685
7686/* raises a UnicodeTranslateError */
7687static void
7688raise_translate_exception(PyObject **exceptionObject,
7689                          PyObject *unicode,
7690                          Py_ssize_t startpos, Py_ssize_t endpos,
7691                          const char *reason)
7692{
7693    make_translate_exception(exceptionObject,
7694                             unicode, startpos, endpos, reason);
7695    if (*exceptionObject != NULL)
7696        PyCodec_StrictErrors(*exceptionObject);
7697}
7698
7699/* error handling callback helper:
7700   build arguments, call the callback and check the arguments,
7701   put the result into newpos and return the replacement string, which
7702   has to be freed by the caller */
7703static PyObject *
7704unicode_translate_call_errorhandler(const char *errors,
7705                                    PyObject **errorHandler,
7706                                    const char *reason,
7707                                    PyObject *unicode, PyObject **exceptionObject,
7708                                    Py_ssize_t startpos, Py_ssize_t endpos,
7709                                    Py_ssize_t *newpos)
7710{
7711    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7712
7713    Py_ssize_t i_newpos;
7714    PyObject *restuple;
7715    PyObject *resunicode;
7716
7717    if (*errorHandler == NULL) {
7718        *errorHandler = PyCodec_LookupError(errors);
7719        if (*errorHandler == NULL)
7720            return NULL;
7721    }
7722
7723    make_translate_exception(exceptionObject,
7724                             unicode, startpos, endpos, reason);
7725    if (*exceptionObject == NULL)
7726        return NULL;
7727
7728    restuple = PyObject_CallFunctionObjArgs(
7729        *errorHandler, *exceptionObject, NULL);
7730    if (restuple == NULL)
7731        return NULL;
7732    if (!PyTuple_Check(restuple)) {
7733        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7734        Py_DECREF(restuple);
7735        return NULL;
7736    }
7737    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7738                          &resunicode, &i_newpos)) {
7739        Py_DECREF(restuple);
7740        return NULL;
7741    }
7742    if (i_newpos<0)
7743        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7744    else
7745        *newpos = i_newpos;
7746    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7747        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7748        Py_DECREF(restuple);
7749        return NULL;
7750    }
7751    Py_INCREF(resunicode);
7752    Py_DECREF(restuple);
7753    return resunicode;
7754}
7755
7756/* Lookup the character ch in the mapping and put the result in result,
7757   which must be decrefed by the caller.
7758   Return 0 on success, -1 on error */
7759static int
7760charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7761{
7762    PyObject *w = PyLong_FromLong((long)c);
7763    PyObject *x;
7764
7765    if (w == NULL)
7766        return -1;
7767    x = PyObject_GetItem(mapping, w);
7768    Py_DECREF(w);
7769    if (x == NULL) {
7770        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7771            /* No mapping found means: use 1:1 mapping. */
7772            PyErr_Clear();
7773            *result = NULL;
7774            return 0;
7775        } else
7776            return -1;
7777    }
7778    else if (x == Py_None) {
7779        *result = x;
7780        return 0;
7781    }
7782    else if (PyLong_Check(x)) {
7783        long value = PyLong_AS_LONG(x);
7784        long max = PyUnicode_GetMax();
7785        if (value < 0 || value > max) {
7786            PyErr_Format(PyExc_TypeError,
7787                         "character mapping must be in range(0x%x)", max+1);
7788            Py_DECREF(x);
7789            return -1;
7790        }
7791        *result = x;
7792        return 0;
7793    }
7794    else if (PyUnicode_Check(x)) {
7795        *result = x;
7796        return 0;
7797    }
7798    else {
7799        /* wrong return value */
7800        PyErr_SetString(PyExc_TypeError,
7801                        "character mapping must return integer, None or str");
7802        Py_DECREF(x);
7803        return -1;
7804    }
7805}
7806/* ensure that *outobj is at least requiredsize characters long,
7807   if not reallocate and adjust various state variables.
7808   Return 0 on success, -1 on error */
7809static int
7810charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7811                               Py_ssize_t requiredsize)
7812{
7813    Py_ssize_t oldsize = *psize;
7814    if (requiredsize > oldsize) {
7815        /* exponentially overallocate to minimize reallocations */
7816        if (requiredsize < 2 * oldsize)
7817            requiredsize = 2 * oldsize;
7818        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7819        if (*outobj == 0)
7820            return -1;
7821        *psize = requiredsize;
7822    }
7823    return 0;
7824}
7825/* lookup the character, put the result in the output string and adjust
7826   various state variables. Return a new reference to the object that
7827   was put in the output buffer in *result, or Py_None, if the mapping was
7828   undefined (in which case no character was written).
7829   The called must decref result.
7830   Return 0 on success, -1 on error. */
7831static int
7832charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7833                        PyObject *mapping, Py_UCS4 **output,
7834                        Py_ssize_t *osize, Py_ssize_t *opos,
7835                        PyObject **res)
7836{
7837    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7838    if (charmaptranslate_lookup(curinp, mapping, res))
7839        return -1;
7840    if (*res==NULL) {
7841        /* not found => default to 1:1 mapping */
7842        (*output)[(*opos)++] = curinp;
7843    }
7844    else if (*res==Py_None)
7845        ;
7846    else if (PyLong_Check(*res)) {
7847        /* no overflow check, because we know that the space is enough */
7848        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7849    }
7850    else if (PyUnicode_Check(*res)) {
7851        Py_ssize_t repsize;
7852        if (PyUnicode_READY(*res) == -1)
7853            return -1;
7854        repsize = PyUnicode_GET_LENGTH(*res);
7855        if (repsize==1) {
7856            /* no overflow check, because we know that the space is enough */
7857            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7858        }
7859        else if (repsize!=0) {
7860            /* more than one character */
7861            Py_ssize_t requiredsize = *opos +
7862                (PyUnicode_GET_LENGTH(input) - ipos) +
7863                repsize - 1;
7864            Py_ssize_t i;
7865            if (charmaptranslate_makespace(output, osize, requiredsize))
7866                return -1;
7867            for(i = 0; i < repsize; i++)
7868                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7869        }
7870    }
7871    else
7872        return -1;
7873    return 0;
7874}
7875
7876PyObject *
7877_PyUnicode_TranslateCharmap(PyObject *input,
7878                            PyObject *mapping,
7879                            const char *errors)
7880{
7881    /* input object */
7882    char *idata;
7883    Py_ssize_t size, i;
7884    int kind;
7885    /* output buffer */
7886    Py_UCS4 *output = NULL;
7887    Py_ssize_t osize;
7888    PyObject *res;
7889    /* current output position */
7890    Py_ssize_t opos;
7891    char *reason = "character maps to <undefined>";
7892    PyObject *errorHandler = NULL;
7893    PyObject *exc = NULL;
7894    /* the following variable is used for caching string comparisons
7895     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7896     * 3=ignore, 4=xmlcharrefreplace */
7897    int known_errorHandler = -1;
7898
7899    if (mapping == NULL) {
7900        PyErr_BadArgument();
7901        return NULL;
7902    }
7903
7904    if (PyUnicode_READY(input) == -1)
7905        return NULL;
7906    idata = (char*)PyUnicode_DATA(input);
7907    kind = PyUnicode_KIND(input);
7908    size = PyUnicode_GET_LENGTH(input);
7909    i = 0;
7910
7911    if (size == 0) {
7912        Py_INCREF(input);
7913        return input;
7914    }
7915
7916    /* allocate enough for a simple 1:1 translation without
7917       replacements, if we need more, we'll resize */
7918    osize = size;
7919    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7920    opos = 0;
7921    if (output == NULL) {
7922        PyErr_NoMemory();
7923        goto onError;
7924    }
7925
7926    while (i<size) {
7927        /* try to encode it */
7928        PyObject *x = NULL;
7929        if (charmaptranslate_output(input, i, mapping,
7930                                    &output, &osize, &opos, &x)) {
7931            Py_XDECREF(x);
7932            goto onError;
7933        }
7934        Py_XDECREF(x);
7935        if (x!=Py_None) /* it worked => adjust input pointer */
7936            ++i;
7937        else { /* untranslatable character */
7938            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7939            Py_ssize_t repsize;
7940            Py_ssize_t newpos;
7941            Py_ssize_t uni2;
7942            /* startpos for collecting untranslatable chars */
7943            Py_ssize_t collstart = i;
7944            Py_ssize_t collend = i+1;
7945            Py_ssize_t coll;
7946
7947            /* find all untranslatable characters */
7948            while (collend < size) {
7949                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
7950                    goto onError;
7951                Py_XDECREF(x);
7952                if (x!=Py_None)
7953                    break;
7954                ++collend;
7955            }
7956            /* cache callback name lookup
7957             * (if not done yet, i.e. it's the first error) */
7958            if (known_errorHandler==-1) {
7959                if ((errors==NULL) || (!strcmp(errors, "strict")))
7960                    known_errorHandler = 1;
7961                else if (!strcmp(errors, "replace"))
7962                    known_errorHandler = 2;
7963                else if (!strcmp(errors, "ignore"))
7964                    known_errorHandler = 3;
7965                else if (!strcmp(errors, "xmlcharrefreplace"))
7966                    known_errorHandler = 4;
7967                else
7968                    known_errorHandler = 0;
7969            }
7970            switch (known_errorHandler) {
7971            case 1: /* strict */
7972                raise_translate_exception(&exc, input, collstart,
7973                                          collend, reason);
7974                goto onError;
7975            case 2: /* replace */
7976                /* No need to check for space, this is a 1:1 replacement */
7977                for (coll = collstart; coll<collend; coll++)
7978                    output[opos++] = '?';
7979                /* fall through */
7980            case 3: /* ignore */
7981                i = collend;
7982                break;
7983            case 4: /* xmlcharrefreplace */
7984                /* generate replacement (temporarily (mis)uses i) */
7985                for (i = collstart; i < collend; ++i) {
7986                    char buffer[2+29+1+1];
7987                    char *cp;
7988                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7989                    if (charmaptranslate_makespace(&output, &osize,
7990                                                   opos+strlen(buffer)+(size-collend)))
7991                        goto onError;
7992                    for (cp = buffer; *cp; ++cp)
7993                        output[opos++] = *cp;
7994                }
7995                i = collend;
7996                break;
7997            default:
7998                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
7999                                                                 reason, input, &exc,
8000                                                                 collstart, collend, &newpos);
8001                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
8002                    goto onError;
8003                /* generate replacement  */
8004                repsize = PyUnicode_GET_LENGTH(repunicode);
8005                if (charmaptranslate_makespace(&output, &osize,
8006                                               opos+repsize+(size-collend))) {
8007                    Py_DECREF(repunicode);
8008                    goto onError;
8009                }
8010                for (uni2 = 0; repsize-->0; ++uni2)
8011                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8012                i = newpos;
8013                Py_DECREF(repunicode);
8014            }
8015        }
8016    }
8017    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8018    if (!res)
8019        goto onError;
8020    PyMem_Free(output);
8021    Py_XDECREF(exc);
8022    Py_XDECREF(errorHandler);
8023    return res;
8024
8025  onError:
8026    PyMem_Free(output);
8027    Py_XDECREF(exc);
8028    Py_XDECREF(errorHandler);
8029    return NULL;
8030}
8031
8032/* Deprecated. Use PyUnicode_Translate instead. */
8033PyObject *
8034PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8035                           Py_ssize_t size,
8036                           PyObject *mapping,
8037                           const char *errors)
8038{
8039    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8040    if (!unicode)
8041        return NULL;
8042    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8043}
8044
8045PyObject *
8046PyUnicode_Translate(PyObject *str,
8047                    PyObject *mapping,
8048                    const char *errors)
8049{
8050    PyObject *result;
8051
8052    str = PyUnicode_FromObject(str);
8053    if (str == NULL)
8054        goto onError;
8055    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8056    Py_DECREF(str);
8057    return result;
8058
8059  onError:
8060    Py_XDECREF(str);
8061    return NULL;
8062}
8063
8064static Py_UCS4
8065fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8066{
8067    /* No need to call PyUnicode_READY(self) because this function is only
8068       called as a callback from fixup() which does it already. */
8069    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8070    const int kind = PyUnicode_KIND(self);
8071    void *data = PyUnicode_DATA(self);
8072    Py_UCS4 maxchar = 0, ch, fixed;
8073    Py_ssize_t i;
8074
8075    for (i = 0; i < len; ++i) {
8076        ch = PyUnicode_READ(kind, data, i);
8077        fixed = 0;
8078        if (ch > 127) {
8079            if (Py_UNICODE_ISSPACE(ch))
8080                fixed = ' ';
8081            else {
8082                const int decimal = Py_UNICODE_TODECIMAL(ch);
8083                if (decimal >= 0)
8084                    fixed = '0' + decimal;
8085            }
8086            if (fixed != 0) {
8087                if (fixed > maxchar)
8088                    maxchar = fixed;
8089                PyUnicode_WRITE(kind, data, i, fixed);
8090            }
8091            else if (ch > maxchar)
8092                maxchar = ch;
8093        }
8094        else if (ch > maxchar)
8095            maxchar = ch;
8096    }
8097
8098    return maxchar;
8099}
8100
8101PyObject *
8102_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8103{
8104    if (!PyUnicode_Check(unicode)) {
8105        PyErr_BadInternalCall();
8106        return NULL;
8107    }
8108    if (PyUnicode_READY(unicode) == -1)
8109        return NULL;
8110    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8111        /* If the string is already ASCII, just return the same string */
8112        Py_INCREF(unicode);
8113        return unicode;
8114    }
8115    return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8116}
8117
8118PyObject *
8119PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8120                                  Py_ssize_t length)
8121{
8122    PyObject *result;
8123    Py_UNICODE *p; /* write pointer into result */
8124    Py_ssize_t i;
8125    /* Copy to a new string */
8126    result = (PyObject *)_PyUnicode_New(length);
8127    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8128    if (result == NULL)
8129        return result;
8130    p = PyUnicode_AS_UNICODE(result);
8131    /* Iterate over code points */
8132    for (i = 0; i < length; i++) {
8133        Py_UNICODE ch =s[i];
8134        if (ch > 127) {
8135            int decimal = Py_UNICODE_TODECIMAL(ch);
8136            if (decimal >= 0)
8137                p[i] = '0' + decimal;
8138        }
8139    }
8140#ifndef DONT_MAKE_RESULT_READY
8141    if (_PyUnicode_READY_REPLACE(&result)) {
8142        Py_DECREF(result);
8143        return NULL;
8144    }
8145#endif
8146    return result;
8147}
8148/* --- Decimal Encoder ---------------------------------------------------- */
8149
8150int
8151PyUnicode_EncodeDecimal(Py_UNICODE *s,
8152                        Py_ssize_t length,
8153                        char *output,
8154                        const char *errors)
8155{
8156    Py_UNICODE *p, *end;
8157    PyObject *errorHandler = NULL;
8158    PyObject *exc = NULL;
8159    const char *encoding = "decimal";
8160    const char *reason = "invalid decimal Unicode string";
8161    /* the following variable is used for caching string comparisons
8162     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8163    int known_errorHandler = -1;
8164
8165    if (output == NULL) {
8166        PyErr_BadArgument();
8167        return -1;
8168    }
8169
8170    p = s;
8171    end = s + length;
8172    while (p < end) {
8173        register Py_UNICODE ch = *p;
8174        int decimal;
8175        PyObject *repunicode;
8176        Py_ssize_t repsize;
8177        Py_ssize_t newpos;
8178        Py_UNICODE *uni2;
8179        Py_UNICODE *collstart;
8180        Py_UNICODE *collend;
8181
8182        if (Py_UNICODE_ISSPACE(ch)) {
8183            *output++ = ' ';
8184            ++p;
8185            continue;
8186        }
8187        decimal = Py_UNICODE_TODECIMAL(ch);
8188        if (decimal >= 0) {
8189            *output++ = '0' + decimal;
8190            ++p;
8191            continue;
8192        }
8193        if (0 < ch && ch < 256) {
8194            *output++ = (char)ch;
8195            ++p;
8196            continue;
8197        }
8198        /* All other characters are considered unencodable */
8199        collstart = p;
8200        collend = p+1;
8201        while (collend < end) {
8202            if ((0 < *collend && *collend < 256) ||
8203                !Py_UNICODE_ISSPACE(*collend) ||
8204                Py_UNICODE_TODECIMAL(*collend))
8205                break;
8206        }
8207        /* cache callback name lookup
8208         * (if not done yet, i.e. it's the first error) */
8209        if (known_errorHandler==-1) {
8210            if ((errors==NULL) || (!strcmp(errors, "strict")))
8211                known_errorHandler = 1;
8212            else if (!strcmp(errors, "replace"))
8213                known_errorHandler = 2;
8214            else if (!strcmp(errors, "ignore"))
8215                known_errorHandler = 3;
8216            else if (!strcmp(errors, "xmlcharrefreplace"))
8217                known_errorHandler = 4;
8218            else
8219                known_errorHandler = 0;
8220        }
8221        switch (known_errorHandler) {
8222        case 1: /* strict */
8223            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8224            goto onError;
8225        case 2: /* replace */
8226            for (p = collstart; p < collend; ++p)
8227                *output++ = '?';
8228            /* fall through */
8229        case 3: /* ignore */
8230            p = collend;
8231            break;
8232        case 4: /* xmlcharrefreplace */
8233            /* generate replacement (temporarily (mis)uses p) */
8234            for (p = collstart; p < collend; ++p)
8235                output += sprintf(output, "&#%d;", (int)*p);
8236            p = collend;
8237            break;
8238        default:
8239            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8240                                                          encoding, reason, s, length, &exc,
8241                                                          collstart-s, collend-s, &newpos);
8242            if (repunicode == NULL)
8243                goto onError;
8244            if (!PyUnicode_Check(repunicode)) {
8245                /* Byte results not supported, since they have no decimal property. */
8246                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8247                Py_DECREF(repunicode);
8248                goto onError;
8249            }
8250            /* generate replacement  */
8251            repsize = PyUnicode_GET_SIZE(repunicode);
8252            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8253                Py_UNICODE ch = *uni2;
8254                if (Py_UNICODE_ISSPACE(ch))
8255                    *output++ = ' ';
8256                else {
8257                    decimal = Py_UNICODE_TODECIMAL(ch);
8258                    if (decimal >= 0)
8259                        *output++ = '0' + decimal;
8260                    else if (0 < ch && ch < 256)
8261                        *output++ = (char)ch;
8262                    else {
8263                        Py_DECREF(repunicode);
8264                        raise_encode_exception(&exc, encoding,
8265                                               s, length, collstart-s, collend-s, reason);
8266                        goto onError;
8267                    }
8268                }
8269            }
8270            p = s + newpos;
8271            Py_DECREF(repunicode);
8272        }
8273    }
8274    /* 0-terminate the output string */
8275    *output++ = '\0';
8276    Py_XDECREF(exc);
8277    Py_XDECREF(errorHandler);
8278    return 0;
8279
8280  onError:
8281    Py_XDECREF(exc);
8282    Py_XDECREF(errorHandler);
8283    return -1;
8284}
8285
8286/* --- Helpers ------------------------------------------------------------ */
8287
8288#include "stringlib/ucs1lib.h"
8289#include "stringlib/fastsearch.h"
8290#include "stringlib/partition.h"
8291#include "stringlib/split.h"
8292#include "stringlib/count.h"
8293#include "stringlib/find.h"
8294#include "stringlib/localeutil.h"
8295#include "stringlib/undef.h"
8296
8297#include "stringlib/ucs2lib.h"
8298#include "stringlib/fastsearch.h"
8299#include "stringlib/partition.h"
8300#include "stringlib/split.h"
8301#include "stringlib/count.h"
8302#include "stringlib/find.h"
8303#include "stringlib/localeutil.h"
8304#include "stringlib/undef.h"
8305
8306#include "stringlib/ucs4lib.h"
8307#include "stringlib/fastsearch.h"
8308#include "stringlib/partition.h"
8309#include "stringlib/split.h"
8310#include "stringlib/count.h"
8311#include "stringlib/find.h"
8312#include "stringlib/localeutil.h"
8313#include "stringlib/undef.h"
8314
8315static Py_ssize_t
8316any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8317                                  const Py_UCS1*, Py_ssize_t,
8318                                  Py_ssize_t, Py_ssize_t),
8319               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8320                                  const Py_UCS2*, Py_ssize_t,
8321                                  Py_ssize_t, Py_ssize_t),
8322               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8323                                  const Py_UCS4*, Py_ssize_t,
8324                                  Py_ssize_t, Py_ssize_t),
8325               PyObject* s1, PyObject* s2,
8326               Py_ssize_t start,
8327               Py_ssize_t end)
8328{
8329    int kind1, kind2, kind;
8330    void *buf1, *buf2;
8331    Py_ssize_t len1, len2, result;
8332
8333    kind1 = PyUnicode_KIND(s1);
8334    kind2 = PyUnicode_KIND(s2);
8335    kind = kind1 > kind2 ? kind1 : kind2;
8336    buf1 = PyUnicode_DATA(s1);
8337    buf2 = PyUnicode_DATA(s2);
8338    if (kind1 != kind)
8339        buf1 = _PyUnicode_AsKind(s1, kind);
8340    if (!buf1)
8341        return -2;
8342    if (kind2 != kind)
8343        buf2 = _PyUnicode_AsKind(s2, kind);
8344    if (!buf2) {
8345        if (kind1 != kind) PyMem_Free(buf1);
8346        return -2;
8347    }
8348    len1 = PyUnicode_GET_LENGTH(s1);
8349    len2 = PyUnicode_GET_LENGTH(s2);
8350
8351    switch(kind) {
8352    case PyUnicode_1BYTE_KIND:
8353        result = ucs1(buf1, len1, buf2, len2, start, end);
8354        break;
8355    case PyUnicode_2BYTE_KIND:
8356        result = ucs2(buf1, len1, buf2, len2, start, end);
8357        break;
8358    case PyUnicode_4BYTE_KIND:
8359        result = ucs4(buf1, len1, buf2, len2, start, end);
8360        break;
8361    default:
8362        assert(0); result = -2;
8363    }
8364
8365    if (kind1 != kind)
8366        PyMem_Free(buf1);
8367    if (kind2 != kind)
8368        PyMem_Free(buf2);
8369
8370    return result;
8371}
8372
8373Py_ssize_t
8374_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8375                                   Py_ssize_t n_buffer,
8376                                   void *digits, Py_ssize_t n_digits,
8377                                   Py_ssize_t min_width,
8378                                   const char *grouping,
8379                                   const char *thousands_sep)
8380{
8381    switch(kind) {
8382    case PyUnicode_1BYTE_KIND:
8383        return _PyUnicode_ucs1_InsertThousandsGrouping(
8384            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8385            min_width, grouping, thousands_sep);
8386    case PyUnicode_2BYTE_KIND:
8387        return _PyUnicode_ucs2_InsertThousandsGrouping(
8388            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8389            min_width, grouping, thousands_sep);
8390    case PyUnicode_4BYTE_KIND:
8391        return _PyUnicode_ucs4_InsertThousandsGrouping(
8392            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8393            min_width, grouping, thousands_sep);
8394    }
8395    assert(0);
8396    return -1;
8397}
8398
8399
8400#include "stringlib/unicodedefs.h"
8401#include "stringlib/fastsearch.h"
8402
8403#include "stringlib/count.h"
8404#include "stringlib/find.h"
8405
/* Helper macro to fix up start/end slice values against a length:
   - an end greater than len becomes len;
   - a negative end or start has len added and is then clamped at 0;
   - start is not clamped from above.
   start and end must be modifiable lvalues and are updated in place.
   Every argument is parenthesized in the expansion so that expressions
   such as "a ? b : c" may be passed for len; arguments may be evaluated
   more than once.  The expansion is a sequence of if statements, so use
   the macro only as a statement of its own inside a braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
8420
8421Py_ssize_t
8422PyUnicode_Count(PyObject *str,
8423                PyObject *substr,
8424                Py_ssize_t start,
8425                Py_ssize_t end)
8426{
8427    Py_ssize_t result;
8428    PyUnicodeObject* str_obj;
8429    PyUnicodeObject* sub_obj;
8430    int kind1, kind2, kind;
8431    void *buf1 = NULL, *buf2 = NULL;
8432    Py_ssize_t len1, len2;
8433
8434    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8435    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8436        return -1;
8437    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8438    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8439        Py_DECREF(str_obj);
8440        return -1;
8441    }
8442
8443    kind1 = PyUnicode_KIND(str_obj);
8444    kind2 = PyUnicode_KIND(sub_obj);
8445    kind = kind1 > kind2 ? kind1 : kind2;
8446    buf1 = PyUnicode_DATA(str_obj);
8447    if (kind1 != kind)
8448        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8449    if (!buf1)
8450        goto onError;
8451    buf2 = PyUnicode_DATA(sub_obj);
8452    if (kind2 != kind)
8453        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8454    if (!buf2)
8455        goto onError;
8456    len1 = PyUnicode_GET_LENGTH(str_obj);
8457    len2 = PyUnicode_GET_LENGTH(sub_obj);
8458
8459    ADJUST_INDICES(start, end, len1);
8460    switch(kind) {
8461    case PyUnicode_1BYTE_KIND:
8462        result = ucs1lib_count(
8463            ((Py_UCS1*)buf1) + start, end - start,
8464            buf2, len2, PY_SSIZE_T_MAX
8465            );
8466        break;
8467    case PyUnicode_2BYTE_KIND:
8468        result = ucs2lib_count(
8469            ((Py_UCS2*)buf1) + start, end - start,
8470            buf2, len2, PY_SSIZE_T_MAX
8471            );
8472        break;
8473    case PyUnicode_4BYTE_KIND:
8474        result = ucs4lib_count(
8475            ((Py_UCS4*)buf1) + start, end - start,
8476            buf2, len2, PY_SSIZE_T_MAX
8477            );
8478        break;
8479    default:
8480        assert(0); result = 0;
8481    }
8482
8483    Py_DECREF(sub_obj);
8484    Py_DECREF(str_obj);
8485
8486    if (kind1 != kind)
8487        PyMem_Free(buf1);
8488    if (kind2 != kind)
8489        PyMem_Free(buf2);
8490
8491    return result;
8492  onError:
8493    Py_DECREF(sub_obj);
8494    Py_DECREF(str_obj);
8495    if (kind1 != kind && buf1)
8496        PyMem_Free(buf1);
8497    if (kind2 != kind && buf2)
8498        PyMem_Free(buf2);
8499    return -1;
8500}
8501
8502Py_ssize_t
8503PyUnicode_Find(PyObject *str,
8504               PyObject *sub,
8505               Py_ssize_t start,
8506               Py_ssize_t end,
8507               int direction)
8508{
8509    Py_ssize_t result;
8510
8511    str = PyUnicode_FromObject(str);
8512    if (!str || PyUnicode_READY(str) == -1)
8513        return -2;
8514    sub = PyUnicode_FromObject(sub);
8515    if (!sub || PyUnicode_READY(sub) == -1) {
8516        Py_DECREF(str);
8517        return -2;
8518    }
8519
8520    if (direction > 0)
8521        result = any_find_slice(
8522            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8523            str, sub, start, end
8524            );
8525    else
8526        result = any_find_slice(
8527            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8528            str, sub, start, end
8529            );
8530
8531    Py_DECREF(str);
8532    Py_DECREF(sub);
8533
8534    return result;
8535}
8536
8537Py_ssize_t
8538PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8539                   Py_ssize_t start, Py_ssize_t end,
8540                   int direction)
8541{
8542    char *result;
8543    int kind;
8544    if (PyUnicode_READY(str) == -1)
8545        return -2;
8546    if (start < 0 || end < 0) {
8547        PyErr_SetString(PyExc_IndexError, "string index out of range");
8548        return -2;
8549    }
8550    if (end > PyUnicode_GET_LENGTH(str))
8551        end = PyUnicode_GET_LENGTH(str);
8552    kind = PyUnicode_KIND(str);
8553    result = findchar(PyUnicode_1BYTE_DATA(str)
8554                      + PyUnicode_KIND_SIZE(kind, start),
8555                      kind,
8556                      end-start, ch, direction);
8557    if (!result)
8558        return -1;
8559    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8560}
8561
8562static int
8563tailmatch(PyUnicodeObject *self,
8564          PyUnicodeObject *substring,
8565          Py_ssize_t start,
8566          Py_ssize_t end,
8567          int direction)
8568{
8569    int kind_self;
8570    int kind_sub;
8571    void *data_self;
8572    void *data_sub;
8573    Py_ssize_t offset;
8574    Py_ssize_t i;
8575    Py_ssize_t end_sub;
8576
8577    if (PyUnicode_READY(self) == -1 ||
8578        PyUnicode_READY(substring) == -1)
8579        return 0;
8580
8581    if (PyUnicode_GET_LENGTH(substring) == 0)
8582        return 1;
8583
8584    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8585    end -= PyUnicode_GET_LENGTH(substring);
8586    if (end < start)
8587        return 0;
8588
8589    kind_self = PyUnicode_KIND(self);
8590    data_self = PyUnicode_DATA(self);
8591    kind_sub = PyUnicode_KIND(substring);
8592    data_sub = PyUnicode_DATA(substring);
8593    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8594
8595    if (direction > 0)
8596        offset = end;
8597    else
8598        offset = start;
8599
8600    if (PyUnicode_READ(kind_self, data_self, offset) ==
8601        PyUnicode_READ(kind_sub, data_sub, 0) &&
8602        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8603        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8604        /* If both are of the same kind, memcmp is sufficient */
8605        if (kind_self == kind_sub) {
8606            return ! memcmp((char *)data_self +
8607                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8608                            data_sub,
8609                            PyUnicode_GET_LENGTH(substring) *
8610                                PyUnicode_CHARACTER_SIZE(substring));
8611        }
8612        /* otherwise we have to compare each character by first accesing it */
8613        else {
8614            /* We do not need to compare 0 and len(substring)-1 because
8615               the if statement above ensured already that they are equal
8616               when we end up here. */
8617            // TODO: honor direction and do a forward or backwards search
8618            for (i = 1; i < end_sub; ++i) {
8619                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8620                    PyUnicode_READ(kind_sub, data_sub, i))
8621                    return 0;
8622            }
8623            return 1;
8624        }
8625    }
8626
8627    return 0;
8628}
8629
8630Py_ssize_t
8631PyUnicode_Tailmatch(PyObject *str,
8632                    PyObject *substr,
8633                    Py_ssize_t start,
8634                    Py_ssize_t end,
8635                    int direction)
8636{
8637    Py_ssize_t result;
8638
8639    str = PyUnicode_FromObject(str);
8640    if (str == NULL)
8641        return -1;
8642    substr = PyUnicode_FromObject(substr);
8643    if (substr == NULL) {
8644        Py_DECREF(str);
8645        return -1;
8646    }
8647
8648    result = tailmatch((PyUnicodeObject *)str,
8649                       (PyUnicodeObject *)substr,
8650                       start, end, direction);
8651    Py_DECREF(str);
8652    Py_DECREF(substr);
8653    return result;
8654}
8655
8656/* Apply fixfct filter to the Unicode object self and return a
8657   reference to the modified object */
8658
8659static PyObject *
8660fixup(PyUnicodeObject *self,
8661      Py_UCS4 (*fixfct)(PyUnicodeObject *s))
8662{
8663    PyObject *u;
8664    Py_UCS4 maxchar_old, maxchar_new = 0;
8665
8666    if (PyUnicode_READY(self) == -1)
8667        return NULL;
8668    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8669    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8670                      maxchar_old);
8671    if (u == NULL)
8672        return NULL;
8673
8674    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8675              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8676
8677    /* fix functions return the new maximum character in a string,
8678       if the kind of the resulting unicode object does not change,
8679       everything is fine.  Otherwise we need to change the string kind
8680       and re-run the fix function. */
8681    maxchar_new = fixfct((PyUnicodeObject*)u);
8682    if (maxchar_new == 0)
8683        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8684    else if (maxchar_new <= 127)
8685        maxchar_new = 127;
8686    else if (maxchar_new <= 255)
8687        maxchar_new = 255;
8688    else if (maxchar_new <= 65535)
8689        maxchar_new = 65535;
8690    else
8691        maxchar_new = 1114111; /* 0x10ffff */
8692
8693    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8694        /* fixfct should return TRUE if it modified the buffer. If
8695           FALSE, return a reference to the original buffer instead
8696           (to save space, not time) */
8697        Py_INCREF(self);
8698        Py_DECREF(u);
8699        return (PyObject*) self;
8700    }
8701    else if (maxchar_new == maxchar_old) {
8702        return u;
8703    }
8704    else {
8705        /* In case the maximum character changed, we need to
8706           convert the string to the new category. */
8707        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8708        if (v == NULL) {
8709            Py_DECREF(u);
8710            return NULL;
8711        }
8712        if (maxchar_new > maxchar_old) {
8713            /* If the maxchar increased so that the kind changed, not all
8714               characters are representable anymore and we need to fix the
8715               string again. This only happens in very few cases. */
8716            if (PyUnicode_CopyCharacters(v, 0,
8717                                         (PyObject*)self, 0,
8718                                         PyUnicode_GET_LENGTH(self)) < 0)
8719            {
8720                Py_DECREF(u);
8721                return NULL;
8722            }
8723            maxchar_old = fixfct((PyUnicodeObject*)v);
8724            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8725        }
8726        else {
8727            if (PyUnicode_CopyCharacters(v, 0,
8728                                         u, 0,
8729                                         PyUnicode_GET_LENGTH(self)) < 0)
8730            {
8731                Py_DECREF(u);
8732                return NULL;
8733            }
8734        }
8735
8736        Py_DECREF(u);
8737        return v;
8738    }
8739}
8740
8741static Py_UCS4
8742fixupper(PyUnicodeObject *self)
8743{
8744    /* No need to call PyUnicode_READY(self) because this function is only
8745       called as a callback from fixup() which does it already. */
8746    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8747    const int kind = PyUnicode_KIND(self);
8748    void *data = PyUnicode_DATA(self);
8749    int touched = 0;
8750    Py_UCS4 maxchar = 0;
8751    Py_ssize_t i;
8752
8753    for (i = 0; i < len; ++i) {
8754        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8755        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8756        if (up != ch) {
8757            if (up > maxchar)
8758                maxchar = up;
8759            PyUnicode_WRITE(kind, data, i, up);
8760            touched = 1;
8761        }
8762        else if (ch > maxchar)
8763            maxchar = ch;
8764    }
8765
8766    if (touched)
8767        return maxchar;
8768    else
8769        return 0;
8770}
8771
8772static Py_UCS4
8773fixlower(PyUnicodeObject *self)
8774{
8775    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8776    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8777    const int kind = PyUnicode_KIND(self);
8778    void *data = PyUnicode_DATA(self);
8779    int touched = 0;
8780    Py_UCS4 maxchar = 0;
8781    Py_ssize_t i;
8782
8783    for(i = 0; i < len; ++i) {
8784        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8785        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8786        if (lo != ch) {
8787            if (lo > maxchar)
8788                maxchar = lo;
8789            PyUnicode_WRITE(kind, data, i, lo);
8790            touched = 1;
8791        }
8792        else if (ch > maxchar)
8793            maxchar = ch;
8794    }
8795
8796    if (touched)
8797        return maxchar;
8798    else
8799        return 0;
8800}
8801
8802static Py_UCS4
8803fixswapcase(PyUnicodeObject *self)
8804{
8805    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8806    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8807    const int kind = PyUnicode_KIND(self);
8808    void *data = PyUnicode_DATA(self);
8809    int touched = 0;
8810    Py_UCS4 maxchar = 0;
8811    Py_ssize_t i;
8812
8813    for(i = 0; i < len; ++i) {
8814        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8815        Py_UCS4 nu = 0;
8816
8817        if (Py_UNICODE_ISUPPER(ch))
8818            nu = Py_UNICODE_TOLOWER(ch);
8819        else if (Py_UNICODE_ISLOWER(ch))
8820            nu = Py_UNICODE_TOUPPER(ch);
8821
8822        if (nu != 0) {
8823            if (nu > maxchar)
8824                maxchar = nu;
8825            PyUnicode_WRITE(kind, data, i, nu);
8826            touched = 1;
8827        }
8828        else if (ch > maxchar)
8829            maxchar = ch;
8830    }
8831
8832    if (touched)
8833        return maxchar;
8834    else
8835        return 0;
8836}
8837
8838static Py_UCS4
8839fixcapitalize(PyUnicodeObject *self)
8840{
8841    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8842    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8843    const int kind = PyUnicode_KIND(self);
8844    void *data = PyUnicode_DATA(self);
8845    int touched = 0;
8846    Py_UCS4 maxchar = 0;
8847    Py_ssize_t i = 0;
8848    Py_UCS4 ch;
8849
8850    if (len == 0)
8851        return 0;
8852
8853    ch = PyUnicode_READ(kind, data, i);
8854    if (!Py_UNICODE_ISUPPER(ch)) {
8855        maxchar = Py_UNICODE_TOUPPER(ch);
8856        PyUnicode_WRITE(kind, data, i, maxchar);
8857        touched = 1;
8858    }
8859    ++i;
8860    for(; i < len; ++i) {
8861        ch = PyUnicode_READ(kind, data, i);
8862        if (!Py_UNICODE_ISLOWER(ch)) {
8863            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8864            if (lo > maxchar)
8865                maxchar = lo;
8866            PyUnicode_WRITE(kind, data, i, lo);
8867            touched = 1;
8868        }
8869        else if (ch > maxchar)
8870            maxchar = ch;
8871    }
8872
8873    if (touched)
8874        return maxchar;
8875    else
8876        return 0;
8877}
8878
8879static Py_UCS4
8880fixtitle(PyUnicodeObject *self)
8881{
8882    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8883    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8884    const int kind = PyUnicode_KIND(self);
8885    void *data = PyUnicode_DATA(self);
8886    Py_UCS4 maxchar = 0;
8887    Py_ssize_t i = 0;
8888    int previous_is_cased;
8889
8890    /* Shortcut for single character strings */
8891    if (len == 1) {
8892        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8893        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8894        if (ti != ch) {
8895            PyUnicode_WRITE(kind, data, i, ti);
8896            return ti;
8897        }
8898        else
8899            return 0;
8900    }
8901    previous_is_cased = 0;
8902    for(; i < len; ++i) {
8903        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8904        Py_UCS4 nu;
8905
8906        if (previous_is_cased)
8907            nu = Py_UNICODE_TOLOWER(ch);
8908        else
8909            nu = Py_UNICODE_TOTITLE(ch);
8910
8911        if (nu > maxchar)
8912            maxchar = nu;
8913        PyUnicode_WRITE(kind, data, i, nu);
8914
8915        if (Py_UNICODE_ISLOWER(ch) ||
8916            Py_UNICODE_ISUPPER(ch) ||
8917            Py_UNICODE_ISTITLE(ch))
8918            previous_is_cased = 1;
8919        else
8920            previous_is_cased = 0;
8921    }
8922    return maxchar;
8923}
8924
8925PyObject *
8926PyUnicode_Join(PyObject *separator, PyObject *seq)
8927{
8928    PyObject *sep = NULL;
8929    Py_ssize_t seplen = 1;
8930    PyObject *res = NULL; /* the result */
8931    PyObject *fseq;          /* PySequence_Fast(seq) */
8932    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
8933    PyObject **items;
8934    PyObject *item;
8935    Py_ssize_t sz, i, res_offset;
8936    Py_UCS4 maxchar = 0;
8937    Py_UCS4 item_maxchar;
8938
8939    fseq = PySequence_Fast(seq, "");
8940    if (fseq == NULL) {
8941        return NULL;
8942    }
8943
8944    /* NOTE: the following code can't call back into Python code,
8945     * so we are sure that fseq won't be mutated.
8946     */
8947
8948    seqlen = PySequence_Fast_GET_SIZE(fseq);
8949    /* If empty sequence, return u"". */
8950    if (seqlen == 0) {
8951        res = PyUnicode_New(0, 0);
8952        goto Done;
8953    }
8954    items = PySequence_Fast_ITEMS(fseq);
8955    /* If singleton sequence with an exact Unicode, return that. */
8956    if (seqlen == 1) {
8957        item = items[0];
8958        if (PyUnicode_CheckExact(item)) {
8959            Py_INCREF(item);
8960            res = item;
8961            goto Done;
8962        }
8963    }
8964    else {
8965        /* Set up sep and seplen */
8966        if (separator == NULL) {
8967            /* fall back to a blank space separator */
8968            sep = PyUnicode_FromOrdinal(' ');
8969            if (!sep)
8970                goto onError;
8971        }
8972        else {
8973            if (!PyUnicode_Check(separator)) {
8974                PyErr_Format(PyExc_TypeError,
8975                             "separator: expected str instance,"
8976                             " %.80s found",
8977                             Py_TYPE(separator)->tp_name);
8978                goto onError;
8979            }
8980            if (PyUnicode_READY(separator))
8981                goto onError;
8982            sep = separator;
8983            seplen = PyUnicode_GET_LENGTH(separator);
8984            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8985            /* inc refcount to keep this code path symmetric with the
8986               above case of a blank separator */
8987            Py_INCREF(sep);
8988        }
8989    }
8990
8991    /* There are at least two things to join, or else we have a subclass
8992     * of str in the sequence.
8993     * Do a pre-pass to figure out the total amount of space we'll
8994     * need (sz), and see whether all argument are strings.
8995     */
8996    sz = 0;
8997    for (i = 0; i < seqlen; i++) {
8998        const Py_ssize_t old_sz = sz;
8999        item = items[i];
9000        if (!PyUnicode_Check(item)) {
9001            PyErr_Format(PyExc_TypeError,
9002                         "sequence item %zd: expected str instance,"
9003                         " %.80s found",
9004                         i, Py_TYPE(item)->tp_name);
9005            goto onError;
9006        }
9007        if (PyUnicode_READY(item) == -1)
9008            goto onError;
9009        sz += PyUnicode_GET_LENGTH(item);
9010        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9011        if (item_maxchar > maxchar)
9012            maxchar = item_maxchar;
9013        if (i != 0)
9014            sz += seplen;
9015        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9016            PyErr_SetString(PyExc_OverflowError,
9017                            "join() result is too long for a Python string");
9018            goto onError;
9019        }
9020    }
9021
9022    res = PyUnicode_New(sz, maxchar);
9023    if (res == NULL)
9024        goto onError;
9025
9026    /* Catenate everything. */
9027    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9028        Py_ssize_t itemlen, copied;
9029        item = items[i];
9030        /* Copy item, and maybe the separator. */
9031        if (i && seplen != 0) {
9032            copied = PyUnicode_CopyCharacters(res, res_offset,
9033                                              sep, 0, seplen);
9034            if (copied < 0)
9035                goto onError;
9036#ifdef Py_DEBUG
9037            res_offset += copied;
9038#else
9039            res_offset += seplen;
9040#endif
9041        }
9042        itemlen = PyUnicode_GET_LENGTH(item);
9043        if (itemlen != 0) {
9044            copied = PyUnicode_CopyCharacters(res, res_offset,
9045                                              item, 0, itemlen);
9046            if (copied < 0)
9047                goto onError;
9048#ifdef Py_DEBUG
9049            res_offset += copied;
9050#else
9051            res_offset += itemlen;
9052#endif
9053        }
9054    }
9055    assert(res_offset == PyUnicode_GET_LENGTH(res));
9056
9057  Done:
9058    Py_DECREF(fseq);
9059    Py_XDECREF(sep);
9060    return res;
9061
9062  onError:
9063    Py_DECREF(fseq);
9064    Py_XDECREF(sep);
9065    Py_XDECREF(res);
9066    return NULL;
9067}
9068
/* Store `length` copies of the code point `value` into the character buffer
   `data`, starting at character index `start`.

   `kind` selects the element width: 1-byte data is filled with memset(),
   2-byte data is written as Py_UCS2 and every other kind as Py_UCS4.
   PyUnicode_WCHAR_KIND is rejected by an assertion.

   Every argument is parenthesized at each use so that arbitrary expressions
   may be passed.  `kind`, `value` and `length` can still be evaluated more
   than once, so arguments must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9091
9092static PyUnicodeObject *
9093pad(PyUnicodeObject *self,
9094    Py_ssize_t left,
9095    Py_ssize_t right,
9096    Py_UCS4 fill)
9097{
9098    PyObject *u;
9099    Py_UCS4 maxchar;
9100    int kind;
9101    void *data;
9102
9103    if (left < 0)
9104        left = 0;
9105    if (right < 0)
9106        right = 0;
9107
9108    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9109        Py_INCREF(self);
9110        return self;
9111    }
9112
9113    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9114        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9115        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9116        return NULL;
9117    }
9118    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9119    if (fill > maxchar)
9120        maxchar = fill;
9121    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9122    if (!u)
9123        return NULL;
9124
9125    kind = PyUnicode_KIND(u);
9126    data = PyUnicode_DATA(u);
9127    if (left)
9128        FILL(kind, data, fill, 0, left);
9129    if (right)
9130        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9131    if (PyUnicode_CopyCharacters(u, left,
9132                                 (PyObject*)self, 0,
9133                                 _PyUnicode_LENGTH(self)) < 0)
9134    {
9135        Py_DECREF(u);
9136        return NULL;
9137    }
9138
9139    return (PyUnicodeObject*)u;
9140}
9141#undef FILL
9142
9143PyObject *
9144PyUnicode_Splitlines(PyObject *string, int keepends)
9145{
9146    PyObject *list;
9147
9148    string = PyUnicode_FromObject(string);
9149    if (string == NULL || PyUnicode_READY(string) == -1)
9150        return NULL;
9151
9152    switch(PyUnicode_KIND(string)) {
9153    case PyUnicode_1BYTE_KIND:
9154        list = ucs1lib_splitlines(
9155            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9156            PyUnicode_GET_LENGTH(string), keepends);
9157        break;
9158    case PyUnicode_2BYTE_KIND:
9159        list = ucs2lib_splitlines(
9160            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9161            PyUnicode_GET_LENGTH(string), keepends);
9162        break;
9163    case PyUnicode_4BYTE_KIND:
9164        list = ucs4lib_splitlines(
9165            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9166            PyUnicode_GET_LENGTH(string), keepends);
9167        break;
9168    default:
9169        assert(0);
9170        list = 0;
9171    }
9172    Py_DECREF(string);
9173    return list;
9174}
9175
9176static PyObject *
9177split(PyUnicodeObject *self,
9178      PyUnicodeObject *substring,
9179      Py_ssize_t maxcount)
9180{
9181    int kind1, kind2, kind;
9182    void *buf1, *buf2;
9183    Py_ssize_t len1, len2;
9184    PyObject* out;
9185
9186    if (maxcount < 0)
9187        maxcount = PY_SSIZE_T_MAX;
9188
9189    if (PyUnicode_READY(self) == -1)
9190        return NULL;
9191
9192    if (substring == NULL)
9193        switch(PyUnicode_KIND(self)) {
9194        case PyUnicode_1BYTE_KIND:
9195            return ucs1lib_split_whitespace(
9196                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9197                PyUnicode_GET_LENGTH(self), maxcount
9198                );
9199        case PyUnicode_2BYTE_KIND:
9200            return ucs2lib_split_whitespace(
9201                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9202                PyUnicode_GET_LENGTH(self), maxcount
9203                );
9204        case PyUnicode_4BYTE_KIND:
9205            return ucs4lib_split_whitespace(
9206                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9207                PyUnicode_GET_LENGTH(self), maxcount
9208                );
9209        default:
9210            assert(0);
9211            return NULL;
9212        }
9213
9214    if (PyUnicode_READY(substring) == -1)
9215        return NULL;
9216
9217    kind1 = PyUnicode_KIND(self);
9218    kind2 = PyUnicode_KIND(substring);
9219    kind = kind1 > kind2 ? kind1 : kind2;
9220    buf1 = PyUnicode_DATA(self);
9221    buf2 = PyUnicode_DATA(substring);
9222    if (kind1 != kind)
9223        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9224    if (!buf1)
9225        return NULL;
9226    if (kind2 != kind)
9227        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9228    if (!buf2) {
9229        if (kind1 != kind) PyMem_Free(buf1);
9230        return NULL;
9231    }
9232    len1 = PyUnicode_GET_LENGTH(self);
9233    len2 = PyUnicode_GET_LENGTH(substring);
9234
9235    switch(kind) {
9236    case PyUnicode_1BYTE_KIND:
9237        out = ucs1lib_split(
9238            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9239        break;
9240    case PyUnicode_2BYTE_KIND:
9241        out = ucs2lib_split(
9242            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9243        break;
9244    case PyUnicode_4BYTE_KIND:
9245        out = ucs4lib_split(
9246            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9247        break;
9248    default:
9249        out = NULL;
9250    }
9251    if (kind1 != kind)
9252        PyMem_Free(buf1);
9253    if (kind2 != kind)
9254        PyMem_Free(buf2);
9255    return out;
9256}
9257
9258static PyObject *
9259rsplit(PyUnicodeObject *self,
9260       PyUnicodeObject *substring,
9261       Py_ssize_t maxcount)
9262{
9263    int kind1, kind2, kind;
9264    void *buf1, *buf2;
9265    Py_ssize_t len1, len2;
9266    PyObject* out;
9267
9268    if (maxcount < 0)
9269        maxcount = PY_SSIZE_T_MAX;
9270
9271    if (PyUnicode_READY(self) == -1)
9272        return NULL;
9273
9274    if (substring == NULL)
9275        switch(PyUnicode_KIND(self)) {
9276        case PyUnicode_1BYTE_KIND:
9277            return ucs1lib_rsplit_whitespace(
9278                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9279                PyUnicode_GET_LENGTH(self), maxcount
9280                );
9281        case PyUnicode_2BYTE_KIND:
9282            return ucs2lib_rsplit_whitespace(
9283                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9284                PyUnicode_GET_LENGTH(self), maxcount
9285                );
9286        case PyUnicode_4BYTE_KIND:
9287            return ucs4lib_rsplit_whitespace(
9288                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9289                PyUnicode_GET_LENGTH(self), maxcount
9290                );
9291        default:
9292            assert(0);
9293            return NULL;
9294        }
9295
9296    if (PyUnicode_READY(substring) == -1)
9297        return NULL;
9298
9299    kind1 = PyUnicode_KIND(self);
9300    kind2 = PyUnicode_KIND(substring);
9301    kind = kind1 > kind2 ? kind1 : kind2;
9302    buf1 = PyUnicode_DATA(self);
9303    buf2 = PyUnicode_DATA(substring);
9304    if (kind1 != kind)
9305        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9306    if (!buf1)
9307        return NULL;
9308    if (kind2 != kind)
9309        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9310    if (!buf2) {
9311        if (kind1 != kind) PyMem_Free(buf1);
9312        return NULL;
9313    }
9314    len1 = PyUnicode_GET_LENGTH(self);
9315    len2 = PyUnicode_GET_LENGTH(substring);
9316
9317    switch(kind) {
9318    case PyUnicode_1BYTE_KIND:
9319        out = ucs1lib_rsplit(
9320            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9321        break;
9322    case PyUnicode_2BYTE_KIND:
9323        out = ucs2lib_rsplit(
9324            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9325        break;
9326    case PyUnicode_4BYTE_KIND:
9327        out = ucs4lib_rsplit(
9328            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9329        break;
9330    default:
9331        out = NULL;
9332    }
9333    if (kind1 != kind)
9334        PyMem_Free(buf1);
9335    if (kind2 != kind)
9336        PyMem_Free(buf2);
9337    return out;
9338}
9339
9340static Py_ssize_t
9341anylib_find(int kind, void *buf1, Py_ssize_t len1,
9342            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9343{
9344    switch(kind) {
9345    case PyUnicode_1BYTE_KIND:
9346        return ucs1lib_find(buf1, len1, buf2, len2, offset);
9347    case PyUnicode_2BYTE_KIND:
9348        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9349    case PyUnicode_4BYTE_KIND:
9350        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9351    }
9352    assert(0);
9353    return -1;
9354}
9355
9356static Py_ssize_t
9357anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9358             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9359{
9360        switch(kind) {
9361        case PyUnicode_1BYTE_KIND:
9362            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9363        case PyUnicode_2BYTE_KIND:
9364            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9365        case PyUnicode_4BYTE_KIND:
9366            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9367        }
9368        assert(0);
9369        return 0;
9370}
9371
9372static PyObject *
9373replace(PyObject *self, PyObject *str1,
9374        PyObject *str2, Py_ssize_t maxcount)
9375{
9376    PyObject *u;
9377    char *sbuf = PyUnicode_DATA(self);
9378    char *buf1 = PyUnicode_DATA(str1);
9379    char *buf2 = PyUnicode_DATA(str2);
9380    int srelease = 0, release1 = 0, release2 = 0;
9381    int skind = PyUnicode_KIND(self);
9382    int kind1 = PyUnicode_KIND(str1);
9383    int kind2 = PyUnicode_KIND(str2);
9384    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9385    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9386    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9387
9388    if (maxcount < 0)
9389        maxcount = PY_SSIZE_T_MAX;
9390    else if (maxcount == 0 || slen == 0)
9391        goto nothing;
9392
9393    if (skind < kind1)
9394        /* substring too wide to be present */
9395        goto nothing;
9396
9397    if (len1 == len2) {
9398        Py_ssize_t i;
9399        /* same length */
9400        if (len1 == 0)
9401            goto nothing;
9402        if (len1 == 1) {
9403            /* replace characters */
9404            Py_UCS4 u1, u2, maxchar;
9405            int mayshrink, rkind;
9406            u1 = PyUnicode_READ_CHAR(str1, 0);
9407            if (!findchar(sbuf, PyUnicode_KIND(self),
9408                          slen, u1, 1))
9409                goto nothing;
9410            u2 = PyUnicode_READ_CHAR(str2, 0);
9411            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9412            /* Replacing u1 with u2 may cause a maxchar reduction in the
9413               result string. */
9414            if (u2 > maxchar) {
9415                maxchar = u2;
9416                mayshrink = 0;
9417            }
9418            else
9419                mayshrink = maxchar > 127;
9420            u = PyUnicode_New(slen, maxchar);
9421            if (!u)
9422                goto error;
9423            if (PyUnicode_CopyCharacters(u, 0,
9424                                         (PyObject*)self, 0, slen) < 0)
9425            {
9426                Py_DECREF(u);
9427                return NULL;
9428            }
9429            rkind = PyUnicode_KIND(u);
9430            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9431                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9432                    if (--maxcount < 0)
9433                        break;
9434                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9435                }
9436            if (mayshrink) {
9437                PyObject *tmp = u;
9438                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9439                                              PyUnicode_GET_LENGTH(tmp));
9440                Py_DECREF(tmp);
9441            }
9442        } else {
9443            int rkind = skind;
9444            char *res;
9445            if (kind1 < rkind) {
9446                /* widen substring */
9447                buf1 = _PyUnicode_AsKind(str1, rkind);
9448                if (!buf1) goto error;
9449                release1 = 1;
9450            }
9451            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9452            if (i < 0)
9453                goto nothing;
9454            if (rkind > kind2) {
9455                /* widen replacement */
9456                buf2 = _PyUnicode_AsKind(str2, rkind);
9457                if (!buf2) goto error;
9458                release2 = 1;
9459            }
9460            else if (rkind < kind2) {
9461                /* widen self and buf1 */
9462                rkind = kind2;
9463                if (release1) PyMem_Free(buf1);
9464                sbuf = _PyUnicode_AsKind(self, rkind);
9465                if (!sbuf) goto error;
9466                srelease = 1;
9467                buf1 = _PyUnicode_AsKind(str1, rkind);
9468                if (!buf1) goto error;
9469                release1 = 1;
9470            }
9471            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9472            if (!res) {
9473                PyErr_NoMemory();
9474                goto error;
9475            }
9476            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9477            /* change everything in-place, starting with this one */
9478            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9479                   buf2,
9480                   PyUnicode_KIND_SIZE(rkind, len2));
9481            i += len1;
9482
9483            while ( --maxcount > 0) {
9484                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9485                                slen-i,
9486                                buf1, len1, i);
9487                if (i == -1)
9488                    break;
9489                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9490                       buf2,
9491                       PyUnicode_KIND_SIZE(rkind, len2));
9492                i += len1;
9493            }
9494
9495            u = PyUnicode_FromKindAndData(rkind, res, slen);
9496            PyMem_Free(res);
9497            if (!u) goto error;
9498        }
9499    } else {
9500
9501        Py_ssize_t n, i, j, ires;
9502        Py_ssize_t product, new_size;
9503        int rkind = skind;
9504        char *res;
9505
9506        if (kind1 < rkind) {
9507            buf1 = _PyUnicode_AsKind(str1, rkind);
9508            if (!buf1) goto error;
9509            release1 = 1;
9510        }
9511        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9512        if (n == 0)
9513            goto nothing;
9514        if (kind2 < rkind) {
9515            buf2 = _PyUnicode_AsKind(str2, rkind);
9516            if (!buf2) goto error;
9517            release2 = 1;
9518        }
9519        else if (kind2 > rkind) {
9520            rkind = kind2;
9521            sbuf = _PyUnicode_AsKind(self, rkind);
9522            if (!sbuf) goto error;
9523            srelease = 1;
9524            if (release1) PyMem_Free(buf1);
9525            buf1 = _PyUnicode_AsKind(str1, rkind);
9526            if (!buf1) goto error;
9527            release1 = 1;
9528        }
9529        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9530           PyUnicode_GET_LENGTH(str1))); */
9531        product = n * (len2-len1);
9532        if ((product / (len2-len1)) != n) {
9533                PyErr_SetString(PyExc_OverflowError,
9534                                "replace string is too long");
9535                goto error;
9536        }
9537        new_size = slen + product;
9538        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9539            PyErr_SetString(PyExc_OverflowError,
9540                            "replace string is too long");
9541            goto error;
9542        }
9543        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9544        if (!res)
9545            goto error;
9546        ires = i = 0;
9547        if (len1 > 0) {
9548            while (n-- > 0) {
9549                /* look for next match */
9550                j = anylib_find(rkind,
9551                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
9552                                slen-i, buf1, len1, i);
9553                if (j == -1)
9554                    break;
9555                else if (j > i) {
9556                    /* copy unchanged part [i:j] */
9557                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9558                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9559                           PyUnicode_KIND_SIZE(rkind, j-i));
9560                    ires += j - i;
9561                }
9562                /* copy substitution string */
9563                if (len2 > 0) {
9564                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9565                           buf2,
9566                           PyUnicode_KIND_SIZE(rkind, len2));
9567                    ires += len2;
9568                }
9569                i = j + len1;
9570            }
9571            if (i < slen)
9572                /* copy tail [i:] */
9573                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9574                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9575                       PyUnicode_KIND_SIZE(rkind, slen-i));
9576        } else {
9577            /* interleave */
9578            while (n > 0) {
9579                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9580                       buf2,
9581                       PyUnicode_KIND_SIZE(rkind, len2));
9582                ires += len2;
9583                if (--n <= 0)
9584                    break;
9585                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9586                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9587                       PyUnicode_KIND_SIZE(rkind, 1));
9588                ires++;
9589                i++;
9590            }
9591            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9592                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9593                   PyUnicode_KIND_SIZE(rkind, slen-i));
9594        }
9595        u = PyUnicode_FromKindAndData(rkind, res, new_size);
9596        PyMem_Free(res);
9597    }
9598    if (srelease)
9599        PyMem_FREE(sbuf);
9600    if (release1)
9601        PyMem_FREE(buf1);
9602    if (release2)
9603        PyMem_FREE(buf2);
9604    return u;
9605
9606  nothing:
9607    /* nothing to replace; return original string (when possible) */
9608    if (srelease)
9609        PyMem_FREE(sbuf);
9610    if (release1)
9611        PyMem_FREE(buf1);
9612    if (release2)
9613        PyMem_FREE(buf2);
9614    if (PyUnicode_CheckExact(self)) {
9615        Py_INCREF(self);
9616        return (PyObject *) self;
9617    }
9618    return PyUnicode_Copy(self);
9619  error:
9620    if (srelease && sbuf)
9621        PyMem_FREE(sbuf);
9622    if (release1 && buf1)
9623        PyMem_FREE(buf1);
9624    if (release2 && buf2)
9625        PyMem_FREE(buf2);
9626    return NULL;
9627}
9628
9629/* --- Unicode Object Methods --------------------------------------------- */
9630
9631PyDoc_STRVAR(title__doc__,
9632             "S.title() -> str\n\
9633\n\
9634Return a titlecased version of S, i.e. words start with title case\n\
9635characters, all remaining cased characters have lower case.");
9636
9637static PyObject*
9638unicode_title(PyUnicodeObject *self)
9639{
9640    return fixup(self, fixtitle);
9641}
9642
9643PyDoc_STRVAR(capitalize__doc__,
9644             "S.capitalize() -> str\n\
9645\n\
9646Return a capitalized version of S, i.e. make the first character\n\
9647have upper case and the rest lower case.");
9648
9649static PyObject*
9650unicode_capitalize(PyUnicodeObject *self)
9651{
9652    return fixup(self, fixcapitalize);
9653}
9654
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out by the surrounding #if 0).
   Splits `self` on whitespace, replaces every word in the list by its
   fixcapitalize fixup, and joins the words with PyUnicode_Join() using a
   NULL separator.  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t pos;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (pos = 0; pos < PyList_GET_SIZE(words); pos++) {
        PyObject *capitalized = fixup(
            (PyUnicodeObject *)PyList_GET_ITEM(words, pos), fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Swap the original word for its capitalized version. */
        Py_DECREF(PyList_GET_ITEM(words, pos));
        PyList_SET_ITEM(words, pos, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9692
9693/* Argument converter.  Coerces to a single unicode character */
9694
9695static int
9696convert_uc(PyObject *obj, void *addr)
9697{
9698    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9699    PyObject *uniobj;
9700
9701    uniobj = PyUnicode_FromObject(obj);
9702    if (uniobj == NULL) {
9703        PyErr_SetString(PyExc_TypeError,
9704                        "The fill character cannot be converted to Unicode");
9705        return 0;
9706    }
9707    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9708        PyErr_SetString(PyExc_TypeError,
9709                        "The fill character must be exactly one character long");
9710        Py_DECREF(uniobj);
9711        return 0;
9712    }
9713    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9714    Py_DECREF(uniobj);
9715    return 1;
9716}
9717
9718PyDoc_STRVAR(center__doc__,
9719             "S.center(width[, fillchar]) -> str\n\
9720\n\
9721Return S centered in a string of length width. Padding is\n\
9722done using the specified fill character (default is a space)");
9723
9724static PyObject *
9725unicode_center(PyUnicodeObject *self, PyObject *args)
9726{
9727    Py_ssize_t marg, left;
9728    Py_ssize_t width;
9729    Py_UCS4 fillchar = ' ';
9730
9731    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9732        return NULL;
9733
9734    if (PyUnicode_READY(self) == -1)
9735        return NULL;
9736
9737    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9738        Py_INCREF(self);
9739        return (PyObject*) self;
9740    }
9741
9742    marg = width - _PyUnicode_LENGTH(self);
9743    left = marg / 2 + (marg & width & 1);
9744
9745    return (PyObject*) pad(self, left, marg - left, fillchar);
9746}
9747
9748#if 0
9749
9750/* This code should go into some future Unicode collation support
9751   module. The basic comparison should compare ordinals on a naive
9752   basis (this is what Java does and thus Jython too). */
9753
9754/* speedy UTF-16 code point order comparison */
9755/* gleaned from: */
9756/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9757
9758static short utf16Fixup[32] =
9759{
9760    0, 0, 0, 0, 0, 0, 0, 0,
9761    0, 0, 0, 0, 0, 0, 0, 0,
9762    0, 0, 0, 0, 0, 0, 0, 0,
9763    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
9764};
9765
9766static int
9767unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9768{
9769    Py_ssize_t len1, len2;
9770
9771    Py_UNICODE *s1 = str1->str;
9772    Py_UNICODE *s2 = str2->str;
9773
9774    len1 = str1->_base._base.length;
9775    len2 = str2->_base._base.length;
9776
9777    while (len1 > 0 && len2 > 0) {
9778        Py_UNICODE c1, c2;
9779
9780        c1 = *s1++;
9781        c2 = *s2++;
9782
9783        if (c1 > (1<<11) * 26)
9784            c1 += utf16Fixup[c1>>11];
9785        if (c2 > (1<<11) * 26)
9786            c2 += utf16Fixup[c2>>11];
9787        /* now c1 and c2 are in UTF-32-compatible order */
9788
9789        if (c1 != c2)
9790            return (c1 < c2) ? -1 : 1;
9791
9792        len1--; len2--;
9793    }
9794
9795    return (len1 < len2) ? -1 : (len1 != len2);
9796}
9797
9798#else
9799
9800/* This function assumes that str1 and str2 are readied by the caller. */
9801
9802static int
9803unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9804{
9805    int kind1, kind2;
9806    void *data1, *data2;
9807    Py_ssize_t len1, len2, i;
9808
9809    kind1 = PyUnicode_KIND(str1);
9810    kind2 = PyUnicode_KIND(str2);
9811    data1 = PyUnicode_DATA(str1);
9812    data2 = PyUnicode_DATA(str2);
9813    len1 = PyUnicode_GET_LENGTH(str1);
9814    len2 = PyUnicode_GET_LENGTH(str2);
9815
9816    for (i = 0; i < len1 && i < len2; ++i) {
9817        Py_UCS4 c1, c2;
9818        c1 = PyUnicode_READ(kind1, data1, i);
9819        c2 = PyUnicode_READ(kind2, data2, i);
9820
9821        if (c1 != c2)
9822            return (c1 < c2) ? -1 : 1;
9823    }
9824
9825    return (len1 < len2) ? -1 : (len1 != len2);
9826}
9827
9828#endif
9829
9830int
9831PyUnicode_Compare(PyObject *left, PyObject *right)
9832{
9833    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9834        if (PyUnicode_READY(left) == -1 ||
9835            PyUnicode_READY(right) == -1)
9836            return -1;
9837        return unicode_compare((PyUnicodeObject *)left,
9838                               (PyUnicodeObject *)right);
9839    }
9840    PyErr_Format(PyExc_TypeError,
9841                 "Can't compare %.100s and %.100s",
9842                 left->ob_type->tp_name,
9843                 right->ob_type->tp_name);
9844    return -1;
9845}
9846
9847int
9848PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9849{
9850    Py_ssize_t i;
9851    int kind;
9852    void *data;
9853    Py_UCS4 chr;
9854
9855    assert(_PyUnicode_CHECK(uni));
9856    if (PyUnicode_READY(uni) == -1)
9857        return -1;
9858    kind = PyUnicode_KIND(uni);
9859    data = PyUnicode_DATA(uni);
9860    /* Compare Unicode string and source character set string */
9861    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9862        if (chr != str[i])
9863            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9864    /* This check keeps Python strings that end in '\0' from comparing equal
9865     to C strings identical up to that point. */
9866    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9867        return 1; /* uni is longer */
9868    if (str[i])
9869        return -1; /* str is longer */
9870    return 0;
9871}
9872
9873
9874#define TEST_COND(cond)                         \
9875    ((cond) ? Py_True : Py_False)
9876
9877PyObject *
9878PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
9879{
9880    int result;
9881
9882    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9883        PyObject *v;
9884        if (PyUnicode_READY(left) == -1 ||
9885            PyUnicode_READY(right) == -1)
9886            return NULL;
9887        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9888            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
9889            if (op == Py_EQ) {
9890                Py_INCREF(Py_False);
9891                return Py_False;
9892            }
9893            if (op == Py_NE) {
9894                Py_INCREF(Py_True);
9895                return Py_True;
9896            }
9897        }
9898        if (left == right)
9899            result = 0;
9900        else
9901            result = unicode_compare((PyUnicodeObject *)left,
9902                                     (PyUnicodeObject *)right);
9903
9904        /* Convert the return value to a Boolean */
9905        switch (op) {
9906        case Py_EQ:
9907            v = TEST_COND(result == 0);
9908            break;
9909        case Py_NE:
9910            v = TEST_COND(result != 0);
9911            break;
9912        case Py_LE:
9913            v = TEST_COND(result <= 0);
9914            break;
9915        case Py_GE:
9916            v = TEST_COND(result >= 0);
9917            break;
9918        case Py_LT:
9919            v = TEST_COND(result == -1);
9920            break;
9921        case Py_GT:
9922            v = TEST_COND(result == 1);
9923            break;
9924        default:
9925            PyErr_BadArgument();
9926            return NULL;
9927        }
9928        Py_INCREF(v);
9929        return v;
9930    }
9931
9932    Py_RETURN_NOTIMPLEMENTED;
9933}
9934
9935int
9936PyUnicode_Contains(PyObject *container, PyObject *element)
9937{
9938    PyObject *str, *sub;
9939    int kind1, kind2, kind;
9940    void *buf1, *buf2;
9941    Py_ssize_t len1, len2;
9942    int result;
9943
9944    /* Coerce the two arguments */
9945    sub = PyUnicode_FromObject(element);
9946    if (!sub) {
9947        PyErr_Format(PyExc_TypeError,
9948                     "'in <string>' requires string as left operand, not %s",
9949                     element->ob_type->tp_name);
9950        return -1;
9951    }
9952    if (PyUnicode_READY(sub) == -1)
9953        return -1;
9954
9955    str = PyUnicode_FromObject(container);
9956    if (!str || PyUnicode_READY(str) == -1) {
9957        Py_DECREF(sub);
9958        return -1;
9959    }
9960
9961    kind1 = PyUnicode_KIND(str);
9962    kind2 = PyUnicode_KIND(sub);
9963    kind = kind1 > kind2 ? kind1 : kind2;
9964    buf1 = PyUnicode_DATA(str);
9965    buf2 = PyUnicode_DATA(sub);
9966    if (kind1 != kind)
9967        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9968    if (!buf1) {
9969        Py_DECREF(sub);
9970        return -1;
9971    }
9972    if (kind2 != kind)
9973        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9974    if (!buf2) {
9975        Py_DECREF(sub);
9976        if (kind1 != kind) PyMem_Free(buf1);
9977        return -1;
9978    }
9979    len1 = PyUnicode_GET_LENGTH(str);
9980    len2 = PyUnicode_GET_LENGTH(sub);
9981
9982    switch(kind) {
9983    case PyUnicode_1BYTE_KIND:
9984        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9985        break;
9986    case PyUnicode_2BYTE_KIND:
9987        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9988        break;
9989    case PyUnicode_4BYTE_KIND:
9990        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9991        break;
9992    default:
9993        result = -1;
9994        assert(0);
9995    }
9996
9997    Py_DECREF(str);
9998    Py_DECREF(sub);
9999
10000    if (kind1 != kind)
10001        PyMem_Free(buf1);
10002    if (kind2 != kind)
10003        PyMem_Free(buf2);
10004
10005    return result;
10006}
10007
10008/* Concat to string or Unicode object giving a new Unicode object. */
10009
10010PyObject *
10011PyUnicode_Concat(PyObject *left, PyObject *right)
10012{
10013    PyObject *u = NULL, *v = NULL, *w;
10014    Py_UCS4 maxchar;
10015
10016    /* Coerce the two arguments */
10017    u = PyUnicode_FromObject(left);
10018    if (u == NULL)
10019        goto onError;
10020    v = PyUnicode_FromObject(right);
10021    if (v == NULL)
10022        goto onError;
10023
10024    /* Shortcuts */
10025    if (v == unicode_empty) {
10026        Py_DECREF(v);
10027        return u;
10028    }
10029    if (u == unicode_empty) {
10030        Py_DECREF(u);
10031        return v;
10032    }
10033
10034    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10035    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
10036
10037    /* Concat the two Unicode strings */
10038    w = PyUnicode_New(
10039        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10040        maxchar);
10041    if (w == NULL)
10042        goto onError;
10043    if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10044        goto onError;
10045    if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
10046                                 v, 0,
10047                                 PyUnicode_GET_LENGTH(v)) < 0)
10048        goto onError;
10049    Py_DECREF(u);
10050    Py_DECREF(v);
10051    return w;
10052
10053  onError:
10054    Py_XDECREF(u);
10055    Py_XDECREF(v);
10056    return NULL;
10057}
10058
10059static void
10060unicode_append_inplace(PyObject **p_left, PyObject *right)
10061{
10062    Py_ssize_t left_len, right_len, new_len;
10063#ifdef Py_DEBUG
10064    Py_ssize_t copied;
10065#endif
10066
10067    assert(PyUnicode_IS_READY(*p_left));
10068    assert(PyUnicode_IS_READY(right));
10069
10070    left_len = PyUnicode_GET_LENGTH(*p_left);
10071    right_len = PyUnicode_GET_LENGTH(right);
10072    if (left_len > PY_SSIZE_T_MAX - right_len) {
10073        PyErr_SetString(PyExc_OverflowError,
10074                        "strings are too large to concat");
10075        goto error;
10076    }
10077    new_len = left_len + right_len;
10078
10079    /* Now we own the last reference to 'left', so we can resize it
10080     * in-place.
10081     */
10082    if (unicode_resize(p_left, new_len) != 0) {
10083        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10084         * deallocated so it cannot be put back into
10085         * 'variable'.  The MemoryError is raised when there
10086         * is no value in 'variable', which might (very
10087         * remotely) be a cause of incompatibilities.
10088         */
10089        goto error;
10090    }
10091    /* copy 'right' into the newly allocated area of 'left' */
10092#ifdef Py_DEBUG
10093    copied = PyUnicode_CopyCharacters(*p_left, left_len,
10094                                      right, 0,
10095                                      right_len);
10096    assert(0 <= copied);
10097#else
10098    PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10099#endif
10100    return;
10101
10102error:
10103    Py_DECREF(*p_left);
10104    *p_left = NULL;
10105}
10106
10107void
10108PyUnicode_Append(PyObject **p_left, PyObject *right)
10109{
10110    PyObject *left, *res;
10111
10112    if (p_left == NULL) {
10113        if (!PyErr_Occurred())
10114            PyErr_BadInternalCall();
10115        return;
10116    }
10117    left = *p_left;
10118    if (right == NULL || !PyUnicode_Check(left)) {
10119        if (!PyErr_Occurred())
10120            PyErr_BadInternalCall();
10121        goto error;
10122    }
10123
10124    if (PyUnicode_READY(left))
10125        goto error;
10126    if (PyUnicode_READY(right))
10127        goto error;
10128
10129    if (PyUnicode_CheckExact(left) && left != unicode_empty
10130        && PyUnicode_CheckExact(right) && right != unicode_empty
10131        && unicode_resizable(left)
10132        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10133            || _PyUnicode_WSTR(left) != NULL))
10134    {
10135        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10136           to change the structure size, but characters are stored just after
10137           the structure, and so it requires to move all characters which is
10138           not so different than duplicating the string. */
10139        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10140        {
10141            unicode_append_inplace(p_left, right);
10142            return;
10143        }
10144    }
10145
10146    res = PyUnicode_Concat(left, right);
10147    if (res == NULL)
10148        goto error;
10149    Py_DECREF(left);
10150    *p_left = res;
10151    return;
10152
10153error:
10154    Py_DECREF(*p_left);
10155    *p_left = NULL;
10156}
10157
10158void
10159PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10160{
10161    PyUnicode_Append(pleft, right);
10162    Py_XDECREF(right);
10163}
10164
10165PyDoc_STRVAR(count__doc__,
10166             "S.count(sub[, start[, end]]) -> int\n\
10167\n\
10168Return the number of non-overlapping occurrences of substring sub in\n\
10169string S[start:end].  Optional arguments start and end are\n\
10170interpreted as in slice notation.");
10171
10172static PyObject *
10173unicode_count(PyUnicodeObject *self, PyObject *args)
10174{
10175    PyUnicodeObject *substring;
10176    Py_ssize_t start = 0;
10177    Py_ssize_t end = PY_SSIZE_T_MAX;
10178    PyObject *result;
10179    int kind1, kind2, kind;
10180    void *buf1, *buf2;
10181    Py_ssize_t len1, len2, iresult;
10182
10183    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10184                                            &start, &end))
10185        return NULL;
10186
10187    kind1 = PyUnicode_KIND(self);
10188    kind2 = PyUnicode_KIND(substring);
10189    kind = kind1 > kind2 ? kind1 : kind2;
10190    buf1 = PyUnicode_DATA(self);
10191    buf2 = PyUnicode_DATA(substring);
10192    if (kind1 != kind)
10193        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10194    if (!buf1) {
10195        Py_DECREF(substring);
10196        return NULL;
10197    }
10198    if (kind2 != kind)
10199        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10200    if (!buf2) {
10201        Py_DECREF(substring);
10202        if (kind1 != kind) PyMem_Free(buf1);
10203        return NULL;
10204    }
10205    len1 = PyUnicode_GET_LENGTH(self);
10206    len2 = PyUnicode_GET_LENGTH(substring);
10207
10208    ADJUST_INDICES(start, end, len1);
10209    switch(kind) {
10210    case PyUnicode_1BYTE_KIND:
10211        iresult = ucs1lib_count(
10212            ((Py_UCS1*)buf1) + start, end - start,
10213            buf2, len2, PY_SSIZE_T_MAX
10214            );
10215        break;
10216    case PyUnicode_2BYTE_KIND:
10217        iresult = ucs2lib_count(
10218            ((Py_UCS2*)buf1) + start, end - start,
10219            buf2, len2, PY_SSIZE_T_MAX
10220            );
10221        break;
10222    case PyUnicode_4BYTE_KIND:
10223        iresult = ucs4lib_count(
10224            ((Py_UCS4*)buf1) + start, end - start,
10225            buf2, len2, PY_SSIZE_T_MAX
10226            );
10227        break;
10228    default:
10229        assert(0); iresult = 0;
10230    }
10231
10232    result = PyLong_FromSsize_t(iresult);
10233
10234    if (kind1 != kind)
10235        PyMem_Free(buf1);
10236    if (kind2 != kind)
10237        PyMem_Free(buf2);
10238
10239    Py_DECREF(substring);
10240
10241    return result;
10242}
10243
10244PyDoc_STRVAR(encode__doc__,
10245             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10246\n\
10247Encode S using the codec registered for encoding. Default encoding\n\
10248is 'utf-8'. errors may be given to set a different error\n\
10249handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10250a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10251'xmlcharrefreplace' as well as any other name registered with\n\
10252codecs.register_error that can handle UnicodeEncodeErrors.");
10253
10254static PyObject *
10255unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10256{
10257    static char *kwlist[] = {"encoding", "errors", 0};
10258    char *encoding = NULL;
10259    char *errors = NULL;
10260
10261    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10262                                     kwlist, &encoding, &errors))
10263        return NULL;
10264    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10265}
10266
10267PyDoc_STRVAR(expandtabs__doc__,
10268             "S.expandtabs([tabsize]) -> str\n\
10269\n\
10270Return a copy of S where all tab characters are expanded using spaces.\n\
10271If tabsize is not given, a tab size of 8 characters is assumed.");
10272
10273static PyObject*
10274unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10275{
10276    Py_ssize_t i, j, line_pos, src_len, incr;
10277    Py_UCS4 ch;
10278    PyObject *u;
10279    void *src_data, *dest_data;
10280    int tabsize = 8;
10281    int kind;
10282    int found;
10283
10284    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10285        return NULL;
10286
10287    if (PyUnicode_READY(self) == -1)
10288        return NULL;
10289
10290    /* First pass: determine size of output string */
10291    src_len = PyUnicode_GET_LENGTH(self);
10292    i = j = line_pos = 0;
10293    kind = PyUnicode_KIND(self);
10294    src_data = PyUnicode_DATA(self);
10295    found = 0;
10296    for (; i < src_len; i++) {
10297        ch = PyUnicode_READ(kind, src_data, i);
10298        if (ch == '\t') {
10299            found = 1;
10300            if (tabsize > 0) {
10301                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10302                if (j > PY_SSIZE_T_MAX - incr)
10303                    goto overflow;
10304                line_pos += incr;
10305                j += incr;
10306            }
10307        }
10308        else {
10309            if (j > PY_SSIZE_T_MAX - 1)
10310                goto overflow;
10311            line_pos++;
10312            j++;
10313            if (ch == '\n' || ch == '\r')
10314                line_pos = 0;
10315        }
10316    }
10317    if (!found && PyUnicode_CheckExact(self)) {
10318        Py_INCREF((PyObject *) self);
10319        return (PyObject *) self;
10320    }
10321
10322    /* Second pass: create output string and fill it */
10323    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10324    if (!u)
10325        return NULL;
10326    dest_data = PyUnicode_DATA(u);
10327
10328    i = j = line_pos = 0;
10329
10330    for (; i < src_len; i++) {
10331        ch = PyUnicode_READ(kind, src_data, i);
10332        if (ch == '\t') {
10333            if (tabsize > 0) {
10334                incr = tabsize - (line_pos % tabsize);
10335                line_pos += incr;
10336                while (incr--) {
10337                    PyUnicode_WRITE(kind, dest_data, j, ' ');
10338                    j++;
10339                }
10340            }
10341        }
10342        else {
10343            line_pos++;
10344            PyUnicode_WRITE(kind, dest_data, j, ch);
10345            j++;
10346            if (ch == '\n' || ch == '\r')
10347                line_pos = 0;
10348        }
10349    }
10350    assert (j == PyUnicode_GET_LENGTH(u));
10351#ifndef DONT_MAKE_RESULT_READY
10352    if (_PyUnicode_READY_REPLACE(&u)) {
10353        Py_DECREF(u);
10354        return NULL;
10355    }
10356#endif
10357    return (PyObject*) u;
10358
10359  overflow:
10360    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10361    return NULL;
10362}
10363
10364PyDoc_STRVAR(find__doc__,
10365             "S.find(sub[, start[, end]]) -> int\n\
10366\n\
10367Return the lowest index in S where substring sub is found,\n\
10368such that sub is contained within S[start:end].  Optional\n\
10369arguments start and end are interpreted as in slice notation.\n\
10370\n\
10371Return -1 on failure.");
10372
10373static PyObject *
10374unicode_find(PyObject *self, PyObject *args)
10375{
10376    PyUnicodeObject *substring;
10377    Py_ssize_t start;
10378    Py_ssize_t end;
10379    Py_ssize_t result;
10380
10381    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10382                                            &start, &end))
10383        return NULL;
10384
10385    if (PyUnicode_READY(self) == -1)
10386        return NULL;
10387    if (PyUnicode_READY(substring) == -1)
10388        return NULL;
10389
10390    result = any_find_slice(
10391        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10392        self, (PyObject*)substring, start, end
10393        );
10394
10395    Py_DECREF(substring);
10396
10397    if (result == -2)
10398        return NULL;
10399
10400    return PyLong_FromSsize_t(result);
10401}
10402
10403static PyObject *
10404unicode_getitem(PyObject *self, Py_ssize_t index)
10405{
10406    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10407    if (ch == (Py_UCS4)-1)
10408        return NULL;
10409    return PyUnicode_FromOrdinal(ch);
10410}
10411
10412/* Believe it or not, this produces the same value for ASCII strings
10413   as bytes_hash(). */
10414static Py_hash_t
10415unicode_hash(PyUnicodeObject *self)
10416{
10417    Py_ssize_t len;
10418    Py_uhash_t x;
10419
10420    if (_PyUnicode_HASH(self) != -1)
10421        return _PyUnicode_HASH(self);
10422    if (PyUnicode_READY(self) == -1)
10423        return -1;
10424    len = PyUnicode_GET_LENGTH(self);
10425
10426    /* The hash function as a macro, gets expanded three times below. */
10427#define HASH(P) \
10428    x = (Py_uhash_t)*P << 7; \
10429    while (--len >= 0) \
10430        x = (1000003*x) ^ (Py_uhash_t)*P++;
10431
10432    switch (PyUnicode_KIND(self)) {
10433    case PyUnicode_1BYTE_KIND: {
10434        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10435        HASH(c);
10436        break;
10437    }
10438    case PyUnicode_2BYTE_KIND: {
10439        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10440        HASH(s);
10441        break;
10442    }
10443    default: {
10444        Py_UCS4 *l;
10445        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10446               "Impossible switch case in unicode_hash");
10447        l = PyUnicode_4BYTE_DATA(self);
10448        HASH(l);
10449        break;
10450    }
10451    }
10452    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10453
10454    if (x == -1)
10455        x = -2;
10456    _PyUnicode_HASH(self) = x;
10457    return x;
10458}
10459#undef HASH
10460
10461PyDoc_STRVAR(index__doc__,
10462             "S.index(sub[, start[, end]]) -> int\n\
10463\n\
10464Like S.find() but raise ValueError when the substring is not found.");
10465
10466static PyObject *
10467unicode_index(PyObject *self, PyObject *args)
10468{
10469    Py_ssize_t result;
10470    PyUnicodeObject *substring;
10471    Py_ssize_t start;
10472    Py_ssize_t end;
10473
10474    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10475                                            &start, &end))
10476        return NULL;
10477
10478    if (PyUnicode_READY(self) == -1)
10479        return NULL;
10480    if (PyUnicode_READY(substring) == -1)
10481        return NULL;
10482
10483    result = any_find_slice(
10484        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10485        self, (PyObject*)substring, start, end
10486        );
10487
10488    Py_DECREF(substring);
10489
10490    if (result == -2)
10491        return NULL;
10492
10493    if (result < 0) {
10494        PyErr_SetString(PyExc_ValueError, "substring not found");
10495        return NULL;
10496    }
10497
10498    return PyLong_FromSsize_t(result);
10499}
10500
10501PyDoc_STRVAR(islower__doc__,
10502             "S.islower() -> bool\n\
10503\n\
10504Return True if all cased characters in S are lowercase and there is\n\
10505at least one cased character in S, False otherwise.");
10506
10507static PyObject*
10508unicode_islower(PyUnicodeObject *self)
10509{
10510    Py_ssize_t i, length;
10511    int kind;
10512    void *data;
10513    int cased;
10514
10515    if (PyUnicode_READY(self) == -1)
10516        return NULL;
10517    length = PyUnicode_GET_LENGTH(self);
10518    kind = PyUnicode_KIND(self);
10519    data = PyUnicode_DATA(self);
10520
10521    /* Shortcut for single character strings */
10522    if (length == 1)
10523        return PyBool_FromLong(
10524            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10525
10526    /* Special case for empty strings */
10527    if (length == 0)
10528        return PyBool_FromLong(0);
10529
10530    cased = 0;
10531    for (i = 0; i < length; i++) {
10532        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10533
10534        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10535            return PyBool_FromLong(0);
10536        else if (!cased && Py_UNICODE_ISLOWER(ch))
10537            cased = 1;
10538    }
10539    return PyBool_FromLong(cased);
10540}
10541
10542PyDoc_STRVAR(isupper__doc__,
10543             "S.isupper() -> bool\n\
10544\n\
10545Return True if all cased characters in S are uppercase and there is\n\
10546at least one cased character in S, False otherwise.");
10547
10548static PyObject*
10549unicode_isupper(PyUnicodeObject *self)
10550{
10551    Py_ssize_t i, length;
10552    int kind;
10553    void *data;
10554    int cased;
10555
10556    if (PyUnicode_READY(self) == -1)
10557        return NULL;
10558    length = PyUnicode_GET_LENGTH(self);
10559    kind = PyUnicode_KIND(self);
10560    data = PyUnicode_DATA(self);
10561
10562    /* Shortcut for single character strings */
10563    if (length == 1)
10564        return PyBool_FromLong(
10565            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10566
10567    /* Special case for empty strings */
10568    if (length == 0)
10569        return PyBool_FromLong(0);
10570
10571    cased = 0;
10572    for (i = 0; i < length; i++) {
10573        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10574
10575        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10576            return PyBool_FromLong(0);
10577        else if (!cased && Py_UNICODE_ISUPPER(ch))
10578            cased = 1;
10579    }
10580    return PyBool_FromLong(cased);
10581}
10582
10583PyDoc_STRVAR(istitle__doc__,
10584             "S.istitle() -> bool\n\
10585\n\
10586Return True if S is a titlecased string and there is at least one\n\
10587character in S, i.e. upper- and titlecase characters may only\n\
10588follow uncased characters and lowercase characters only cased ones.\n\
10589Return False otherwise.");
10590
10591static PyObject*
10592unicode_istitle(PyUnicodeObject *self)
10593{
10594    Py_ssize_t i, length;
10595    int kind;
10596    void *data;
10597    int cased, previous_is_cased;
10598
10599    if (PyUnicode_READY(self) == -1)
10600        return NULL;
10601    length = PyUnicode_GET_LENGTH(self);
10602    kind = PyUnicode_KIND(self);
10603    data = PyUnicode_DATA(self);
10604
10605    /* Shortcut for single character strings */
10606    if (length == 1) {
10607        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10608        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10609                               (Py_UNICODE_ISUPPER(ch) != 0));
10610    }
10611
10612    /* Special case for empty strings */
10613    if (length == 0)
10614        return PyBool_FromLong(0);
10615
10616    cased = 0;
10617    previous_is_cased = 0;
10618    for (i = 0; i < length; i++) {
10619        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10620
10621        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10622            if (previous_is_cased)
10623                return PyBool_FromLong(0);
10624            previous_is_cased = 1;
10625            cased = 1;
10626        }
10627        else if (Py_UNICODE_ISLOWER(ch)) {
10628            if (!previous_is_cased)
10629                return PyBool_FromLong(0);
10630            previous_is_cased = 1;
10631            cased = 1;
10632        }
10633        else
10634            previous_is_cased = 0;
10635    }
10636    return PyBool_FromLong(cased);
10637}
10638
10639PyDoc_STRVAR(isspace__doc__,
10640             "S.isspace() -> bool\n\
10641\n\
10642Return True if all characters in S are whitespace\n\
10643and there is at least one character in S, False otherwise.");
10644
10645static PyObject*
10646unicode_isspace(PyUnicodeObject *self)
10647{
10648    Py_ssize_t i, length;
10649    int kind;
10650    void *data;
10651
10652    if (PyUnicode_READY(self) == -1)
10653        return NULL;
10654    length = PyUnicode_GET_LENGTH(self);
10655    kind = PyUnicode_KIND(self);
10656    data = PyUnicode_DATA(self);
10657
10658    /* Shortcut for single character strings */
10659    if (length == 1)
10660        return PyBool_FromLong(
10661            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10662
10663    /* Special case for empty strings */
10664    if (length == 0)
10665        return PyBool_FromLong(0);
10666
10667    for (i = 0; i < length; i++) {
10668        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10669        if (!Py_UNICODE_ISSPACE(ch))
10670            return PyBool_FromLong(0);
10671    }
10672    return PyBool_FromLong(1);
10673}
10674
10675PyDoc_STRVAR(isalpha__doc__,
10676             "S.isalpha() -> bool\n\
10677\n\
10678Return True if all characters in S are alphabetic\n\
10679and there is at least one character in S, False otherwise.");
10680
10681static PyObject*
10682unicode_isalpha(PyUnicodeObject *self)
10683{
10684    Py_ssize_t i, length;
10685    int kind;
10686    void *data;
10687
10688    if (PyUnicode_READY(self) == -1)
10689        return NULL;
10690    length = PyUnicode_GET_LENGTH(self);
10691    kind = PyUnicode_KIND(self);
10692    data = PyUnicode_DATA(self);
10693
10694    /* Shortcut for single character strings */
10695    if (length == 1)
10696        return PyBool_FromLong(
10697            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10698
10699    /* Special case for empty strings */
10700    if (length == 0)
10701        return PyBool_FromLong(0);
10702
10703    for (i = 0; i < length; i++) {
10704        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10705            return PyBool_FromLong(0);
10706    }
10707    return PyBool_FromLong(1);
10708}
10709
10710PyDoc_STRVAR(isalnum__doc__,
10711             "S.isalnum() -> bool\n\
10712\n\
10713Return True if all characters in S are alphanumeric\n\
10714and there is at least one character in S, False otherwise.");
10715
10716static PyObject*
10717unicode_isalnum(PyUnicodeObject *self)
10718{
10719    int kind;
10720    void *data;
10721    Py_ssize_t len, i;
10722
10723    if (PyUnicode_READY(self) == -1)
10724        return NULL;
10725
10726    kind = PyUnicode_KIND(self);
10727    data = PyUnicode_DATA(self);
10728    len = PyUnicode_GET_LENGTH(self);
10729
10730    /* Shortcut for single character strings */
10731    if (len == 1) {
10732        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10733        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10734    }
10735
10736    /* Special case for empty strings */
10737    if (len == 0)
10738        return PyBool_FromLong(0);
10739
10740    for (i = 0; i < len; i++) {
10741        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10742        if (!Py_UNICODE_ISALNUM(ch))
10743            return PyBool_FromLong(0);
10744    }
10745    return PyBool_FromLong(1);
10746}
10747
10748PyDoc_STRVAR(isdecimal__doc__,
10749             "S.isdecimal() -> bool\n\
10750\n\
10751Return True if there are only decimal characters in S,\n\
10752False otherwise.");
10753
10754static PyObject*
10755unicode_isdecimal(PyUnicodeObject *self)
10756{
10757    Py_ssize_t i, length;
10758    int kind;
10759    void *data;
10760
10761    if (PyUnicode_READY(self) == -1)
10762        return NULL;
10763    length = PyUnicode_GET_LENGTH(self);
10764    kind = PyUnicode_KIND(self);
10765    data = PyUnicode_DATA(self);
10766
10767    /* Shortcut for single character strings */
10768    if (length == 1)
10769        return PyBool_FromLong(
10770            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10771
10772    /* Special case for empty strings */
10773    if (length == 0)
10774        return PyBool_FromLong(0);
10775
10776    for (i = 0; i < length; i++) {
10777        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10778            return PyBool_FromLong(0);
10779    }
10780    return PyBool_FromLong(1);
10781}
10782
10783PyDoc_STRVAR(isdigit__doc__,
10784             "S.isdigit() -> bool\n\
10785\n\
10786Return True if all characters in S are digits\n\
10787and there is at least one character in S, False otherwise.");
10788
10789static PyObject*
10790unicode_isdigit(PyUnicodeObject *self)
10791{
10792    Py_ssize_t i, length;
10793    int kind;
10794    void *data;
10795
10796    if (PyUnicode_READY(self) == -1)
10797        return NULL;
10798    length = PyUnicode_GET_LENGTH(self);
10799    kind = PyUnicode_KIND(self);
10800    data = PyUnicode_DATA(self);
10801
10802    /* Shortcut for single character strings */
10803    if (length == 1) {
10804        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10805        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10806    }
10807
10808    /* Special case for empty strings */
10809    if (length == 0)
10810        return PyBool_FromLong(0);
10811
10812    for (i = 0; i < length; i++) {
10813        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10814            return PyBool_FromLong(0);
10815    }
10816    return PyBool_FromLong(1);
10817}
10818
10819PyDoc_STRVAR(isnumeric__doc__,
10820             "S.isnumeric() -> bool\n\
10821\n\
10822Return True if there are only numeric characters in S,\n\
10823False otherwise.");
10824
10825static PyObject*
10826unicode_isnumeric(PyUnicodeObject *self)
10827{
10828    Py_ssize_t i, length;
10829    int kind;
10830    void *data;
10831
10832    if (PyUnicode_READY(self) == -1)
10833        return NULL;
10834    length = PyUnicode_GET_LENGTH(self);
10835    kind = PyUnicode_KIND(self);
10836    data = PyUnicode_DATA(self);
10837
10838    /* Shortcut for single character strings */
10839    if (length == 1)
10840        return PyBool_FromLong(
10841            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10842
10843    /* Special case for empty strings */
10844    if (length == 0)
10845        return PyBool_FromLong(0);
10846
10847    for (i = 0; i < length; i++) {
10848        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10849            return PyBool_FromLong(0);
10850    }
10851    return PyBool_FromLong(1);
10852}
10853
10854int
10855PyUnicode_IsIdentifier(PyObject *self)
10856{
10857    int kind;
10858    void *data;
10859    Py_ssize_t i;
10860    Py_UCS4 first;
10861
10862    if (PyUnicode_READY(self) == -1) {
10863        Py_FatalError("identifier not ready");
10864        return 0;
10865    }
10866
10867    /* Special case for empty strings */
10868    if (PyUnicode_GET_LENGTH(self) == 0)
10869        return 0;
10870    kind = PyUnicode_KIND(self);
10871    data = PyUnicode_DATA(self);
10872
10873    /* PEP 3131 says that the first character must be in
10874       XID_Start and subsequent characters in XID_Continue,
10875       and for the ASCII range, the 2.x rules apply (i.e
10876       start with letters and underscore, continue with
10877       letters, digits, underscore). However, given the current
10878       definition of XID_Start and XID_Continue, it is sufficient
10879       to check just for these, except that _ must be allowed
10880       as starting an identifier.  */
10881    first = PyUnicode_READ(kind, data, 0);
10882    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
10883        return 0;
10884
10885    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
10886        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
10887            return 0;
10888    return 1;
10889}
10890
10891PyDoc_STRVAR(isidentifier__doc__,
10892             "S.isidentifier() -> bool\n\
10893\n\
10894Return True if S is a valid identifier according\n\
10895to the language definition.");
10896
10897static PyObject*
10898unicode_isidentifier(PyObject *self)
10899{
10900    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10901}
10902
10903PyDoc_STRVAR(isprintable__doc__,
10904             "S.isprintable() -> bool\n\
10905\n\
10906Return True if all characters in S are considered\n\
10907printable in repr() or S is empty, False otherwise.");
10908
10909static PyObject*
10910unicode_isprintable(PyObject *self)
10911{
10912    Py_ssize_t i, length;
10913    int kind;
10914    void *data;
10915
10916    if (PyUnicode_READY(self) == -1)
10917        return NULL;
10918    length = PyUnicode_GET_LENGTH(self);
10919    kind = PyUnicode_KIND(self);
10920    data = PyUnicode_DATA(self);
10921
10922    /* Shortcut for single character strings */
10923    if (length == 1)
10924        return PyBool_FromLong(
10925            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
10926
10927    for (i = 0; i < length; i++) {
10928        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
10929            Py_RETURN_FALSE;
10930        }
10931    }
10932    Py_RETURN_TRUE;
10933}
10934
10935PyDoc_STRVAR(join__doc__,
10936             "S.join(iterable) -> str\n\
10937\n\
10938Return a string which is the concatenation of the strings in the\n\
10939iterable.  The separator between elements is S.");
10940
10941static PyObject*
10942unicode_join(PyObject *self, PyObject *data)
10943{
10944    return PyUnicode_Join(self, data);
10945}
10946
10947static Py_ssize_t
10948unicode_length(PyUnicodeObject *self)
10949{
10950    if (PyUnicode_READY(self) == -1)
10951        return -1;
10952    return PyUnicode_GET_LENGTH(self);
10953}
10954
10955PyDoc_STRVAR(ljust__doc__,
10956             "S.ljust(width[, fillchar]) -> str\n\
10957\n\
10958Return S left-justified in a Unicode string of length width. Padding is\n\
10959done using the specified fill character (default is a space).");
10960
10961static PyObject *
10962unicode_ljust(PyUnicodeObject *self, PyObject *args)
10963{
10964    Py_ssize_t width;
10965    Py_UCS4 fillchar = ' ';
10966
10967    if (PyUnicode_READY(self) == -1)
10968        return NULL;
10969
10970    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
10971        return NULL;
10972
10973    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10974        Py_INCREF(self);
10975        return (PyObject*) self;
10976    }
10977
10978    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
10979}
10980
10981PyDoc_STRVAR(lower__doc__,
10982             "S.lower() -> str\n\
10983\n\
10984Return a copy of the string S converted to lowercase.");
10985
10986static PyObject*
10987unicode_lower(PyUnicodeObject *self)
10988{
10989    return fixup(self, fixlower);
10990}
10991
/* Strip modes; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple() format string for each strip mode.  Neither the
   strings nor the table itself are ever modified, so both are const. */
static const char * const stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: the format string with its
   three-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[(i)]+3)
11000
11001/* externally visible for str.strip(unicode) */
11002PyObject *
11003_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
11004{
11005    void *data;
11006    int kind;
11007    Py_ssize_t i, j, len;
11008    BLOOM_MASK sepmask;
11009
11010    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11011        return NULL;
11012
11013    kind = PyUnicode_KIND(self);
11014    data = PyUnicode_DATA(self);
11015    len = PyUnicode_GET_LENGTH(self);
11016    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11017                              PyUnicode_DATA(sepobj),
11018                              PyUnicode_GET_LENGTH(sepobj));
11019
11020    i = 0;
11021    if (striptype != RIGHTSTRIP) {
11022        while (i < len &&
11023               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11024            i++;
11025        }
11026    }
11027
11028    j = len;
11029    if (striptype != LEFTSTRIP) {
11030        do {
11031            j--;
11032        } while (j >= i &&
11033                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11034        j++;
11035    }
11036
11037    return PyUnicode_Substring((PyObject*)self, i, j);
11038}
11039
11040PyObject*
11041PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11042{
11043    unsigned char *data;
11044    int kind;
11045    Py_ssize_t length;
11046
11047    if (PyUnicode_READY(self) == -1)
11048        return NULL;
11049
11050    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11051
11052    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11053    {
11054        if (PyUnicode_CheckExact(self)) {
11055            Py_INCREF(self);
11056            return self;
11057        }
11058        else
11059            return PyUnicode_Copy(self);
11060    }
11061
11062    length = end - start;
11063    if (length == 1)
11064        return unicode_getitem(self, start);
11065
11066    if (start < 0 || end < 0) {
11067        PyErr_SetString(PyExc_IndexError, "string index out of range");
11068        return NULL;
11069    }
11070
11071    if (PyUnicode_IS_ASCII(self)) {
11072        kind = PyUnicode_KIND(self);
11073        data = PyUnicode_1BYTE_DATA(self);
11074        return unicode_fromascii(data + start, length);
11075    }
11076    else {
11077        kind = PyUnicode_KIND(self);
11078        data = PyUnicode_1BYTE_DATA(self);
11079        return PyUnicode_FromKindAndData(kind,
11080                                         data + PyUnicode_KIND_SIZE(kind, start),
11081                                         length);
11082    }
11083}
11084
11085static PyObject *
11086do_strip(PyUnicodeObject *self, int striptype)
11087{
11088    int kind;
11089    void *data;
11090    Py_ssize_t len, i, j;
11091
11092    if (PyUnicode_READY(self) == -1)
11093        return NULL;
11094
11095    kind = PyUnicode_KIND(self);
11096    data = PyUnicode_DATA(self);
11097    len = PyUnicode_GET_LENGTH(self);
11098
11099    i = 0;
11100    if (striptype != RIGHTSTRIP) {
11101        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11102            i++;
11103        }
11104    }
11105
11106    j = len;
11107    if (striptype != LEFTSTRIP) {
11108        do {
11109            j--;
11110        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11111        j++;
11112    }
11113
11114    return PyUnicode_Substring((PyObject*)self, i, j);
11115}
11116
11117
11118static PyObject *
11119do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11120{
11121    PyObject *sep = NULL;
11122
11123    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11124        return NULL;
11125
11126    if (sep != NULL && sep != Py_None) {
11127        if (PyUnicode_Check(sep))
11128            return _PyUnicode_XStrip(self, striptype, sep);
11129        else {
11130            PyErr_Format(PyExc_TypeError,
11131                         "%s arg must be None or str",
11132                         STRIPNAME(striptype));
11133            return NULL;
11134        }
11135    }
11136
11137    return do_strip(self, striptype);
11138}
11139
11140
11141PyDoc_STRVAR(strip__doc__,
11142             "S.strip([chars]) -> str\n\
11143\n\
11144Return a copy of the string S with leading and trailing\n\
11145whitespace removed.\n\
11146If chars is given and not None, remove characters in chars instead.");
11147
11148static PyObject *
11149unicode_strip(PyUnicodeObject *self, PyObject *args)
11150{
11151    if (PyTuple_GET_SIZE(args) == 0)
11152        return do_strip(self, BOTHSTRIP); /* Common case */
11153    else
11154        return do_argstrip(self, BOTHSTRIP, args);
11155}
11156
11157
11158PyDoc_STRVAR(lstrip__doc__,
11159             "S.lstrip([chars]) -> str\n\
11160\n\
11161Return a copy of the string S with leading whitespace removed.\n\
11162If chars is given and not None, remove characters in chars instead.");
11163
11164static PyObject *
11165unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11166{
11167    if (PyTuple_GET_SIZE(args) == 0)
11168        return do_strip(self, LEFTSTRIP); /* Common case */
11169    else
11170        return do_argstrip(self, LEFTSTRIP, args);
11171}
11172
11173
11174PyDoc_STRVAR(rstrip__doc__,
11175             "S.rstrip([chars]) -> str\n\
11176\n\
11177Return a copy of the string S with trailing whitespace removed.\n\
11178If chars is given and not None, remove characters in chars instead.");
11179
11180static PyObject *
11181unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11182{
11183    if (PyTuple_GET_SIZE(args) == 0)
11184        return do_strip(self, RIGHTSTRIP); /* Common case */
11185    else
11186        return do_argstrip(self, RIGHTSTRIP, args);
11187}
11188
11189
11190static PyObject*
11191unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11192{
11193    PyUnicodeObject *u;
11194    Py_ssize_t nchars, n;
11195
11196    if (len < 1) {
11197        Py_INCREF(unicode_empty);
11198        return unicode_empty;
11199    }
11200
11201    if (len == 1 && PyUnicode_CheckExact(str)) {
11202        /* no repeat, return original string */
11203        Py_INCREF(str);
11204        return (PyObject*) str;
11205    }
11206
11207    if (PyUnicode_READY(str) == -1)
11208        return NULL;
11209
11210    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11211        PyErr_SetString(PyExc_OverflowError,
11212                        "repeated string is too long");
11213        return NULL;
11214    }
11215    nchars = len * PyUnicode_GET_LENGTH(str);
11216
11217    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11218    if (!u)
11219        return NULL;
11220    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11221
11222    if (PyUnicode_GET_LENGTH(str) == 1) {
11223        const int kind = PyUnicode_KIND(str);
11224        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11225        void *to = PyUnicode_DATA(u);
11226        if (kind == PyUnicode_1BYTE_KIND)
11227            memset(to, (unsigned char)fill_char, len);
11228        else {
11229            for (n = 0; n < len; ++n)
11230                PyUnicode_WRITE(kind, to, n, fill_char);
11231        }
11232    }
11233    else {
11234        /* number of characters copied this far */
11235        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11236        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11237        char *to = (char *) PyUnicode_DATA(u);
11238        Py_MEMCPY(to, PyUnicode_DATA(str),
11239                  PyUnicode_GET_LENGTH(str) * char_size);
11240        while (done < nchars) {
11241            n = (done <= nchars-done) ? done : nchars-done;
11242            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11243            done += n;
11244        }
11245    }
11246
11247    return (PyObject*) u;
11248}
11249
11250PyObject *
11251PyUnicode_Replace(PyObject *obj,
11252                  PyObject *subobj,
11253                  PyObject *replobj,
11254                  Py_ssize_t maxcount)
11255{
11256    PyObject *self;
11257    PyObject *str1;
11258    PyObject *str2;
11259    PyObject *result;
11260
11261    self = PyUnicode_FromObject(obj);
11262    if (self == NULL || PyUnicode_READY(self) == -1)
11263        return NULL;
11264    str1 = PyUnicode_FromObject(subobj);
11265    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11266        Py_DECREF(self);
11267        return NULL;
11268    }
11269    str2 = PyUnicode_FromObject(replobj);
11270    if (str2 == NULL || PyUnicode_READY(str2)) {
11271        Py_DECREF(self);
11272        Py_DECREF(str1);
11273        return NULL;
11274    }
11275    result = replace(self, str1, str2, maxcount);
11276    Py_DECREF(self);
11277    Py_DECREF(str1);
11278    Py_DECREF(str2);
11279    return result;
11280}
11281
11282PyDoc_STRVAR(replace__doc__,
11283             "S.replace(old, new[, count]) -> str\n\
11284\n\
11285Return a copy of S with all occurrences of substring\n\
11286old replaced by new.  If the optional argument count is\n\
11287given, only the first count occurrences are replaced.");
11288
11289static PyObject*
11290unicode_replace(PyObject *self, PyObject *args)
11291{
11292    PyObject *str1;
11293    PyObject *str2;
11294    Py_ssize_t maxcount = -1;
11295    PyObject *result;
11296
11297    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11298        return NULL;
11299    if (!PyUnicode_READY(self) == -1)
11300        return NULL;
11301    str1 = PyUnicode_FromObject(str1);
11302    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11303        return NULL;
11304    str2 = PyUnicode_FromObject(str2);
11305    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11306        Py_DECREF(str1);
11307        return NULL;
11308    }
11309
11310    result = replace(self, str1, str2, maxcount);
11311
11312    Py_DECREF(str1);
11313    Py_DECREF(str2);
11314    return result;
11315}
11316
11317static PyObject *
11318unicode_repr(PyObject *unicode)
11319{
11320    PyObject *repr;
11321    Py_ssize_t isize;
11322    Py_ssize_t osize, squote, dquote, i, o;
11323    Py_UCS4 max, quote;
11324    int ikind, okind;
11325    void *idata, *odata;
11326
11327    if (PyUnicode_READY(unicode) == -1)
11328        return NULL;
11329
11330    isize = PyUnicode_GET_LENGTH(unicode);
11331    idata = PyUnicode_DATA(unicode);
11332
11333    /* Compute length of output, quote characters, and
11334       maximum character */
11335    osize = 2; /* quotes */
11336    max = 127;
11337    squote = dquote = 0;
11338    ikind = PyUnicode_KIND(unicode);
11339    for (i = 0; i < isize; i++) {
11340        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11341        switch (ch) {
11342        case '\'': squote++; osize++; break;
11343        case '"':  dquote++; osize++; break;
11344        case '\\': case '\t': case '\r': case '\n':
11345            osize += 2; break;
11346        default:
11347            /* Fast-path ASCII */
11348            if (ch < ' ' || ch == 0x7f)
11349                osize += 4; /* \xHH */
11350            else if (ch < 0x7f)
11351                osize++;
11352            else if (Py_UNICODE_ISPRINTABLE(ch)) {
11353                osize++;
11354                max = ch > max ? ch : max;
11355            }
11356            else if (ch < 0x100)
11357                osize += 4; /* \xHH */
11358            else if (ch < 0x10000)
11359                osize += 6; /* \uHHHH */
11360            else
11361                osize += 10; /* \uHHHHHHHH */
11362        }
11363    }
11364
11365    quote = '\'';
11366    if (squote) {
11367        if (dquote)
11368            /* Both squote and dquote present. Use squote,
11369               and escape them */
11370            osize += squote;
11371        else
11372            quote = '"';
11373    }
11374
11375    repr = PyUnicode_New(osize, max);
11376    if (repr == NULL)
11377        return NULL;
11378    okind = PyUnicode_KIND(repr);
11379    odata = PyUnicode_DATA(repr);
11380
11381    PyUnicode_WRITE(okind, odata, 0, quote);
11382    PyUnicode_WRITE(okind, odata, osize-1, quote);
11383
11384    for (i = 0, o = 1; i < isize; i++) {
11385        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
11386
11387        /* Escape quotes and backslashes */
11388        if ((ch == quote) || (ch == '\\')) {
11389            PyUnicode_WRITE(okind, odata, o++, '\\');
11390            PyUnicode_WRITE(okind, odata, o++, ch);
11391            continue;
11392        }
11393
11394        /* Map special whitespace to '\t', \n', '\r' */
11395        if (ch == '\t') {
11396            PyUnicode_WRITE(okind, odata, o++, '\\');
11397            PyUnicode_WRITE(okind, odata, o++, 't');
11398        }
11399        else if (ch == '\n') {
11400            PyUnicode_WRITE(okind, odata, o++, '\\');
11401            PyUnicode_WRITE(okind, odata, o++, 'n');
11402        }
11403        else if (ch == '\r') {
11404            PyUnicode_WRITE(okind, odata, o++, '\\');
11405            PyUnicode_WRITE(okind, odata, o++, 'r');
11406        }
11407
11408        /* Map non-printable US ASCII to '\xhh' */
11409        else if (ch < ' ' || ch == 0x7F) {
11410            PyUnicode_WRITE(okind, odata, o++, '\\');
11411            PyUnicode_WRITE(okind, odata, o++, 'x');
11412            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11413            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11414        }
11415
11416        /* Copy ASCII characters as-is */
11417        else if (ch < 0x7F) {
11418            PyUnicode_WRITE(okind, odata, o++, ch);
11419        }
11420
11421        /* Non-ASCII characters */
11422        else {
11423            /* Map Unicode whitespace and control characters
11424               (categories Z* and C* except ASCII space)
11425            */
11426            if (!Py_UNICODE_ISPRINTABLE(ch)) {
11427                /* Map 8-bit characters to '\xhh' */
11428                if (ch <= 0xff) {
11429                    PyUnicode_WRITE(okind, odata, o++, '\\');
11430                    PyUnicode_WRITE(okind, odata, o++, 'x');
11431                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
11432                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
11433                }
11434                /* Map 21-bit characters to '\U00xxxxxx' */
11435                else if (ch >= 0x10000) {
11436                    PyUnicode_WRITE(okind, odata, o++, '\\');
11437                    PyUnicode_WRITE(okind, odata, o++, 'U');
11438                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
11439                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
11440                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
11441                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
11442                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11443                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11444                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11445                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11446                }
11447                /* Map 16-bit characters to '\uxxxx' */
11448                else {
11449                    PyUnicode_WRITE(okind, odata, o++, '\\');
11450                    PyUnicode_WRITE(okind, odata, o++, 'u');
11451                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
11452                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
11453                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
11454                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
11455                }
11456            }
11457            /* Copy characters as-is */
11458            else {
11459                PyUnicode_WRITE(okind, odata, o++, ch);
11460            }
11461        }
11462    }
11463    /* Closing quote already added at the beginning */
11464    return repr;
11465}
11466
11467PyDoc_STRVAR(rfind__doc__,
11468             "S.rfind(sub[, start[, end]]) -> int\n\
11469\n\
11470Return the highest index in S where substring sub is found,\n\
11471such that sub is contained within S[start:end].  Optional\n\
11472arguments start and end are interpreted as in slice notation.\n\
11473\n\
11474Return -1 on failure.");
11475
11476static PyObject *
11477unicode_rfind(PyObject *self, PyObject *args)
11478{
11479    PyUnicodeObject *substring;
11480    Py_ssize_t start;
11481    Py_ssize_t end;
11482    Py_ssize_t result;
11483
11484    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11485                                            &start, &end))
11486        return NULL;
11487
11488    if (PyUnicode_READY(self) == -1)
11489        return NULL;
11490    if (PyUnicode_READY(substring) == -1)
11491        return NULL;
11492
11493    result = any_find_slice(
11494        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11495        self, (PyObject*)substring, start, end
11496        );
11497
11498    Py_DECREF(substring);
11499
11500    if (result == -2)
11501        return NULL;
11502
11503    return PyLong_FromSsize_t(result);
11504}
11505
11506PyDoc_STRVAR(rindex__doc__,
11507             "S.rindex(sub[, start[, end]]) -> int\n\
11508\n\
11509Like S.rfind() but raise ValueError when the substring is not found.");
11510
11511static PyObject *
11512unicode_rindex(PyObject *self, PyObject *args)
11513{
11514    PyUnicodeObject *substring;
11515    Py_ssize_t start;
11516    Py_ssize_t end;
11517    Py_ssize_t result;
11518
11519    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11520                                            &start, &end))
11521        return NULL;
11522
11523    if (PyUnicode_READY(self) == -1)
11524        return NULL;
11525    if (PyUnicode_READY(substring) == -1)
11526        return NULL;
11527
11528    result = any_find_slice(
11529        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11530        self, (PyObject*)substring, start, end
11531        );
11532
11533    Py_DECREF(substring);
11534
11535    if (result == -2)
11536        return NULL;
11537
11538    if (result < 0) {
11539        PyErr_SetString(PyExc_ValueError, "substring not found");
11540        return NULL;
11541    }
11542
11543    return PyLong_FromSsize_t(result);
11544}
11545
11546PyDoc_STRVAR(rjust__doc__,
11547             "S.rjust(width[, fillchar]) -> str\n\
11548\n\
11549Return S right-justified in a string of length width. Padding is\n\
11550done using the specified fill character (default is a space).");
11551
11552static PyObject *
11553unicode_rjust(PyUnicodeObject *self, PyObject *args)
11554{
11555    Py_ssize_t width;
11556    Py_UCS4 fillchar = ' ';
11557
11558    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11559        return NULL;
11560
11561    if (PyUnicode_READY(self) == -1)
11562        return NULL;
11563
11564    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11565        Py_INCREF(self);
11566        return (PyObject*) self;
11567    }
11568
11569    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11570}
11571
11572PyObject *
11573PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11574{
11575    PyObject *result;
11576
11577    s = PyUnicode_FromObject(s);
11578    if (s == NULL)
11579        return NULL;
11580    if (sep != NULL) {
11581        sep = PyUnicode_FromObject(sep);
11582        if (sep == NULL) {
11583            Py_DECREF(s);
11584            return NULL;
11585        }
11586    }
11587
11588    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11589
11590    Py_DECREF(s);
11591    Py_XDECREF(sep);
11592    return result;
11593}
11594
11595PyDoc_STRVAR(split__doc__,
11596             "S.split([sep[, maxsplit]]) -> list of strings\n\
11597\n\
11598Return a list of the words in S, using sep as the\n\
11599delimiter string.  If maxsplit is given, at most maxsplit\n\
11600splits are done. If sep is not specified or is None, any\n\
11601whitespace string is a separator and empty strings are\n\
11602removed from the result.");
11603
11604static PyObject*
11605unicode_split(PyUnicodeObject *self, PyObject *args)
11606{
11607    PyObject *substring = Py_None;
11608    Py_ssize_t maxcount = -1;
11609
11610    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11611        return NULL;
11612
11613    if (substring == Py_None)
11614        return split(self, NULL, maxcount);
11615    else if (PyUnicode_Check(substring))
11616        return split(self, (PyUnicodeObject *)substring, maxcount);
11617    else
11618        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11619}
11620
11621PyObject *
11622PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11623{
11624    PyObject* str_obj;
11625    PyObject* sep_obj;
11626    PyObject* out;
11627    int kind1, kind2, kind;
11628    void *buf1 = NULL, *buf2 = NULL;
11629    Py_ssize_t len1, len2;
11630
11631    str_obj = PyUnicode_FromObject(str_in);
11632    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11633        return NULL;
11634    sep_obj = PyUnicode_FromObject(sep_in);
11635    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11636        Py_DECREF(str_obj);
11637        return NULL;
11638    }
11639
11640    kind1 = PyUnicode_KIND(str_in);
11641    kind2 = PyUnicode_KIND(sep_obj);
11642    kind = kind1 > kind2 ? kind1 : kind2;
11643    buf1 = PyUnicode_DATA(str_in);
11644    if (kind1 != kind)
11645        buf1 = _PyUnicode_AsKind(str_in, kind);
11646    if (!buf1)
11647        goto onError;
11648    buf2 = PyUnicode_DATA(sep_obj);
11649    if (kind2 != kind)
11650        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11651    if (!buf2)
11652        goto onError;
11653    len1 = PyUnicode_GET_LENGTH(str_obj);
11654    len2 = PyUnicode_GET_LENGTH(sep_obj);
11655
11656    switch(PyUnicode_KIND(str_in)) {
11657    case PyUnicode_1BYTE_KIND:
11658        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11659        break;
11660    case PyUnicode_2BYTE_KIND:
11661        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11662        break;
11663    case PyUnicode_4BYTE_KIND:
11664        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11665        break;
11666    default:
11667        assert(0);
11668        out = 0;
11669    }
11670
11671    Py_DECREF(sep_obj);
11672    Py_DECREF(str_obj);
11673    if (kind1 != kind)
11674        PyMem_Free(buf1);
11675    if (kind2 != kind)
11676        PyMem_Free(buf2);
11677
11678    return out;
11679  onError:
11680    Py_DECREF(sep_obj);
11681    Py_DECREF(str_obj);
11682    if (kind1 != kind && buf1)
11683        PyMem_Free(buf1);
11684    if (kind2 != kind && buf2)
11685        PyMem_Free(buf2);
11686    return NULL;
11687}
11688
11689
11690PyObject *
11691PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11692{
11693    PyObject* str_obj;
11694    PyObject* sep_obj;
11695    PyObject* out;
11696    int kind1, kind2, kind;
11697    void *buf1 = NULL, *buf2 = NULL;
11698    Py_ssize_t len1, len2;
11699
11700    str_obj = PyUnicode_FromObject(str_in);
11701    if (!str_obj)
11702        return NULL;
11703    sep_obj = PyUnicode_FromObject(sep_in);
11704    if (!sep_obj) {
11705        Py_DECREF(str_obj);
11706        return NULL;
11707    }
11708
11709    kind1 = PyUnicode_KIND(str_in);
11710    kind2 = PyUnicode_KIND(sep_obj);
11711    kind = Py_MAX(kind1, kind2);
11712    buf1 = PyUnicode_DATA(str_in);
11713    if (kind1 != kind)
11714        buf1 = _PyUnicode_AsKind(str_in, kind);
11715    if (!buf1)
11716        goto onError;
11717    buf2 = PyUnicode_DATA(sep_obj);
11718    if (kind2 != kind)
11719        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11720    if (!buf2)
11721        goto onError;
11722    len1 = PyUnicode_GET_LENGTH(str_obj);
11723    len2 = PyUnicode_GET_LENGTH(sep_obj);
11724
11725    switch(PyUnicode_KIND(str_in)) {
11726    case PyUnicode_1BYTE_KIND:
11727        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11728        break;
11729    case PyUnicode_2BYTE_KIND:
11730        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11731        break;
11732    case PyUnicode_4BYTE_KIND:
11733        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11734        break;
11735    default:
11736        assert(0);
11737        out = 0;
11738    }
11739
11740    Py_DECREF(sep_obj);
11741    Py_DECREF(str_obj);
11742    if (kind1 != kind)
11743        PyMem_Free(buf1);
11744    if (kind2 != kind)
11745        PyMem_Free(buf2);
11746
11747    return out;
11748  onError:
11749    Py_DECREF(sep_obj);
11750    Py_DECREF(str_obj);
11751    if (kind1 != kind && buf1)
11752        PyMem_Free(buf1);
11753    if (kind2 != kind && buf2)
11754        PyMem_Free(buf2);
11755    return NULL;
11756}
11757
11758PyDoc_STRVAR(partition__doc__,
11759             "S.partition(sep) -> (head, sep, tail)\n\
11760\n\
11761Search for the separator sep in S, and return the part before it,\n\
11762the separator itself, and the part after it.  If the separator is not\n\
11763found, return S and two empty strings.");
11764
11765static PyObject*
11766unicode_partition(PyUnicodeObject *self, PyObject *separator)
11767{
11768    return PyUnicode_Partition((PyObject *)self, separator);
11769}
11770
11771PyDoc_STRVAR(rpartition__doc__,
11772             "S.rpartition(sep) -> (head, sep, tail)\n\
11773\n\
11774Search for the separator sep in S, starting at the end of S, and return\n\
11775the part before it, the separator itself, and the part after it.  If the\n\
11776separator is not found, return two empty strings and S.");
11777
11778static PyObject*
11779unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11780{
11781    return PyUnicode_RPartition((PyObject *)self, separator);
11782}
11783
11784PyObject *
11785PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11786{
11787    PyObject *result;
11788
11789    s = PyUnicode_FromObject(s);
11790    if (s == NULL)
11791        return NULL;
11792    if (sep != NULL) {
11793        sep = PyUnicode_FromObject(sep);
11794        if (sep == NULL) {
11795            Py_DECREF(s);
11796            return NULL;
11797        }
11798    }
11799
11800    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11801
11802    Py_DECREF(s);
11803    Py_XDECREF(sep);
11804    return result;
11805}
11806
11807PyDoc_STRVAR(rsplit__doc__,
11808             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11809\n\
11810Return a list of the words in S, using sep as the\n\
11811delimiter string, starting at the end of the string and\n\
11812working to the front.  If maxsplit is given, at most maxsplit\n\
11813splits are done. If sep is not specified, any whitespace string\n\
11814is a separator.");
11815
11816static PyObject*
11817unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11818{
11819    PyObject *substring = Py_None;
11820    Py_ssize_t maxcount = -1;
11821
11822    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11823        return NULL;
11824
11825    if (substring == Py_None)
11826        return rsplit(self, NULL, maxcount);
11827    else if (PyUnicode_Check(substring))
11828        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
11829    else
11830        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
11831}
11832
11833PyDoc_STRVAR(splitlines__doc__,
11834             "S.splitlines([keepends]) -> list of strings\n\
11835\n\
11836Return a list of the lines in S, breaking at line boundaries.\n\
11837Line breaks are not included in the resulting list unless keepends\n\
11838is given and true.");
11839
11840static PyObject*
11841unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11842{
11843    static char *kwlist[] = {"keepends", 0};
11844    int keepends = 0;
11845
11846    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11847                                     kwlist, &keepends))
11848        return NULL;
11849
11850    return PyUnicode_Splitlines((PyObject *)self, keepends);
11851}
11852
11853static
11854PyObject *unicode_str(PyObject *self)
11855{
11856    if (PyUnicode_CheckExact(self)) {
11857        Py_INCREF(self);
11858        return self;
11859    } else
11860        /* Subtype -- return genuine unicode string with the same value. */
11861        return PyUnicode_Copy(self);
11862}
11863
11864PyDoc_STRVAR(swapcase__doc__,
11865             "S.swapcase() -> str\n\
11866\n\
11867Return a copy of S with uppercase characters converted to lowercase\n\
11868and vice versa.");
11869
11870static PyObject*
11871unicode_swapcase(PyUnicodeObject *self)
11872{
11873    return fixup(self, fixswapcase);
11874}
11875
11876PyDoc_STRVAR(maketrans__doc__,
11877             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
11878\n\
11879Return a translation table usable for str.translate().\n\
11880If there is only one argument, it must be a dictionary mapping Unicode\n\
11881ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
11882Character keys will be then converted to ordinals.\n\
11883If there are two arguments, they must be strings of equal length, and\n\
11884in the resulting dictionary, each character in x will be mapped to the\n\
11885character at the same position in y. If there is a third argument, it\n\
11886must be a string, whose characters will be mapped to None in the result.");
11887
11888static PyObject*
11889unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11890{
11891    PyObject *x, *y = NULL, *z = NULL;
11892    PyObject *new = NULL, *key, *value;
11893    Py_ssize_t i = 0;
11894    int res;
11895
11896    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11897        return NULL;
11898    new = PyDict_New();
11899    if (!new)
11900        return NULL;
11901    if (y != NULL) {
11902        int x_kind, y_kind, z_kind;
11903        void *x_data, *y_data, *z_data;
11904
11905        /* x must be a string too, of equal length */
11906        if (!PyUnicode_Check(x)) {
11907            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11908                            "be a string if there is a second argument");
11909            goto err;
11910        }
11911        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
11912            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11913                            "arguments must have equal length");
11914            goto err;
11915        }
11916        /* create entries for translating chars in x to those in y */
11917        x_kind = PyUnicode_KIND(x);
11918        y_kind = PyUnicode_KIND(y);
11919        x_data = PyUnicode_DATA(x);
11920        y_data = PyUnicode_DATA(y);
11921        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11922            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11923            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
11924            if (!key || !value)
11925                goto err;
11926            res = PyDict_SetItem(new, key, value);
11927            Py_DECREF(key);
11928            Py_DECREF(value);
11929            if (res < 0)
11930                goto err;
11931        }
11932        /* create entries for deleting chars in z */
11933        if (z != NULL) {
11934            z_kind = PyUnicode_KIND(z);
11935            z_data = PyUnicode_DATA(z);
11936            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
11937                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
11938                if (!key)
11939                    goto err;
11940                res = PyDict_SetItem(new, key, Py_None);
11941                Py_DECREF(key);
11942                if (res < 0)
11943                    goto err;
11944            }
11945        }
11946    } else {
11947        int kind;
11948        void *data;
11949
11950        /* x must be a dict */
11951        if (!PyDict_CheckExact(x)) {
11952            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11953                            "to maketrans it must be a dict");
11954            goto err;
11955        }
11956        /* copy entries into the new dict, converting string keys to int keys */
11957        while (PyDict_Next(x, &i, &key, &value)) {
11958            if (PyUnicode_Check(key)) {
11959                /* convert string keys to integer keys */
11960                PyObject *newkey;
11961                if (PyUnicode_GET_SIZE(key) != 1) {
11962                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
11963                                    "table must be of length 1");
11964                    goto err;
11965                }
11966                kind = PyUnicode_KIND(key);
11967                data = PyUnicode_DATA(key);
11968                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
11969                if (!newkey)
11970                    goto err;
11971                res = PyDict_SetItem(new, newkey, value);
11972                Py_DECREF(newkey);
11973                if (res < 0)
11974                    goto err;
11975            } else if (PyLong_Check(key)) {
11976                /* just keep integer keys */
11977                if (PyDict_SetItem(new, key, value) < 0)
11978                    goto err;
11979            } else {
11980                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11981                                "be strings or integers");
11982                goto err;
11983            }
11984        }
11985    }
11986    return new;
11987  err:
11988    Py_DECREF(new);
11989    return NULL;
11990}
11991
11992PyDoc_STRVAR(translate__doc__,
11993             "S.translate(table) -> str\n\
11994\n\
11995Return a copy of the string S, where all characters have been mapped\n\
11996through the given translation table, which must be a mapping of\n\
11997Unicode ordinals to Unicode ordinals, strings, or None.\n\
11998Unmapped characters are left untouched. Characters mapped to None\n\
11999are deleted.");
12000
12001static PyObject*
12002unicode_translate(PyObject *self, PyObject *table)
12003{
12004    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12005}
12006
12007PyDoc_STRVAR(upper__doc__,
12008             "S.upper() -> str\n\
12009\n\
12010Return a copy of S converted to uppercase.");
12011
12012static PyObject*
12013unicode_upper(PyUnicodeObject *self)
12014{
12015    return fixup(self, fixupper);
12016}
12017
12018PyDoc_STRVAR(zfill__doc__,
12019             "S.zfill(width) -> str\n\
12020\n\
12021Pad a numeric string S with zeros on the left, to fill a field\n\
12022of the specified width. The string S is never truncated.");
12023
12024static PyObject *
12025unicode_zfill(PyUnicodeObject *self, PyObject *args)
12026{
12027    Py_ssize_t fill;
12028    PyUnicodeObject *u;
12029    Py_ssize_t width;
12030    int kind;
12031    void *data;
12032    Py_UCS4 chr;
12033
12034    if (PyUnicode_READY(self) == -1)
12035        return NULL;
12036
12037    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12038        return NULL;
12039
12040    if (PyUnicode_GET_LENGTH(self) >= width) {
12041        if (PyUnicode_CheckExact(self)) {
12042            Py_INCREF(self);
12043            return (PyObject*) self;
12044        }
12045        else
12046            return PyUnicode_Copy((PyObject*)self);
12047    }
12048
12049    fill = width - _PyUnicode_LENGTH(self);
12050
12051    u = pad(self, fill, 0, '0');
12052
12053    if (u == NULL)
12054        return NULL;
12055
12056    kind = PyUnicode_KIND(u);
12057    data = PyUnicode_DATA(u);
12058    chr = PyUnicode_READ(kind, data, fill);
12059
12060    if (chr == '+' || chr == '-') {
12061        /* move sign to beginning of string */
12062        PyUnicode_WRITE(kind, data, 0, chr);
12063        PyUnicode_WRITE(kind, data, fill, '0');
12064    }
12065
12066    return (PyObject*) u;
12067}
12068
#if 0
/* Debugging helper (compiled out): forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12076
12077PyDoc_STRVAR(startswith__doc__,
12078             "S.startswith(prefix[, start[, end]]) -> bool\n\
12079\n\
12080Return True if S starts with the specified prefix, False otherwise.\n\
12081With optional start, test S beginning at that position.\n\
12082With optional end, stop comparing S at that position.\n\
12083prefix can also be a tuple of strings to try.");
12084
12085static PyObject *
12086unicode_startswith(PyUnicodeObject *self,
12087                   PyObject *args)
12088{
12089    PyObject *subobj;
12090    PyUnicodeObject *substring;
12091    Py_ssize_t start = 0;
12092    Py_ssize_t end = PY_SSIZE_T_MAX;
12093    int result;
12094
12095    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12096        return NULL;
12097    if (PyTuple_Check(subobj)) {
12098        Py_ssize_t i;
12099        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12100            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12101                PyTuple_GET_ITEM(subobj, i));
12102            if (substring == NULL)
12103                return NULL;
12104            result = tailmatch(self, substring, start, end, -1);
12105            Py_DECREF(substring);
12106            if (result) {
12107                Py_RETURN_TRUE;
12108            }
12109        }
12110        /* nothing matched */
12111        Py_RETURN_FALSE;
12112    }
12113    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12114    if (substring == NULL) {
12115        if (PyErr_ExceptionMatches(PyExc_TypeError))
12116            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12117                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12118        return NULL;
12119    }
12120    result = tailmatch(self, substring, start, end, -1);
12121    Py_DECREF(substring);
12122    return PyBool_FromLong(result);
12123}
12124
12125
12126PyDoc_STRVAR(endswith__doc__,
12127             "S.endswith(suffix[, start[, end]]) -> bool\n\
12128\n\
12129Return True if S ends with the specified suffix, False otherwise.\n\
12130With optional start, test S beginning at that position.\n\
12131With optional end, stop comparing S at that position.\n\
12132suffix can also be a tuple of strings to try.");
12133
12134static PyObject *
12135unicode_endswith(PyUnicodeObject *self,
12136                 PyObject *args)
12137{
12138    PyObject *subobj;
12139    PyUnicodeObject *substring;
12140    Py_ssize_t start = 0;
12141    Py_ssize_t end = PY_SSIZE_T_MAX;
12142    int result;
12143
12144    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12145        return NULL;
12146    if (PyTuple_Check(subobj)) {
12147        Py_ssize_t i;
12148        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12149            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12150                PyTuple_GET_ITEM(subobj, i));
12151            if (substring == NULL)
12152                return NULL;
12153            result = tailmatch(self, substring, start, end, +1);
12154            Py_DECREF(substring);
12155            if (result) {
12156                Py_RETURN_TRUE;
12157            }
12158        }
12159        Py_RETURN_FALSE;
12160    }
12161    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12162    if (substring == NULL) {
12163        if (PyErr_ExceptionMatches(PyExc_TypeError))
12164            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12165                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12166        return NULL;
12167    }
12168    result = tailmatch(self, substring, start, end, +1);
12169    Py_DECREF(substring);
12170    return PyBool_FromLong(result);
12171}
12172
12173#include "stringlib/unicode_format.h"
12174
12175PyDoc_STRVAR(format__doc__,
12176             "S.format(*args, **kwargs) -> str\n\
12177\n\
12178Return a formatted version of S, using substitutions from args and kwargs.\n\
12179The substitutions are identified by braces ('{' and '}').");
12180
12181PyDoc_STRVAR(format_map__doc__,
12182             "S.format_map(mapping) -> str\n\
12183\n\
12184Return a formatted version of S, using substitutions from mapping.\n\
12185The substitutions are identified by braces ('{' and '}').");
12186
12187static PyObject *
12188unicode__format__(PyObject* self, PyObject* args)
12189{
12190    PyObject *format_spec;
12191
12192    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12193        return NULL;
12194
12195    return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12196                                     PyUnicode_GET_LENGTH(format_spec));
12197}
12198
12199PyDoc_STRVAR(p_format__doc__,
12200             "S.__format__(format_spec) -> str\n\
12201\n\
12202Return a formatted version of S as described by format_spec.");
12203
12204static PyObject *
12205unicode__sizeof__(PyUnicodeObject *v)
12206{
12207    Py_ssize_t size;
12208
12209    /* If it's a compact object, account for base structure +
12210       character data. */
12211    if (PyUnicode_IS_COMPACT_ASCII(v))
12212        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12213    else if (PyUnicode_IS_COMPACT(v))
12214        size = sizeof(PyCompactUnicodeObject) +
12215            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12216    else {
12217        /* If it is a two-block object, account for base object, and
12218           for character block if present. */
12219        size = sizeof(PyUnicodeObject);
12220        if (_PyUnicode_DATA_ANY(v))
12221            size += (PyUnicode_GET_LENGTH(v) + 1) *
12222                PyUnicode_CHARACTER_SIZE(v);
12223    }
12224    /* If the wstr pointer is present, account for it unless it is shared
12225       with the data pointer. Check if the data is not shared. */
12226    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12227        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12228    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12229        size += PyUnicode_UTF8_LENGTH(v) + 1;
12230
12231    return PyLong_FromSsize_t(size);
12232}
12233
12234PyDoc_STRVAR(sizeof__doc__,
12235             "S.__sizeof__() -> size of S in memory, in bytes");
12236
12237static PyObject *
12238unicode_getnewargs(PyObject *v)
12239{
12240    PyObject *copy = PyUnicode_Copy(v);
12241    if (!copy)
12242        return NULL;
12243    return Py_BuildValue("(N)", copy);
12244}
12245
12246static PyMethodDef unicode_methods[] = {
12247
12248    /* Order is according to common usage: often used methods should
12249       appear first, since lookup is done sequentially. */
12250
12251    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12252    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12253    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12254    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12255    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12256    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12257    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12258    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12259    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12260    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12261    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12262    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12263    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12264    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12265    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12266    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12267    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12268    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12269    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12270    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12271    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12272    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12273    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12274    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12275    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12276    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12277    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12278    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12279    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12280    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12281    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12282    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12283    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12284    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12285    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12286    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12287    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12288    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12289    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12290    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12291    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12292    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12293    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
12294    {"maketrans", (PyCFunction) unicode_maketrans,
12295     METH_VARARGS | METH_STATIC, maketrans__doc__},
12296    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12297#if 0
12298    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12299#endif
12300
12301#if 0
12302    /* These methods are just used for debugging the implementation. */
12303    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12304#endif
12305
12306    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
12307    {NULL, NULL}
12308};
12309
12310static PyObject *
12311unicode_mod(PyObject *v, PyObject *w)
12312{
12313    if (!PyUnicode_Check(v))
12314        Py_RETURN_NOTIMPLEMENTED;
12315    return PyUnicode_Format(v, w);
12316}
12317
12318static PyNumberMethods unicode_as_number = {
12319    0,              /*nb_add*/
12320    0,              /*nb_subtract*/
12321    0,              /*nb_multiply*/
12322    unicode_mod,            /*nb_remainder*/
12323};
12324
12325static PySequenceMethods unicode_as_sequence = {
12326    (lenfunc) unicode_length,       /* sq_length */
12327    PyUnicode_Concat,           /* sq_concat */
12328    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12329    (ssizeargfunc) unicode_getitem,     /* sq_item */
12330    0,                  /* sq_slice */
12331    0,                  /* sq_ass_item */
12332    0,                  /* sq_ass_slice */
12333    PyUnicode_Contains,         /* sq_contains */
12334};
12335
12336static PyObject*
12337unicode_subscript(PyUnicodeObject* self, PyObject* item)
12338{
12339    if (PyUnicode_READY(self) == -1)
12340        return NULL;
12341
12342    if (PyIndex_Check(item)) {
12343        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12344        if (i == -1 && PyErr_Occurred())
12345            return NULL;
12346        if (i < 0)
12347            i += PyUnicode_GET_LENGTH(self);
12348        return unicode_getitem((PyObject*)self, i);
12349    } else if (PySlice_Check(item)) {
12350        Py_ssize_t start, stop, step, slicelength, cur, i;
12351        PyObject *result;
12352        void *src_data, *dest_data;
12353        int src_kind, dest_kind;
12354        Py_UCS4 ch, max_char, kind_limit;
12355
12356        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12357                                 &start, &stop, &step, &slicelength) < 0) {
12358            return NULL;
12359        }
12360
12361        if (slicelength <= 0) {
12362            return PyUnicode_New(0, 0);
12363        } else if (start == 0 && step == 1 &&
12364                   slicelength == PyUnicode_GET_LENGTH(self) &&
12365                   PyUnicode_CheckExact(self)) {
12366            Py_INCREF(self);
12367            return (PyObject *)self;
12368        } else if (step == 1) {
12369            return PyUnicode_Substring((PyObject*)self,
12370                                       start, start + slicelength);
12371        }
12372        /* General case */
12373        max_char = 0;
12374        src_kind = PyUnicode_KIND(self);
12375        kind_limit = kind_maxchar_limit(src_kind);
12376        src_data = PyUnicode_DATA(self);
12377        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12378            ch = PyUnicode_READ(src_kind, src_data, cur);
12379            if (ch > max_char) {
12380                max_char = ch;
12381                if (max_char >= kind_limit)
12382                    break;
12383            }
12384        }
12385        result = PyUnicode_New(slicelength, max_char);
12386        if (result == NULL)
12387            return NULL;
12388        dest_kind = PyUnicode_KIND(result);
12389        dest_data = PyUnicode_DATA(result);
12390
12391        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12392            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12393            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12394        }
12395        return result;
12396    } else {
12397        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12398        return NULL;
12399    }
12400}
12401
12402static PyMappingMethods unicode_as_mapping = {
12403    (lenfunc)unicode_length,        /* mp_length */
12404    (binaryfunc)unicode_subscript,  /* mp_subscript */
12405    (objobjargproc)0,           /* mp_ass_subscript */
12406};
12407
12408
12409/* Helpers for PyUnicode_Format() */
12410
12411static PyObject *
12412getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12413{
12414    Py_ssize_t argidx = *p_argidx;
12415    if (argidx < arglen) {
12416        (*p_argidx)++;
12417        if (arglen < 0)
12418            return args;
12419        else
12420            return PyTuple_GetItem(args, argidx);
12421    }
12422    PyErr_SetString(PyExc_TypeError,
12423                    "not enough arguments for format string");
12424    return NULL;
12425}
12426
12427/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12428
12429static PyObject *
12430formatfloat(PyObject *v, int flags, int prec, int type)
12431{
12432    char *p;
12433    PyObject *result;
12434    double x;
12435
12436    x = PyFloat_AsDouble(v);
12437    if (x == -1.0 && PyErr_Occurred())
12438        return NULL;
12439
12440    if (prec < 0)
12441        prec = 6;
12442
12443    p = PyOS_double_to_string(x, type, prec,
12444                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12445    if (p == NULL)
12446        return NULL;
12447    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12448    PyMem_Free(p);
12449    return result;
12450}
12451
12452static PyObject*
12453formatlong(PyObject *val, int flags, int prec, int type)
12454{
12455    char *buf;
12456    int len;
12457    PyObject *str; /* temporary string object. */
12458    PyObject *result;
12459
12460    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12461    if (!str)
12462        return NULL;
12463    result = PyUnicode_DecodeASCII(buf, len, NULL);
12464    Py_DECREF(str);
12465    return result;
12466}
12467
12468static int
12469formatchar(Py_UCS4 *buf,
12470           size_t buflen,
12471           PyObject *v)
12472{
12473    /* presume that the buffer is at least 3 characters long */
12474    if (PyUnicode_Check(v)) {
12475        if (PyUnicode_GET_LENGTH(v) == 1) {
12476            buf[0] = PyUnicode_READ_CHAR(v, 0);
12477            buf[1] = '\0';
12478            return 1;
12479        }
12480        goto onError;
12481    }
12482    else {
12483        /* Integer input truncated to a character */
12484        long x;
12485        x = PyLong_AsLong(v);
12486        if (x == -1 && PyErr_Occurred())
12487            goto onError;
12488
12489        if (x < 0 || x > 0x10ffff) {
12490            PyErr_SetString(PyExc_OverflowError,
12491                            "%c arg not in range(0x110000)");
12492            return -1;
12493        }
12494
12495        buf[0] = (Py_UCS4) x;
12496        buf[1] = '\0';
12497        return 1;
12498    }
12499
12500  onError:
12501    PyErr_SetString(PyExc_TypeError,
12502                    "%c requires int or char");
12503    return -1;
12504}
12505
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
   FORMATBUFLEN is the length of the buffer in which chars are formatted.
   The expansion is fully parenthesized so that it behaves as a single
   operand wherever the macro is used.
*/
#define FORMATBUFLEN ((size_t)10)
12510
12511PyObject *
12512PyUnicode_Format(PyObject *format, PyObject *args)
12513{
12514    void *fmt;
12515    int fmtkind;
12516    PyObject *result;
12517    Py_UCS4 *res, *res0;
12518    Py_UCS4 max;
12519    int kind;
12520    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12521    int args_owned = 0;
12522    PyObject *dict = NULL;
12523    PyUnicodeObject *uformat;
12524
12525    if (format == NULL || args == NULL) {
12526        PyErr_BadInternalCall();
12527        return NULL;
12528    }
12529    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12530    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12531        return NULL;
12532    fmt = PyUnicode_DATA(uformat);
12533    fmtkind = PyUnicode_KIND(uformat);
12534    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12535    fmtpos = 0;
12536
12537    reslen = rescnt = fmtcnt + 100;
12538    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12539    if (res0 == NULL) {
12540        PyErr_NoMemory();
12541        goto onError;
12542    }
12543
12544    if (PyTuple_Check(args)) {
12545        arglen = PyTuple_Size(args);
12546        argidx = 0;
12547    }
12548    else {
12549        arglen = -1;
12550        argidx = -2;
12551    }
12552    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12553        !PyUnicode_Check(args))
12554        dict = args;
12555
12556    while (--fmtcnt >= 0) {
12557        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12558            if (--rescnt < 0) {
12559                rescnt = fmtcnt + 100;
12560                reslen += rescnt;
12561                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12562                if (res0 == NULL){
12563                    PyErr_NoMemory();
12564                    goto onError;
12565                }
12566                res = res0 + reslen - rescnt;
12567                --rescnt;
12568            }
12569            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12570        }
12571        else {
12572            /* Got a format specifier */
12573            int flags = 0;
12574            Py_ssize_t width = -1;
12575            int prec = -1;
12576            Py_UCS4 c = '\0';
12577            Py_UCS4 fill;
12578            int isnumok;
12579            PyObject *v = NULL;
12580            PyObject *temp = NULL;
12581            void *pbuf;
12582            Py_ssize_t pindex;
12583            Py_UNICODE sign;
12584            Py_ssize_t len, len1;
12585            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12586
12587            fmtpos++;
12588            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12589                Py_ssize_t keystart;
12590                Py_ssize_t keylen;
12591                PyObject *key;
12592                int pcount = 1;
12593
12594                if (dict == NULL) {
12595                    PyErr_SetString(PyExc_TypeError,
12596                                    "format requires a mapping");
12597                    goto onError;
12598                }
12599                ++fmtpos;
12600                --fmtcnt;
12601                keystart = fmtpos;
12602                /* Skip over balanced parentheses */
12603                while (pcount > 0 && --fmtcnt >= 0) {
12604                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12605                        --pcount;
12606                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12607                        ++pcount;
12608                    fmtpos++;
12609                }
12610                keylen = fmtpos - keystart - 1;
12611                if (fmtcnt < 0 || pcount > 0) {
12612                    PyErr_SetString(PyExc_ValueError,
12613                                    "incomplete format key");
12614                    goto onError;
12615                }
12616                key = PyUnicode_Substring((PyObject*)uformat,
12617                                          keystart, keystart + keylen);
12618                if (key == NULL)
12619                    goto onError;
12620                if (args_owned) {
12621                    Py_DECREF(args);
12622                    args_owned = 0;
12623                }
12624                args = PyObject_GetItem(dict, key);
12625                Py_DECREF(key);
12626                if (args == NULL) {
12627                    goto onError;
12628                }
12629                args_owned = 1;
12630                arglen = -1;
12631                argidx = -2;
12632            }
12633            while (--fmtcnt >= 0) {
12634                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12635                case '-': flags |= F_LJUST; continue;
12636                case '+': flags |= F_SIGN; continue;
12637                case ' ': flags |= F_BLANK; continue;
12638                case '#': flags |= F_ALT; continue;
12639                case '0': flags |= F_ZERO; continue;
12640                }
12641                break;
12642            }
12643            if (c == '*') {
12644                v = getnextarg(args, arglen, &argidx);
12645                if (v == NULL)
12646                    goto onError;
12647                if (!PyLong_Check(v)) {
12648                    PyErr_SetString(PyExc_TypeError,
12649                                    "* wants int");
12650                    goto onError;
12651                }
12652                width = PyLong_AsLong(v);
12653                if (width == -1 && PyErr_Occurred())
12654                    goto onError;
12655                if (width < 0) {
12656                    flags |= F_LJUST;
12657                    width = -width;
12658                }
12659                if (--fmtcnt >= 0)
12660                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12661            }
12662            else if (c >= '0' && c <= '9') {
12663                width = c - '0';
12664                while (--fmtcnt >= 0) {
12665                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12666                    if (c < '0' || c > '9')
12667                        break;
12668                    if ((width*10) / 10 != width) {
12669                        PyErr_SetString(PyExc_ValueError,
12670                                        "width too big");
12671                        goto onError;
12672                    }
12673                    width = width*10 + (c - '0');
12674                }
12675            }
12676            if (c == '.') {
12677                prec = 0;
12678                if (--fmtcnt >= 0)
12679                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12680                if (c == '*') {
12681                    v = getnextarg(args, arglen, &argidx);
12682                    if (v == NULL)
12683                        goto onError;
12684                    if (!PyLong_Check(v)) {
12685                        PyErr_SetString(PyExc_TypeError,
12686                                        "* wants int");
12687                        goto onError;
12688                    }
12689                    prec = PyLong_AsLong(v);
12690                    if (prec == -1 && PyErr_Occurred())
12691                        goto onError;
12692                    if (prec < 0)
12693                        prec = 0;
12694                    if (--fmtcnt >= 0)
12695                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12696                }
12697                else if (c >= '0' && c <= '9') {
12698                    prec = c - '0';
12699                    while (--fmtcnt >= 0) {
12700                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12701                        if (c < '0' || c > '9')
12702                            break;
12703                        if ((prec*10) / 10 != prec) {
12704                            PyErr_SetString(PyExc_ValueError,
12705                                            "prec too big");
12706                            goto onError;
12707                        }
12708                        prec = prec*10 + (c - '0');
12709                    }
12710                }
12711            } /* prec */
12712            if (fmtcnt >= 0) {
12713                if (c == 'h' || c == 'l' || c == 'L') {
12714                    if (--fmtcnt >= 0)
12715                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12716                }
12717            }
12718            if (fmtcnt < 0) {
12719                PyErr_SetString(PyExc_ValueError,
12720                                "incomplete format");
12721                goto onError;
12722            }
12723            if (c != '%') {
12724                v = getnextarg(args, arglen, &argidx);
12725                if (v == NULL)
12726                    goto onError;
12727            }
12728            sign = 0;
12729            fill = ' ';
12730            switch (c) {
12731
12732            case '%':
12733                pbuf = formatbuf;
12734                kind = PyUnicode_4BYTE_KIND;
12735                /* presume that buffer length is at least 1 */
12736                PyUnicode_WRITE(kind, pbuf, 0, '%');
12737                len = 1;
12738                break;
12739
12740            case 's':
12741            case 'r':
12742            case 'a':
12743                if (PyUnicode_CheckExact(v) && c == 's') {
12744                    temp = v;
12745                    Py_INCREF(temp);
12746                }
12747                else {
12748                    if (c == 's')
12749                        temp = PyObject_Str(v);
12750                    else if (c == 'r')
12751                        temp = PyObject_Repr(v);
12752                    else
12753                        temp = PyObject_ASCII(v);
12754                    if (temp == NULL)
12755                        goto onError;
12756                    if (PyUnicode_Check(temp))
12757                        /* nothing to do */;
12758                    else {
12759                        Py_DECREF(temp);
12760                        PyErr_SetString(PyExc_TypeError,
12761                                        "%s argument has non-string str()");
12762                        goto onError;
12763                    }
12764                }
12765                if (PyUnicode_READY(temp) == -1) {
12766                    Py_CLEAR(temp);
12767                    goto onError;
12768                }
12769                pbuf = PyUnicode_DATA(temp);
12770                kind = PyUnicode_KIND(temp);
12771                len = PyUnicode_GET_LENGTH(temp);
12772                if (prec >= 0 && len > prec)
12773                    len = prec;
12774                break;
12775
12776            case 'i':
12777            case 'd':
12778            case 'u':
12779            case 'o':
12780            case 'x':
12781            case 'X':
12782                isnumok = 0;
12783                if (PyNumber_Check(v)) {
12784                    PyObject *iobj=NULL;
12785
12786                    if (PyLong_Check(v)) {
12787                        iobj = v;
12788                        Py_INCREF(iobj);
12789                    }
12790                    else {
12791                        iobj = PyNumber_Long(v);
12792                    }
12793                    if (iobj!=NULL) {
12794                        if (PyLong_Check(iobj)) {
12795                            isnumok = 1;
12796                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12797                            Py_DECREF(iobj);
12798                            if (!temp)
12799                                goto onError;
12800                            if (PyUnicode_READY(temp) == -1) {
12801                                Py_CLEAR(temp);
12802                                goto onError;
12803                            }
12804                            pbuf = PyUnicode_DATA(temp);
12805                            kind = PyUnicode_KIND(temp);
12806                            len = PyUnicode_GET_LENGTH(temp);
12807                            sign = 1;
12808                        }
12809                        else {
12810                            Py_DECREF(iobj);
12811                        }
12812                    }
12813                }
12814                if (!isnumok) {
12815                    PyErr_Format(PyExc_TypeError,
12816                                 "%%%c format: a number is required, "
12817                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12818                    goto onError;
12819                }
12820                if (flags & F_ZERO)
12821                    fill = '0';
12822                break;
12823
12824            case 'e':
12825            case 'E':
12826            case 'f':
12827            case 'F':
12828            case 'g':
12829            case 'G':
12830                temp = formatfloat(v, flags, prec, c);
12831                if (!temp)
12832                    goto onError;
12833                if (PyUnicode_READY(temp) == -1) {
12834                    Py_CLEAR(temp);
12835                    goto onError;
12836                }
12837                pbuf = PyUnicode_DATA(temp);
12838                kind = PyUnicode_KIND(temp);
12839                len = PyUnicode_GET_LENGTH(temp);
12840                sign = 1;
12841                if (flags & F_ZERO)
12842                    fill = '0';
12843                break;
12844
12845            case 'c':
12846                pbuf = formatbuf;
12847                kind = PyUnicode_4BYTE_KIND;
12848                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12849                if (len < 0)
12850                    goto onError;
12851                break;
12852
12853            default:
12854                PyErr_Format(PyExc_ValueError,
12855                             "unsupported format character '%c' (0x%x) "
12856                             "at index %zd",
12857                             (31<=c && c<=126) ? (char)c : '?',
12858                             (int)c,
12859                             fmtpos - 1);
12860                goto onError;
12861            }
12862            /* pbuf is initialized here. */
12863            pindex = 0;
12864            if (sign) {
12865                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12866                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
12867                    sign = PyUnicode_READ(kind, pbuf, pindex++);
12868                    len--;
12869                }
12870                else if (flags & F_SIGN)
12871                    sign = '+';
12872                else if (flags & F_BLANK)
12873                    sign = ' ';
12874                else
12875                    sign = 0;
12876            }
12877            if (width < len)
12878                width = len;
12879            if (rescnt - (sign != 0) < width) {
12880                reslen -= rescnt;
12881                rescnt = width + fmtcnt + 100;
12882                reslen += rescnt;
12883                if (reslen < 0) {
12884                    Py_XDECREF(temp);
12885                    PyErr_NoMemory();
12886                    goto onError;
12887                }
12888                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12889                if (res0 == 0) {
12890                    PyErr_NoMemory();
12891                    Py_XDECREF(temp);
12892                    goto onError;
12893                }
12894                res = res0 + reslen - rescnt;
12895            }
12896            if (sign) {
12897                if (fill != ' ')
12898                    *res++ = sign;
12899                rescnt--;
12900                if (width > len)
12901                    width--;
12902            }
12903            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12904                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12905                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12906                if (fill != ' ') {
12907                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12908                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12909                }
12910                rescnt -= 2;
12911                width -= 2;
12912                if (width < 0)
12913                    width = 0;
12914                len -= 2;
12915            }
12916            if (width > len && !(flags & F_LJUST)) {
12917                do {
12918                    --rescnt;
12919                    *res++ = fill;
12920                } while (--width > len);
12921            }
12922            if (fill == ' ') {
12923                if (sign)
12924                    *res++ = sign;
12925                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12926                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12927                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12928                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12929                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12930                }
12931            }
12932            /* Copy all characters, preserving len */
12933            len1 = len;
12934            while (len1--) {
12935                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12936                rescnt--;
12937            }
12938            while (--width >= len) {
12939                --rescnt;
12940                *res++ = ' ';
12941            }
12942            if (dict && (argidx < arglen) && c != '%') {
12943                PyErr_SetString(PyExc_TypeError,
12944                                "not all arguments converted during string formatting");
12945                Py_XDECREF(temp);
12946                goto onError;
12947            }
12948            Py_XDECREF(temp);
12949        } /* '%' */
12950    } /* until end */
12951    if (argidx < arglen && !dict) {
12952        PyErr_SetString(PyExc_TypeError,
12953                        "not all arguments converted during string formatting");
12954        goto onError;
12955    }
12956
12957
12958    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12959        if (*res > max)
12960            max = *res;
12961    result = PyUnicode_New(reslen - rescnt, max);
12962    if (!result)
12963        goto onError;
12964    kind = PyUnicode_KIND(result);
12965    for (res = res0; res < res0+reslen-rescnt; res++)
12966        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12967    PyMem_Free(res0);
12968    if (args_owned) {
12969        Py_DECREF(args);
12970    }
12971    Py_DECREF(uformat);
12972    return (PyObject *)result;
12973
12974  onError:
12975    PyMem_Free(res0);
12976    Py_DECREF(uformat);
12977    if (args_owned) {
12978        Py_DECREF(args);
12979    }
12980    return NULL;
12981}
12982
12983static PyObject *
12984unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12985
12986static PyObject *
12987unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12988{
12989    PyObject *x = NULL;
12990    static char *kwlist[] = {"object", "encoding", "errors", 0};
12991    char *encoding = NULL;
12992    char *errors = NULL;
12993
12994    if (type != &PyUnicode_Type)
12995        return unicode_subtype_new(type, args, kwds);
12996    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
12997                                     kwlist, &x, &encoding, &errors))
12998        return NULL;
12999    if (x == NULL)
13000        return (PyObject *)PyUnicode_New(0, 0);
13001    if (encoding == NULL && errors == NULL)
13002        return PyObject_Str(x);
13003    else
13004        return PyUnicode_FromEncodedObject(x, encoding, errors);
13005}
13006
13007static PyObject *
13008unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
13009{
13010    PyUnicodeObject *unicode, *self;
13011    Py_ssize_t length, char_size;
13012    int share_wstr, share_utf8;
13013    unsigned int kind;
13014    void *data;
13015
13016    assert(PyType_IsSubtype(type, &PyUnicode_Type));
13017
13018    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13019    if (unicode == NULL)
13020        return NULL;
13021    assert(_PyUnicode_CHECK(unicode));
13022    if (PyUnicode_READY(unicode))
13023        return NULL;
13024
13025    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13026    if (self == NULL) {
13027        Py_DECREF(unicode);
13028        return NULL;
13029    }
13030    kind = PyUnicode_KIND(unicode);
13031    length = PyUnicode_GET_LENGTH(unicode);
13032
13033    _PyUnicode_LENGTH(self) = length;
13034    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13035    _PyUnicode_STATE(self).interned = 0;
13036    _PyUnicode_STATE(self).kind = kind;
13037    _PyUnicode_STATE(self).compact = 0;
13038    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13039    _PyUnicode_STATE(self).ready = 1;
13040    _PyUnicode_WSTR(self) = NULL;
13041    _PyUnicode_UTF8_LENGTH(self) = 0;
13042    _PyUnicode_UTF8(self) = NULL;
13043    _PyUnicode_WSTR_LENGTH(self) = 0;
13044    _PyUnicode_DATA_ANY(self) = NULL;
13045
13046    share_utf8 = 0;
13047    share_wstr = 0;
13048    if (kind == PyUnicode_1BYTE_KIND) {
13049        char_size = 1;
13050        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13051            share_utf8 = 1;
13052    }
13053    else if (kind == PyUnicode_2BYTE_KIND) {
13054        char_size = 2;
13055        if (sizeof(wchar_t) == 2)
13056            share_wstr = 1;
13057    }
13058    else {
13059        assert(kind == PyUnicode_4BYTE_KIND);
13060        char_size = 4;
13061        if (sizeof(wchar_t) == 4)
13062            share_wstr = 1;
13063    }
13064
13065    /* Ensure we won't overflow the length. */
13066    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13067        PyErr_NoMemory();
13068        goto onError;
13069    }
13070    data = PyObject_MALLOC((length + 1) * char_size);
13071    if (data == NULL) {
13072        PyErr_NoMemory();
13073        goto onError;
13074    }
13075
13076    _PyUnicode_DATA_ANY(self) = data;
13077    if (share_utf8) {
13078        _PyUnicode_UTF8_LENGTH(self) = length;
13079        _PyUnicode_UTF8(self) = data;
13080    }
13081    if (share_wstr) {
13082        _PyUnicode_WSTR_LENGTH(self) = length;
13083        _PyUnicode_WSTR(self) = (wchar_t *)data;
13084    }
13085
13086    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13087              PyUnicode_KIND_SIZE(kind, length + 1));
13088    Py_DECREF(unicode);
13089    return (PyObject *)self;
13090
13091onError:
13092    Py_DECREF(unicode);
13093    Py_DECREF(self);
13094    return NULL;
13095}
13096
13097PyDoc_STRVAR(unicode_doc,
13098             "str(string[, encoding[, errors]]) -> str\n\
13099\n\
13100Create a new string object from the given encoded string.\n\
13101encoding defaults to the current default string encoding.\n\
13102errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13103
13104static PyObject *unicode_iter(PyObject *seq);
13105
13106PyTypeObject PyUnicode_Type = {
13107    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13108    "str",              /* tp_name */
13109    sizeof(PyUnicodeObject),        /* tp_size */
13110    0,                  /* tp_itemsize */
13111    /* Slots */
13112    (destructor)unicode_dealloc,    /* tp_dealloc */
13113    0,                  /* tp_print */
13114    0,                  /* tp_getattr */
13115    0,                  /* tp_setattr */
13116    0,                  /* tp_reserved */
13117    unicode_repr,           /* tp_repr */
13118    &unicode_as_number,         /* tp_as_number */
13119    &unicode_as_sequence,       /* tp_as_sequence */
13120    &unicode_as_mapping,        /* tp_as_mapping */
13121    (hashfunc) unicode_hash,        /* tp_hash*/
13122    0,                  /* tp_call*/
13123    (reprfunc) unicode_str,     /* tp_str */
13124    PyObject_GenericGetAttr,        /* tp_getattro */
13125    0,                  /* tp_setattro */
13126    0,                  /* tp_as_buffer */
13127    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
13128    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
13129    unicode_doc,            /* tp_doc */
13130    0,                  /* tp_traverse */
13131    0,                  /* tp_clear */
13132    PyUnicode_RichCompare,      /* tp_richcompare */
13133    0,                  /* tp_weaklistoffset */
13134    unicode_iter,           /* tp_iter */
13135    0,                  /* tp_iternext */
13136    unicode_methods,            /* tp_methods */
13137    0,                  /* tp_members */
13138    0,                  /* tp_getset */
13139    &PyBaseObject_Type,         /* tp_base */
13140    0,                  /* tp_dict */
13141    0,                  /* tp_descr_get */
13142    0,                  /* tp_descr_set */
13143    0,                  /* tp_dictoffset */
13144    0,                  /* tp_init */
13145    0,                  /* tp_alloc */
13146    unicode_new,            /* tp_new */
13147    PyObject_Del,           /* tp_free */
13148};
13149
13150/* Initialize the Unicode implementation */
13151
13152void _PyUnicode_Init(void)
13153{
13154    int i;
13155
13156    /* XXX - move this array to unicodectype.c ? */
13157    Py_UCS2 linebreak[] = {
13158        0x000A, /* LINE FEED */
13159        0x000D, /* CARRIAGE RETURN */
13160        0x001C, /* FILE SEPARATOR */
13161        0x001D, /* GROUP SEPARATOR */
13162        0x001E, /* RECORD SEPARATOR */
13163        0x0085, /* NEXT LINE */
13164        0x2028, /* LINE SEPARATOR */
13165        0x2029, /* PARAGRAPH SEPARATOR */
13166    };
13167
13168    /* Init the implementation */
13169    unicode_empty = PyUnicode_New(0, 0);
13170    if (!unicode_empty)
13171        Py_FatalError("Can't create empty string");
13172
13173    for (i = 0; i < 256; i++)
13174        unicode_latin1[i] = NULL;
13175    if (PyType_Ready(&PyUnicode_Type) < 0)
13176        Py_FatalError("Can't initialize 'unicode'");
13177
13178    /* initialize the linebreak bloom filter */
13179    bloom_linebreak = make_bloom_mask(
13180        PyUnicode_2BYTE_KIND, linebreak,
13181        Py_ARRAY_LENGTH(linebreak));
13182
13183    PyType_Ready(&EncodingMapType);
13184}
13185
13186/* Finalize the Unicode implementation */
13187
/* Free-list clearing entry point.
 *
 * This implementation releases nothing and always returns 0.
 */
int
PyUnicode_ClearFreeList(void)
{
    const int freed_count = 0;

    return freed_count;
}
13193
13194void
13195_PyUnicode_Fini(void)
13196{
13197    int i;
13198
13199    Py_XDECREF(unicode_empty);
13200    unicode_empty = NULL;
13201
13202    for (i = 0; i < 256; i++) {
13203        if (unicode_latin1[i]) {
13204            Py_DECREF(unicode_latin1[i]);
13205            unicode_latin1[i] = NULL;
13206        }
13207    }
13208    (void)PyUnicode_ClearFreeList();
13209}
13210
13211void
13212PyUnicode_InternInPlace(PyObject **p)
13213{
13214    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13215    PyObject *t;
13216#ifdef Py_DEBUG
13217    assert(s != NULL);
13218    assert(_PyUnicode_CHECK(s));
13219#else
13220    if (s == NULL || !PyUnicode_Check(s))
13221        return;
13222#endif
13223    /* If it's a subclass, we don't really know what putting
13224       it in the interned dict might do. */
13225    if (!PyUnicode_CheckExact(s))
13226        return;
13227    if (PyUnicode_CHECK_INTERNED(s))
13228        return;
13229    if (_PyUnicode_READY_REPLACE(p)) {
13230        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
13231        return;
13232    }
13233    s = (PyUnicodeObject *)(*p);
13234    if (interned == NULL) {
13235        interned = PyDict_New();
13236        if (interned == NULL) {
13237            PyErr_Clear(); /* Don't leave an exception */
13238            return;
13239        }
13240    }
13241    /* It might be that the GetItem call fails even
13242       though the key is present in the dictionary,
13243       namely when this happens during a stack overflow. */
13244    Py_ALLOW_RECURSION
13245        t = PyDict_GetItem(interned, (PyObject *)s);
13246    Py_END_ALLOW_RECURSION
13247
13248        if (t) {
13249            Py_INCREF(t);
13250            Py_DECREF(*p);
13251            *p = t;
13252            return;
13253        }
13254
13255    PyThreadState_GET()->recursion_critical = 1;
13256    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13257        PyErr_Clear();
13258        PyThreadState_GET()->recursion_critical = 0;
13259        return;
13260    }
13261    PyThreadState_GET()->recursion_critical = 0;
13262    /* The two references in interned are not counted by refcnt.
13263       The deallocator will take care of this */
13264    Py_REFCNT(s) -= 2;
13265    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13266}
13267
13268void
13269PyUnicode_InternImmortal(PyObject **p)
13270{
13271    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13272
13273    PyUnicode_InternInPlace(p);
13274    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13275        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13276        Py_INCREF(*p);
13277    }
13278}
13279
13280PyObject *
13281PyUnicode_InternFromString(const char *cp)
13282{
13283    PyObject *s = PyUnicode_FromString(cp);
13284    if (s == NULL)
13285        return NULL;
13286    PyUnicode_InternInPlace(&s);
13287    return s;
13288}
13289
13290void
13291_Py_ReleaseInternedUnicodeStrings(void)
13292{
13293    PyObject *keys;
13294    PyUnicodeObject *s;
13295    Py_ssize_t i, n;
13296    Py_ssize_t immortal_size = 0, mortal_size = 0;
13297
13298    if (interned == NULL || !PyDict_Check(interned))
13299        return;
13300    keys = PyDict_Keys(interned);
13301    if (keys == NULL || !PyList_Check(keys)) {
13302        PyErr_Clear();
13303        return;
13304    }
13305
13306    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13307       detector, interned unicode strings are not forcibly deallocated;
13308       rather, we give them their stolen references back, and then clear
13309       and DECREF the interned dict. */
13310
13311    n = PyList_GET_SIZE(keys);
13312    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13313            n);
13314    for (i = 0; i < n; i++) {
13315        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13316        if (PyUnicode_READY(s) == -1) {
13317            assert(0 && "could not ready string");
13318            fprintf(stderr, "could not ready string\n");
13319        }
13320        switch (PyUnicode_CHECK_INTERNED(s)) {
13321        case SSTATE_NOT_INTERNED:
13322            /* XXX Shouldn't happen */
13323            break;
13324        case SSTATE_INTERNED_IMMORTAL:
13325            Py_REFCNT(s) += 1;
13326            immortal_size += PyUnicode_GET_LENGTH(s);
13327            break;
13328        case SSTATE_INTERNED_MORTAL:
13329            Py_REFCNT(s) += 2;
13330            mortal_size += PyUnicode_GET_LENGTH(s);
13331            break;
13332        default:
13333            Py_FatalError("Inconsistent interned string state.");
13334        }
13335        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13336    }
13337    fprintf(stderr, "total size of all interned strings: "
13338            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13339            "mortal/immortal\n", mortal_size, immortal_size);
13340    Py_DECREF(keys);
13341    PyDict_Clear(interned);
13342    Py_DECREF(interned);
13343    interned = NULL;
13344}
13345
13346
13347/********************* Unicode Iterator **************************/
13348
13349typedef struct {
13350    PyObject_HEAD
13351    Py_ssize_t it_index;
13352    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
13353} unicodeiterobject;
13354
13355static void
13356unicodeiter_dealloc(unicodeiterobject *it)
13357{
13358    _PyObject_GC_UNTRACK(it);
13359    Py_XDECREF(it->it_seq);
13360    PyObject_GC_Del(it);
13361}
13362
13363static int
13364unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
13365{
13366    Py_VISIT(it->it_seq);
13367    return 0;
13368}
13369
13370static PyObject *
13371unicodeiter_next(unicodeiterobject *it)
13372{
13373    PyUnicodeObject *seq;
13374    PyObject *item;
13375
13376    assert(it != NULL);
13377    seq = it->it_seq;
13378    if (seq == NULL)
13379        return NULL;
13380    assert(_PyUnicode_CHECK(seq));
13381
13382    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13383        int kind = PyUnicode_KIND(seq);
13384        void *data = PyUnicode_DATA(seq);
13385        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13386        item = PyUnicode_FromOrdinal(chr);
13387        if (item != NULL)
13388            ++it->it_index;
13389        return item;
13390    }
13391
13392    Py_DECREF(seq);
13393    it->it_seq = NULL;
13394    return NULL;
13395}
13396
13397static PyObject *
13398unicodeiter_len(unicodeiterobject *it)
13399{
13400    Py_ssize_t len = 0;
13401    if (it->it_seq)
13402        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13403    return PyLong_FromSsize_t(len);
13404}
13405
13406PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13407
13408static PyMethodDef unicodeiter_methods[] = {
13409    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
13410     length_hint_doc},
13411    {NULL,      NULL}       /* sentinel */
13412};
13413
13414PyTypeObject PyUnicodeIter_Type = {
13415    PyVarObject_HEAD_INIT(&PyType_Type, 0)
13416    "str_iterator",         /* tp_name */
13417    sizeof(unicodeiterobject),      /* tp_basicsize */
13418    0,                  /* tp_itemsize */
13419    /* methods */
13420    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
13421    0,                  /* tp_print */
13422    0,                  /* tp_getattr */
13423    0,                  /* tp_setattr */
13424    0,                  /* tp_reserved */
13425    0,                  /* tp_repr */
13426    0,                  /* tp_as_number */
13427    0,                  /* tp_as_sequence */
13428    0,                  /* tp_as_mapping */
13429    0,                  /* tp_hash */
13430    0,                  /* tp_call */
13431    0,                  /* tp_str */
13432    PyObject_GenericGetAttr,        /* tp_getattro */
13433    0,                  /* tp_setattro */
13434    0,                  /* tp_as_buffer */
13435    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
13436    0,                  /* tp_doc */
13437    (traverseproc)unicodeiter_traverse, /* tp_traverse */
13438    0,                  /* tp_clear */
13439    0,                  /* tp_richcompare */
13440    0,                  /* tp_weaklistoffset */
13441    PyObject_SelfIter,          /* tp_iter */
13442    (iternextfunc)unicodeiter_next,     /* tp_iternext */
13443    unicodeiter_methods,            /* tp_methods */
13444    0,
13445};
13446
13447static PyObject *
13448unicode_iter(PyObject *seq)
13449{
13450    unicodeiterobject *it;
13451
13452    if (!PyUnicode_Check(seq)) {
13453        PyErr_BadInternalCall();
13454        return NULL;
13455    }
13456    if (PyUnicode_READY(seq) == -1)
13457        return NULL;
13458    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13459    if (it == NULL)
13460        return NULL;
13461    it->it_index = 0;
13462    Py_INCREF(seq);
13463    it->it_seq = (PyUnicodeObject *)seq;
13464    _PyObject_GC_TRACK(it);
13465    return (PyObject *)it;
13466}
13467
13468#define UNIOP(x) Py_UNICODE_##x
13469#define UNIOP_t Py_UNICODE
13470#include "uniops.h"
13471#undef UNIOP
13472#undef UNIOP_t
13473#define UNIOP(x) Py_UCS4_##x
13474#define UNIOP_t Py_UCS4
13475#include "uniops.h"
13476#undef UNIOP
13477#undef UNIOP_t
13478
13479Py_UNICODE*
13480PyUnicode_AsUnicodeCopy(PyObject *object)
13481{
13482    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13483    Py_UNICODE *copy;
13484    Py_ssize_t size;
13485
13486    if (!PyUnicode_Check(unicode)) {
13487        PyErr_BadArgument();
13488        return NULL;
13489    }
13490    /* Ensure we won't overflow the size. */
13491    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13492        PyErr_NoMemory();
13493        return NULL;
13494    }
13495    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13496    size *= sizeof(Py_UNICODE);
13497    copy = PyMem_Malloc(size);
13498    if (copy == NULL) {
13499        PyErr_NoMemory();
13500        return NULL;
13501    }
13502    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13503    return copy;
13504}
13505
13506/* A _string module, to export formatter_parser and formatter_field_name_split
13507   to the string.Formatter class implemented in Python. */
13508
13509static PyMethodDef _string_methods[] = {
13510    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
13511     METH_O, PyDoc_STR("split the argument as a field name")},
13512    {"formatter_parser", (PyCFunction) formatter_parser,
13513     METH_O, PyDoc_STR("parse the argument as a format string")},
13514    {NULL, NULL}
13515};
13516
13517static struct PyModuleDef _string_module = {
13518    PyModuleDef_HEAD_INIT,
13519    "_string",
13520    PyDoc_STR("string helper module"),
13521    0,
13522    _string_methods,
13523    NULL,
13524    NULL,
13525    NULL,
13526    NULL
13527};
13528
13529PyMODINIT_FUNC
13530PyInit__string(void)
13531{
13532    return PyModule_Create(&_string_module);
13533}
13534
13535
13536#ifdef __cplusplus
13537}
13538#endif
13539