unicodeobject.c revision b9275c104e50361fe3a785126e5ecad24d319a7a
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44
45#ifdef MS_WINDOWS
46#include <windows.h>
47#endif
48
/* Upper bound on the number of entries kept in the Unicode object
   free list. */

#define PyUnicode_MAXFREELIST       1024

/* Size threshold for the free list "stay alive" optimization.

   For every object on the free list whose size is below this limit the
   implementation keeps the allocated Unicode memory intact, which
   reduces malloc() overhead for small Unicode objects.

   In the worst case this leaves PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
   malloc()-overhead) bytes of unused garbage around.

   A limit of 0 effectively turns the feature off.

   Note: this is an experimental feature.  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order switches; little endian unless WORDS_BIGENDIAN is defined */

#if defined(WORDS_BIGENDIAN)
#  define BYTEORDER_IS_BIG_ENDIAN
#else
#  define BYTEORDER_IS_LITTLE_ENDIAN
#endif
79
80/* --- Globals ------------------------------------------------------------
81
82   The globals are initialized by the _PyUnicode_Init() API and should
83   not be used before calling that API.
84
85*/
86
87
88#ifdef __cplusplus
89extern "C" {
90#endif
91
/* Validation hook used by the checked accessor macros: the full
   consistency check in Py_DEBUG builds, a plain type check otherwise. */
#if defined(Py_DEBUG)
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* UTF-8 buffer.  The unchecked variant reads the utf8 field directly; the
   checked variant returns the memory located right after the
   PyASCIIObject header for compact ASCII strings instead. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject *)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char *)((PyASCIIObject *)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* UTF-8 length.  The checked variant returns the string length for
   compact ASCII strings instead of reading the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject *)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* Unchecked field accessors; all of them are usable as lvalues. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject *)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject *)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers: these assert _PyUnicode_CHECK(op) before reading. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Data pointer field of a PyUnicodeObject (unchecked, lvalue). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject *)(op))->data.any)
132
/* File-local override of PyUnicode_READY: evaluates to 0 when `op` is
   already ready, otherwise to the result of _PyUnicode_Ready().  In debug
   builds `op` is validated with _PyUnicode_CHECK first. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready((PyObject *)(op))))

/* Same as PyUnicode_READY, but takes a pointer to the object pointer and
   delegates to _PyUnicode_ReadyReplace() when the string is not ready.
   The argument is parenthesized before being dereferenced so that
   expressions such as `items + 1` refer to the intended slot. */
#define _PyUnicode_READY_REPLACE(p_obj)                 \
    (assert(_PyUnicode_CHECK(*(p_obj))),                \
     (PyUnicode_IS_READY(*(p_obj)) ?                    \
      0 : _PyUnicode_ReadyReplace((PyObject **)(p_obj))))
144
/* True if the UTF-8 pointer of `op` is the very same pointer as its
   character data.  Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the very same pointer as its
   character data.  Every use refers to the macro argument `op`, so the
   macro does not depend on how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
152
/* True if the Unicode object owns a UTF-8 memory block of its own: it is
   not a compact ASCII string, its utf8 pointer is set, and that pointer
   is not the character data pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object owns a wstr memory block of its own: its
   wstr pointer is set and either the string is not ready yet or wstr is
   not the character data pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
168
/* Generic element-wise conversion between character types.

   from_type and to_type must be valid type names.  begin and end delimit
   the source characters and should be of type "from_type *"; to points
   to the destination buffer and is converted to "to_type *".  Every
   source character in [begin, end) is cast to to_type and stored, in
   order, starting at to. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        const from_type *iter_ = (begin);               \
        to_type *to_ = (to_type *)(to);                 \
        while (iter_ < (end)) {                         \
            *to_++ = (to_type)*iter_++;                 \
        }                                               \
    } while (0)

/* The Unicode string has been modified: invalidate the cached hash by
   setting it to -1 */
#define _PyUnicode_DIRTY(op) \
    do { _PyUnicode_HASH(op) = -1; } while (0)
186
187/* This dictionary holds all interned unicode strings.  Note that references
188   to strings in this dictionary are *not* counted in the string's ob_refcnt.
189   When the interned string reaches a refcnt of 0 the string deallocation
190   function will delete the reference from this dictionary.
191
192   Another way to look at this is that to say that the actual reference
193   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
194*/
195static PyObject *interned;
196
197/* The empty Unicode object is shared to improve performance. */
198static PyObject *unicode_empty;
199
200/* Single character Unicode strings in the Latin-1 range are being
201   shared as well. */
202static PyObject *unicode_latin1[256];
203
/* Fast detection of the most frequent whitespace characters.

   Lookup table indexed by code point (0..127); an entry is 1 for:
     0x0009 CHARACTER TABULATION    0x001C FILE SEPARATOR
     0x000A LINE FEED               0x001D GROUP SEPARATOR
     0x000B LINE TABULATION         0x001E RECORD SEPARATOR
     0x000C FORM FEED               0x001F UNIT SEPARATOR
     0x000D CARRIAGE RETURN         0x0020 SPACE
   and 0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
234
235/* forward */
236static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
237static PyObject* get_latin1_char(unsigned char ch);
238
239static PyObject *
240unicode_encode_call_errorhandler(const char *errors,
241       PyObject **errorHandler,const char *encoding, const char *reason,
242       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
243       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
244
245static void
246raise_encode_exception(PyObject **exceptionObject,
247                       const char *encoding,
248                       const Py_UNICODE *unicode, Py_ssize_t size,
249                       Py_ssize_t startpos, Py_ssize_t endpos,
250                       const char *reason);
251
/* Fast detection of line break characters in the ASCII range.

   Lookup table indexed by code point (0..127); an entry is 1 for:
     0x000A LINE FEED               0x001C FILE SEPARATOR
     0x000B LINE TABULATION         0x001D GROUP SEPARATOR
     0x000C FORM FEED               0x001E RECORD SEPARATOR
     0x000D CARRIAGE RETURN
   and 0 for everything else. */
static unsigned char ascii_linebreak[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
279
280/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
281   This function is kept for backward compatibility with the old API. */
282Py_UNICODE
283PyUnicode_GetMax(void)
284{
285#ifdef Py_UNICODE_WIDE
286    return 0x10FFFF;
287#else
288    /* This is actually an illegal character, so it should
289       not be passed to unichr. */
290    return 0xFFFF;
291#endif
292}
293
#ifdef Py_DEBUG
/* Assert the internal invariants of a Unicode object and return 1.

   Inconsistencies are reported through assert(), so the function only
   returns (always 1) when every check passed.  The checks depend on the
   representation:
     - compact ASCII: 1-byte kind and ready;
     - compact non-ASCII: valid kind, ready, utf8 distinct from the data
       stored right after the compact header;
     - legacy wstr-only: not ready, wstr set, no data, no utf8, not
       interned;
     - legacy ready: valid kind, data set, utf8 shared with the data
       exactly when the string is ASCII.
   For every representation except compact ASCII, the wstr/data sharing
   and the "NULL buffer implies zero length" rules are verified too. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    PyASCIIObject *ascii;
    PyCompactUnicodeObject *compact;
    unsigned int kind;
    void *data;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII strings have nothing else to verify. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
        return 1;
    }

    compact = (PyCompactUnicodeObject *)op;

    if (ascii->state.compact == 1) {
        /* compact, non-ASCII: data follows the compact header */
        data = compact + 1;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 1);
        assert (compact->utf8 != data);
    }
    else if (kind == PyUnicode_WCHAR_KIND) {
        /* legacy string that only has a wstr representation */
        data = ((PyUnicodeObject *)op)->data.any;
        assert(ascii->state.compact == 0);
        assert(ascii->state.ascii == 0);
        assert(ascii->state.ready == 0);
        assert(ascii->wstr != NULL);
        assert(data == NULL);
        assert(compact->utf8 == NULL);
        assert(ascii->state.interned == SSTATE_NOT_INTERNED);
    }
    else {
        /* legacy string that has been made ready */
        data = ((PyUnicodeObject *)op)->data.any;
        assert(kind == PyUnicode_1BYTE_KIND
               || kind == PyUnicode_2BYTE_KIND
               || kind == PyUnicode_4BYTE_KIND);
        assert(ascii->state.compact == 0);
        assert(ascii->state.ready == 1);
        assert(data != NULL);
        if (ascii->state.ascii) {
            assert (compact->utf8 == data);
            assert (compact->utf8_length == ascii->length);
        }
        else {
            assert (compact->utf8 != data);
        }
    }

    /* wstr is shared with the data exactly when the kind matches the
       width of wchar_t */
    if (kind != PyUnicode_WCHAR_KIND) {
#if SIZEOF_WCHAR_T == 2
        if (kind == PyUnicode_2BYTE_KIND)
#else
        if (kind == PyUnicode_4BYTE_KIND)
#endif
        {
            assert(ascii->wstr == data);
            assert(compact->wstr_length == ascii->length);
        }
        else {
            assert(ascii->wstr != data);
        }
    }

    /* a missing buffer must come with a zero length */
    if (compact->utf8 == NULL) {
        assert(compact->utf8_length == 0);
    }
    if (ascii->wstr == NULL) {
        assert(compact->wstr_length == 0);
    }
    return 1;
}
#else
/* Non-debug builds: no checks are performed, the result is always 1. */
static int
_PyUnicode_CheckConsistency(void *op)
{
    return 1;
}
#endif
379
380/* --- Bloom Filters ----------------------------------------------------- */
381
382/* stuff to implement simple "bloom filters" for Unicode characters.
383   to keep things simple, we use a single bitmask, using the least 5
384   bits from each unicode characters as the bit index. */
385
386/* the linebreak mask is set up by Unicode_Init below */
387
388#if LONG_BIT >= 128
389#define BLOOM_WIDTH 128
390#elif LONG_BIT >= 64
391#define BLOOM_WIDTH 64
392#elif LONG_BIT >= 32
393#define BLOOM_WIDTH 32
394#else
395#error "LONG_BIT is smaller than 32"
396#endif
397
398#define BLOOM_MASK unsigned long
399
400static BLOOM_MASK bloom_linebreak;
401
402#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
403#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
404
405#define BLOOM_LINEBREAK(ch)                                             \
406    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
407     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
408
409Py_LOCAL_INLINE(BLOOM_MASK)
410make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
411{
412    /* calculate simple bloom-style bitmask for a given unicode string */
413
414    BLOOM_MASK mask;
415    Py_ssize_t i;
416
417    mask = 0;
418    for (i = 0; i < len; i++)
419        BLOOM_ADD(mask, PyUnicode_READ(kind, ptr, i));
420
421    return mask;
422}
423
/* Membership test: the cheap bloom check runs first and only a bloom hit
   pays for the PyUnicode_FindChar() search over the whole of str. */
#define BLOOM_MEMBER(mask, chr, str)                                    \
    (BLOOM(mask, chr) &&                                                \
     (PyUnicode_FindChar(str, chr, 0, PyUnicode_GET_LENGTH(str), 1) >= 0))
427
428/* --- Unicode Object ----------------------------------------------------- */
429
430static PyObject *
431fixup(PyUnicodeObject *self, Py_UCS4 (*fixfct)(PyUnicodeObject *s));
432
433Py_LOCAL_INLINE(char *) findchar(void *s, int kind,
434                                 Py_ssize_t size, Py_UCS4 ch,
435                                 int direction)
436{
437    /* like wcschr, but doesn't stop at NULL characters */
438    Py_ssize_t i;
439    if (direction == 1) {
440        for(i = 0; i < size; i++)
441            if (PyUnicode_READ(kind, s, i) == ch)
442                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
443    }
444    else {
445        for(i = size-1; i >= 0; i--)
446            if (PyUnicode_READ(kind, s, i) == ch)
447                return (char*)s + PyUnicode_KIND_SIZE(kind, i);
448    }
449    return NULL;
450}
451
452static PyObject*
453resize_compact(PyObject *unicode, Py_ssize_t length)
454{
455    Py_ssize_t char_size;
456    Py_ssize_t struct_size;
457    Py_ssize_t new_size;
458    int share_wstr;
459
460    assert(PyUnicode_IS_READY(unicode));
461    char_size = PyUnicode_CHARACTER_SIZE(unicode);
462    if (PyUnicode_IS_COMPACT_ASCII(unicode))
463        struct_size = sizeof(PyASCIIObject);
464    else
465        struct_size = sizeof(PyCompactUnicodeObject);
466    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
467
468    _Py_DEC_REFTOTAL;
469    _Py_ForgetReference(unicode);
470
471    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
472        PyErr_NoMemory();
473        return NULL;
474    }
475    new_size = (struct_size + (length + 1) * char_size);
476
477    unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
478    if (unicode == NULL) {
479        PyObject_Del(unicode);
480        PyErr_NoMemory();
481        return NULL;
482    }
483    _Py_NewReference(unicode);
484    _PyUnicode_LENGTH(unicode) = length;
485    if (share_wstr) {
486        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
487        if (!PyUnicode_IS_COMPACT_ASCII(unicode))
488            _PyUnicode_WSTR_LENGTH(unicode) = length;
489    }
490    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
491                    length, 0);
492    return unicode;
493}
494
495static int
496resize_inplace(PyUnicodeObject *unicode, Py_ssize_t length)
497{
498    wchar_t *wstr;
499    assert(!PyUnicode_IS_COMPACT(unicode));
500    assert(Py_REFCNT(unicode) == 1);
501
502    _PyUnicode_DIRTY(unicode);
503
504    if (PyUnicode_IS_READY(unicode)) {
505        Py_ssize_t char_size;
506        Py_ssize_t new_size;
507        int share_wstr, share_utf8;
508        void *data;
509
510        data = _PyUnicode_DATA_ANY(unicode);
511        assert(data != NULL);
512        char_size = PyUnicode_CHARACTER_SIZE(unicode);
513        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
514        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
515        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
516        {
517            PyObject_DEL(_PyUnicode_UTF8(unicode));
518            _PyUnicode_UTF8(unicode) = NULL;
519            _PyUnicode_UTF8_LENGTH(unicode) = 0;
520        }
521
522        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
523            PyErr_NoMemory();
524            return -1;
525        }
526        new_size = (length + 1) * char_size;
527
528        data = (PyObject *)PyObject_REALLOC(data, new_size);
529        if (data == NULL) {
530            PyErr_NoMemory();
531            return -1;
532        }
533        _PyUnicode_DATA_ANY(unicode) = data;
534        if (share_wstr) {
535            _PyUnicode_WSTR(unicode) = data;
536            _PyUnicode_WSTR_LENGTH(unicode) = length;
537        }
538        if (share_utf8) {
539            _PyUnicode_UTF8(unicode) = data;
540            _PyUnicode_UTF8_LENGTH(unicode) = length;
541        }
542        _PyUnicode_LENGTH(unicode) = length;
543        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
544        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
545            _PyUnicode_CheckConsistency(unicode);
546            return 0;
547        }
548    }
549    assert(_PyUnicode_WSTR(unicode) != NULL);
550
551    /* check for integer overflow */
552    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
553        PyErr_NoMemory();
554        return -1;
555    }
556    wstr =  _PyUnicode_WSTR(unicode);
557    wstr = PyObject_REALLOC(wstr, sizeof(wchar_t) * (length + 1));
558    if (!wstr) {
559        PyErr_NoMemory();
560        return -1;
561    }
562    _PyUnicode_WSTR(unicode) = wstr;
563    _PyUnicode_WSTR(unicode)[length] = 0;
564    _PyUnicode_WSTR_LENGTH(unicode) = length;
565    _PyUnicode_CheckConsistency(unicode);
566    return 0;
567}
568
569static PyObject*
570resize_copy(PyObject *unicode, Py_ssize_t length)
571{
572    Py_ssize_t copy_length;
573    if (PyUnicode_IS_COMPACT(unicode)) {
574        PyObject *copy;
575        assert(PyUnicode_IS_READY(unicode));
576
577        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
578        if (copy == NULL)
579            return NULL;
580
581        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
582        if (PyUnicode_CopyCharacters(copy, 0,
583                                     unicode, 0,
584                                     copy_length) < 0)
585        {
586            Py_DECREF(copy);
587            return NULL;
588        }
589        return copy;
590    }
591    else {
592        PyUnicodeObject *w;
593        assert(_PyUnicode_WSTR(unicode) != NULL);
594        assert(_PyUnicode_DATA_ANY(unicode) == NULL);
595        w = _PyUnicode_New(length);
596        if (w == NULL)
597            return NULL;
598        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
599        copy_length = Py_MIN(copy_length, length);
600        Py_UNICODE_COPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
601                        copy_length);
602        return (PyObject*)w;
603    }
604}
605
606/* We allocate one more byte to make sure the string is
607   Ux0000 terminated; some code (e.g. new_identifier)
608   relies on that.
609
610   XXX This allocator could further be enhanced by assuring that the
611   free list never reduces its size below 1.
612
613*/
614
615#ifdef Py_DEBUG
616int unicode_old_new_calls = 0;
617#endif
618
619static PyUnicodeObject *
620_PyUnicode_New(Py_ssize_t length)
621{
622    register PyUnicodeObject *unicode;
623    size_t new_size;
624
625    /* Optimization for empty strings */
626    if (length == 0 && unicode_empty != NULL) {
627        Py_INCREF(unicode_empty);
628        return (PyUnicodeObject*)unicode_empty;
629    }
630
631    /* Ensure we won't overflow the size. */
632    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
633        return (PyUnicodeObject *)PyErr_NoMemory();
634    }
635    if (length < 0) {
636        PyErr_SetString(PyExc_SystemError,
637                        "Negative size passed to _PyUnicode_New");
638        return NULL;
639    }
640
641#ifdef Py_DEBUG
642    ++unicode_old_new_calls;
643#endif
644
645    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
646    if (unicode == NULL)
647        return NULL;
648    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
649    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
650    if (!_PyUnicode_WSTR(unicode)) {
651        PyErr_NoMemory();
652        goto onError;
653    }
654
655    /* Initialize the first element to guard against cases where
656     * the caller fails before initializing str -- unicode_resize()
657     * reads str[0], and the Keep-Alive optimization can keep memory
658     * allocated for str alive across a call to unicode_dealloc(unicode).
659     * We don't want unicode_resize to read uninitialized memory in
660     * that case.
661     */
662    _PyUnicode_WSTR(unicode)[0] = 0;
663    _PyUnicode_WSTR(unicode)[length] = 0;
664    _PyUnicode_WSTR_LENGTH(unicode) = length;
665    _PyUnicode_HASH(unicode) = -1;
666    _PyUnicode_STATE(unicode).interned = 0;
667    _PyUnicode_STATE(unicode).kind = 0;
668    _PyUnicode_STATE(unicode).compact = 0;
669    _PyUnicode_STATE(unicode).ready = 0;
670    _PyUnicode_STATE(unicode).ascii = 0;
671    _PyUnicode_DATA_ANY(unicode) = NULL;
672    _PyUnicode_LENGTH(unicode) = 0;
673    _PyUnicode_UTF8(unicode) = NULL;
674    _PyUnicode_UTF8_LENGTH(unicode) = 0;
675    return unicode;
676
677  onError:
678    /* XXX UNREF/NEWREF interface should be more symmetrical */
679    _Py_DEC_REFTOTAL;
680    _Py_ForgetReference((PyObject *)unicode);
681    PyObject_Del(unicode);
682    return NULL;
683}
684
685static const char*
686unicode_kind_name(PyObject *unicode)
687{
688    /* don't check consistency: unicode_kind_name() is called from
689       _PyUnicode_Dump() */
690    if (!PyUnicode_IS_COMPACT(unicode))
691    {
692        if (!PyUnicode_IS_READY(unicode))
693            return "wstr";
694        switch(PyUnicode_KIND(unicode))
695        {
696        case PyUnicode_1BYTE_KIND:
697            if (PyUnicode_IS_ASCII(unicode))
698                return "legacy ascii";
699            else
700                return "legacy latin1";
701        case PyUnicode_2BYTE_KIND:
702            return "legacy UCS2";
703        case PyUnicode_4BYTE_KIND:
704            return "legacy UCS4";
705        default:
706            return "<legacy invalid kind>";
707        }
708    }
709    assert(PyUnicode_IS_READY(unicode));
710    switch(PyUnicode_KIND(unicode))
711    {
712    case PyUnicode_1BYTE_KIND:
713        if (PyUnicode_IS_ASCII(unicode))
714            return "ascii";
715        else
716            return "latin1";
717    case PyUnicode_2BYTE_KIND:
718        return "UCS2";
719    case PyUnicode_4BYTE_KIND:
720        return "UCS4";
721    default:
722        return "<invalid compact kind>";
723    }
724}
725
726#ifdef Py_DEBUG
727int unicode_new_new_calls = 0;
728
/* Functions wrapping macros for use in debugger */

/* Function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
733
/* Function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
737void *_PyUnicode_data(void *unicode){
738    printf("obj %p\n", unicode);
739    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
740    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
741    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
742    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
743    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
744    return PyUnicode_DATA(unicode);
745}
746
747void
748_PyUnicode_Dump(PyObject *op)
749{
750    PyASCIIObject *ascii = (PyASCIIObject *)op;
751    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
752    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
753    void *data;
754    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
755    if (ascii->state.compact)
756        data = (compact + 1);
757    else
758        data = unicode->data.any;
759    if (ascii->wstr == data)
760        printf("shared ");
761    printf("wstr=%p", ascii->wstr);
762    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
763        printf(" (%zu), ", compact->wstr_length);
764        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
765            printf("shared ");
766        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
767    }
768    printf(", data=%p\n", data);
769}
770#endif
771
772PyObject *
773PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
774{
775    PyObject *obj;
776    PyCompactUnicodeObject *unicode;
777    void *data;
778    int kind_state;
779    int is_sharing, is_ascii;
780    Py_ssize_t char_size;
781    Py_ssize_t struct_size;
782
783    /* Optimization for empty strings */
784    if (size == 0 && unicode_empty != NULL) {
785        Py_INCREF(unicode_empty);
786        return unicode_empty;
787    }
788
789#ifdef Py_DEBUG
790    ++unicode_new_new_calls;
791#endif
792
793    is_ascii = 0;
794    is_sharing = 0;
795    struct_size = sizeof(PyCompactUnicodeObject);
796    if (maxchar < 128) {
797        kind_state = PyUnicode_1BYTE_KIND;
798        char_size = 1;
799        is_ascii = 1;
800        struct_size = sizeof(PyASCIIObject);
801    }
802    else if (maxchar < 256) {
803        kind_state = PyUnicode_1BYTE_KIND;
804        char_size = 1;
805    }
806    else if (maxchar < 65536) {
807        kind_state = PyUnicode_2BYTE_KIND;
808        char_size = 2;
809        if (sizeof(wchar_t) == 2)
810            is_sharing = 1;
811    }
812    else {
813        kind_state = PyUnicode_4BYTE_KIND;
814        char_size = 4;
815        if (sizeof(wchar_t) == 4)
816            is_sharing = 1;
817    }
818
819    /* Ensure we won't overflow the size. */
820    if (size < 0) {
821        PyErr_SetString(PyExc_SystemError,
822                        "Negative size passed to PyUnicode_New");
823        return NULL;
824    }
825    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
826        return PyErr_NoMemory();
827
828    /* Duplicated allocation code from _PyObject_New() instead of a call to
829     * PyObject_New() so we are able to allocate space for the object and
830     * it's data buffer.
831     */
832    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
833    if (obj == NULL)
834        return PyErr_NoMemory();
835    obj = PyObject_INIT(obj, &PyUnicode_Type);
836    if (obj == NULL)
837        return NULL;
838
839    unicode = (PyCompactUnicodeObject *)obj;
840    if (is_ascii)
841        data = ((PyASCIIObject*)obj) + 1;
842    else
843        data = unicode + 1;
844    _PyUnicode_LENGTH(unicode) = size;
845    _PyUnicode_HASH(unicode) = -1;
846    _PyUnicode_STATE(unicode).interned = 0;
847    _PyUnicode_STATE(unicode).kind = kind_state;
848    _PyUnicode_STATE(unicode).compact = 1;
849    _PyUnicode_STATE(unicode).ready = 1;
850    _PyUnicode_STATE(unicode).ascii = is_ascii;
851    if (is_ascii) {
852        ((char*)data)[size] = 0;
853        _PyUnicode_WSTR(unicode) = NULL;
854    }
855    else if (kind_state == PyUnicode_1BYTE_KIND) {
856        ((char*)data)[size] = 0;
857        _PyUnicode_WSTR(unicode) = NULL;
858        _PyUnicode_WSTR_LENGTH(unicode) = 0;
859        unicode->utf8 = NULL;
860        unicode->utf8_length = 0;
861        }
862    else {
863        unicode->utf8 = NULL;
864        unicode->utf8_length = 0;
865        if (kind_state == PyUnicode_2BYTE_KIND)
866            ((Py_UCS2*)data)[size] = 0;
867        else /* kind_state == PyUnicode_4BYTE_KIND */
868            ((Py_UCS4*)data)[size] = 0;
869        if (is_sharing) {
870            _PyUnicode_WSTR_LENGTH(unicode) = size;
871            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
872        }
873        else {
874            _PyUnicode_WSTR_LENGTH(unicode) = 0;
875            _PyUnicode_WSTR(unicode) = NULL;
876        }
877    }
878    return obj;
879}
880
#if SIZEOF_WCHAR_T == 2
/* Helper function to convert a 16-bit wchar_t representation to UCS4.
   Surrogate pairs are decoded into a single code point; a surrogate
   without a matching partner is copied through unchanged.  The other
   conversions are implemented as macros for efficiency.

   The characters in [begin, end) are written to the 4-byte data of
   `unicode`, whose length must be exactly the number of code points
   produced (asserted).

   This function assumes that unicode can hold one more code point than wstr
   characters for a terminating null character. */
static void
unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
                              PyUnicodeObject *unicode)
{
    const wchar_t *iter;
    Py_UCS4 *ucs4_out;

    assert(unicode != NULL);
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
    ucs4_out = PyUnicode_4BYTE_DATA(unicode);

    iter = begin;
    while (iter < end) {
        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
                           _PyUnicode_GET_LENGTH(unicode)));
        if (*iter >= 0xD800 && *iter <= 0xDBFF
            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
        {
            /* high surrogate followed by a low surrogate */
            *ucs4_out++ = (((iter[0] & 0x3FF)<<10) | (iter[1] & 0x3FF)) + 0x10000;
            iter += 2;
            continue;
        }
        *ucs4_out++ = *iter++;
    }
    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
                        _PyUnicode_GET_LENGTH(unicode)));

}
#endif
919
920static int
921_PyUnicode_Dirty(PyObject *unicode)
922{
923    assert(_PyUnicode_CHECK(unicode));
924    if (Py_REFCNT(unicode) != 1) {
925        PyErr_SetString(PyExc_SystemError,
926                        "Cannot modify a string having more than 1 reference");
927        return -1;
928    }
929    _PyUnicode_DIRTY(unicode);
930    return 0;
931}
932
933Py_ssize_t
934PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
935                         PyObject *from, Py_ssize_t from_start,
936                         Py_ssize_t how_many)
937{
938    unsigned int from_kind, to_kind;
939    void *from_data, *to_data;
940
941    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
942        PyErr_BadInternalCall();
943        return -1;
944    }
945
946    if (PyUnicode_READY(from))
947        return -1;
948    if (PyUnicode_READY(to))
949        return -1;
950
951    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
952    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
953        PyErr_Format(PyExc_SystemError,
954                     "Cannot write %zi characters at %zi "
955                     "in a string of %zi characters",
956                     how_many, to_start, PyUnicode_GET_LENGTH(to));
957        return -1;
958    }
959    if (how_many == 0)
960        return 0;
961
962    if (_PyUnicode_Dirty(to))
963        return -1;
964
965    from_kind = PyUnicode_KIND(from);
966    from_data = PyUnicode_DATA(from);
967    to_kind = PyUnicode_KIND(to);
968    to_data = PyUnicode_DATA(to);
969
970    if (from_kind == to_kind
971        /* deny latin1 => ascii */
972        && !(!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
973    {
974        Py_MEMCPY((char*)to_data
975                      + PyUnicode_KIND_SIZE(to_kind, to_start),
976                  (char*)from_data
977                      + PyUnicode_KIND_SIZE(from_kind, from_start),
978                  PyUnicode_KIND_SIZE(to_kind, how_many));
979    }
980    else if (from_kind == PyUnicode_1BYTE_KIND
981             && to_kind == PyUnicode_2BYTE_KIND)
982    {
983        _PyUnicode_CONVERT_BYTES(
984            Py_UCS1, Py_UCS2,
985            PyUnicode_1BYTE_DATA(from) + from_start,
986            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
987            PyUnicode_2BYTE_DATA(to) + to_start
988            );
989    }
990    else if (from_kind == PyUnicode_1BYTE_KIND
991             && to_kind == PyUnicode_4BYTE_KIND)
992    {
993        _PyUnicode_CONVERT_BYTES(
994            Py_UCS1, Py_UCS4,
995            PyUnicode_1BYTE_DATA(from) + from_start,
996            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
997            PyUnicode_4BYTE_DATA(to) + to_start
998            );
999    }
1000    else if (from_kind == PyUnicode_2BYTE_KIND
1001             && to_kind == PyUnicode_4BYTE_KIND)
1002    {
1003        _PyUnicode_CONVERT_BYTES(
1004            Py_UCS2, Py_UCS4,
1005            PyUnicode_2BYTE_DATA(from) + from_start,
1006            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1007            PyUnicode_4BYTE_DATA(to) + to_start
1008            );
1009    }
1010    else {
1011        int invalid_kinds;
1012
1013        /* check if max_char(from substring) <= max_char(to) */
1014        if (from_kind > to_kind
1015                /* latin1 => ascii */
1016            || (!PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to)))
1017        {
1018            /* slow path to check for character overflow */
1019            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1020            Py_UCS4 ch, maxchar;
1021            Py_ssize_t i;
1022
1023            maxchar = 0;
1024            invalid_kinds = 0;
1025            for (i=0; i < how_many; i++) {
1026                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1027                if (ch > maxchar) {
1028                    maxchar = ch;
1029                    if (maxchar > to_maxchar) {
1030                        invalid_kinds = 1;
1031                        break;
1032                    }
1033                }
1034                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1035            }
1036        }
1037        else
1038            invalid_kinds = 1;
1039        if (invalid_kinds) {
1040            PyErr_Format(PyExc_SystemError,
1041                         "Cannot copy %s characters "
1042                         "into a string of %s characters",
1043                         unicode_kind_name(from),
1044                         unicode_kind_name(to));
1045            return -1;
1046        }
1047    }
1048    return how_many;
1049}
1050
1051/* Find the maximum code point and count the number of surrogate pairs so a
1052   correct string length can be computed before converting a string to UCS4.
1053   This function counts single surrogates as a character and not as a pair.
1054
1055   Return 0 on success, or -1 on error. */
1056static int
1057find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1058                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1059{
1060    const wchar_t *iter;
1061
1062    assert(num_surrogates != NULL && maxchar != NULL);
1063    if (num_surrogates == NULL || maxchar == NULL) {
1064        PyErr_SetString(PyExc_SystemError,
1065                        "unexpected NULL arguments to "
1066                        "PyUnicode_FindMaxCharAndNumSurrogatePairs");
1067        return -1;
1068    }
1069
1070    *num_surrogates = 0;
1071    *maxchar = 0;
1072
1073    for (iter = begin; iter < end; ) {
1074        if (*iter > *maxchar)
1075            *maxchar = *iter;
1076#if SIZEOF_WCHAR_T == 2
1077        if (*iter >= 0xD800 && *iter <= 0xDBFF
1078            && (iter+1) < end && iter[1] >= 0xDC00 && iter[1] <= 0xDFFF)
1079        {
1080            Py_UCS4 surrogate_val;
1081            surrogate_val = (((iter[0] & 0x3FF)<<10)
1082                             | (iter[1] & 0x3FF)) + 0x10000;
1083            ++(*num_surrogates);
1084            if (surrogate_val > *maxchar)
1085                *maxchar = surrogate_val;
1086            iter += 2;
1087        }
1088        else
1089            iter++;
1090#else
1091        iter++;
1092#endif
1093    }
1094    return 0;
1095}
1096
#ifdef Py_DEBUG
/* Debug builds only: incremented each time unicode_ready() is entered. */
int unicode_ready_calls = 0;
#endif
1100
/* Give a not-yet-ready string (one that only has a wchar_t representation)
   its canonical 1-, 2- or 4-byte representation.

   *p_obj is the string to convert.  When `replace` is true and the string
   is empty or a single character below 256, *p_obj is released and replaced
   by the shared empty string or the shared Latin-1 character object instead
   of being converted.  In release builds `replace` is silently ignored if
   the object has more than one reference; debug builds assert on it.

   Otherwise the narrowest kind able to hold the largest character is
   selected.  Depending on sizeof(wchar_t), the wchar_t buffer is either
   shared with the canonical data or converted into a freshly allocated
   buffer and then freed.

   Return 0 on success, -1 with an exception set on failure. */
static int
unicode_ready(PyObject **p_obj, int replace)
{
    PyUnicodeObject *unicode;
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    assert(p_obj != NULL);
    unicode = (PyUnicodeObject *)*p_obj;

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

#ifdef Py_DEBUG
    ++unicode_ready_calls;
#endif

    /* Replacing the object is only done when this is the sole reference. */
#ifdef Py_DEBUG
    assert(!replace || Py_REFCNT(unicode) == 1);
#else
    if (replace && Py_REFCNT(unicode) != 1)
        replace = 0;
#endif
    if (replace) {
        Py_ssize_t len = _PyUnicode_WSTR_LENGTH(unicode);
        wchar_t *wstr = _PyUnicode_WSTR(unicode);
        /* Optimization for empty strings */
        if (len == 0) {
            Py_INCREF(unicode_empty);
            Py_DECREF(*p_obj);
            *p_obj = unicode_empty;
            return 0;
        }
        /* Single Latin-1 character: hand out the shared object instead */
        if (len == 1 && wstr[0] < 256) {
            PyObject *latin1_char = get_latin1_char((unsigned char)wstr[0]);
            if (latin1_char == NULL)
                return -1;
            Py_DECREF(*p_obj);
            *p_obj = latin1_char;
            return 0;
        }
    }

    /* Determine the widest character to pick the storage kind. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: narrow into a new buffer (+1 for the terminator)
           and drop the wchar_t buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* pure ASCII: the data buffer doubles as the UTF-8 form */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bit, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair collapses into a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    return 0;
}
1258
1259int
1260_PyUnicode_ReadyReplace(PyObject **op)
1261{
1262    return unicode_ready(op, 1);
1263}
1264
1265int
1266_PyUnicode_Ready(PyObject *op)
1267{
1268    return unicode_ready(&op, 0);
1269}
1270
1271static void
1272unicode_dealloc(register PyUnicodeObject *unicode)
1273{
1274    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1275    case SSTATE_NOT_INTERNED:
1276        break;
1277
1278    case SSTATE_INTERNED_MORTAL:
1279        /* revive dead object temporarily for DelItem */
1280        Py_REFCNT(unicode) = 3;
1281        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
1282            Py_FatalError(
1283                "deletion of interned string failed");
1284        break;
1285
1286    case SSTATE_INTERNED_IMMORTAL:
1287        Py_FatalError("Immortal interned string died.");
1288
1289    default:
1290        Py_FatalError("Inconsistent interned string state.");
1291    }
1292
1293    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1294        PyObject_DEL(_PyUnicode_WSTR(unicode));
1295    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1296        PyObject_DEL(_PyUnicode_UTF8(unicode));
1297
1298    if (PyUnicode_IS_COMPACT(unicode)) {
1299        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1300    }
1301    else {
1302        if (_PyUnicode_DATA_ANY(unicode))
1303            PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1304        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
1305    }
1306}
1307
1308static int
1309unicode_resizable(PyObject *unicode)
1310{
1311    if (Py_REFCNT(unicode) != 1)
1312        return 0;
1313    if (PyUnicode_CHECK_INTERNED(unicode))
1314        return 0;
1315    assert(unicode != unicode_empty);
1316#ifdef Py_DEBUG
1317    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND
1318        && PyUnicode_GET_LENGTH(unicode) == 1)
1319    {
1320        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1321        if (ch < 256 && unicode_latin1[ch] == unicode)
1322            return 0;
1323    }
1324#endif
1325    return 1;
1326}
1327
1328static int
1329unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1330{
1331    PyObject *unicode;
1332    Py_ssize_t old_length;
1333
1334    assert(p_unicode != NULL);
1335    unicode = *p_unicode;
1336
1337    assert(unicode != NULL);
1338    assert(PyUnicode_Check(unicode));
1339    assert(0 <= length);
1340
1341    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1342        old_length = PyUnicode_WSTR_LENGTH(unicode);
1343    else
1344        old_length = PyUnicode_GET_LENGTH(unicode);
1345    if (old_length == length)
1346        return 0;
1347
1348    if (!unicode_resizable(unicode)) {
1349        PyObject *copy = resize_copy(unicode, length);
1350        if (copy == NULL)
1351            return -1;
1352        Py_DECREF(*p_unicode);
1353        *p_unicode = copy;
1354        return 0;
1355    }
1356
1357    if (PyUnicode_IS_COMPACT(unicode)) {
1358        *p_unicode = resize_compact(unicode, length);
1359        if (*p_unicode == NULL)
1360            return -1;
1361        _PyUnicode_CheckConsistency(*p_unicode);
1362        return 0;
1363    }
1364    return resize_inplace((PyUnicodeObject*)unicode, length);
1365}
1366
1367int
1368PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1369{
1370    PyObject *unicode;
1371    if (p_unicode == NULL) {
1372        PyErr_BadInternalCall();
1373        return -1;
1374    }
1375    unicode = *p_unicode;
1376    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0
1377        || _PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND)
1378    {
1379        PyErr_BadInternalCall();
1380        return -1;
1381    }
1382    return unicode_resize(p_unicode, length);
1383}
1384
1385static PyObject*
1386get_latin1_char(unsigned char ch)
1387{
1388    PyObject *unicode = unicode_latin1[ch];
1389    if (!unicode) {
1390        unicode = PyUnicode_New(1, ch);
1391        if (!unicode)
1392            return NULL;
1393        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1394        unicode_latin1[ch] = unicode;
1395    }
1396    Py_INCREF(unicode);
1397    return unicode;
1398}
1399
1400PyObject *
1401PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1402{
1403    PyUnicodeObject *unicode;
1404    Py_UCS4 maxchar = 0;
1405    Py_ssize_t num_surrogates;
1406
1407    if (u == NULL)
1408        return (PyObject*)_PyUnicode_New(size);
1409
1410    /* If the Unicode data is known at construction time, we can apply
1411       some optimizations which share commonly used objects. */
1412
1413    /* Optimization for empty strings */
1414    if (size == 0 && unicode_empty != NULL) {
1415        Py_INCREF(unicode_empty);
1416        return unicode_empty;
1417    }
1418
1419    /* Single character Unicode objects in the Latin-1 range are
1420       shared when using this constructor */
1421    if (size == 1 && *u < 256)
1422        return get_latin1_char((unsigned char)*u);
1423
1424    /* If not empty and not single character, copy the Unicode data
1425       into the new object */
1426    if (find_maxchar_surrogates(u, u + size,
1427                                &maxchar, &num_surrogates) == -1)
1428        return NULL;
1429
1430    unicode = (PyUnicodeObject *) PyUnicode_New(size - num_surrogates,
1431                                                maxchar);
1432    if (!unicode)
1433        return NULL;
1434
1435    switch (PyUnicode_KIND(unicode)) {
1436    case PyUnicode_1BYTE_KIND:
1437        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1438                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1439        break;
1440    case PyUnicode_2BYTE_KIND:
1441#if Py_UNICODE_SIZE == 2
1442        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1443#else
1444        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1445                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1446#endif
1447        break;
1448    case PyUnicode_4BYTE_KIND:
1449#if SIZEOF_WCHAR_T == 2
1450        /* This is the only case which has to process surrogates, thus
1451           a simple copy loop is not enough and we need a function. */
1452        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1453#else
1454        assert(num_surrogates == 0);
1455        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1456#endif
1457        break;
1458    default:
1459        assert(0 && "Impossible state");
1460    }
1461
1462    return (PyObject *)unicode;
1463}
1464
1465PyObject *
1466PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1467{
1468    PyUnicodeObject *unicode;
1469
1470    if (size < 0) {
1471        PyErr_SetString(PyExc_SystemError,
1472                        "Negative size passed to PyUnicode_FromStringAndSize");
1473        return NULL;
1474    }
1475
1476    /* If the Unicode data is known at construction time, we can apply
1477       some optimizations which share commonly used objects.
1478       Also, this means the input must be UTF-8, so fall back to the
1479       UTF-8 decoder at the end. */
1480    if (u != NULL) {
1481
1482        /* Optimization for empty strings */
1483        if (size == 0 && unicode_empty != NULL) {
1484            Py_INCREF(unicode_empty);
1485            return unicode_empty;
1486        }
1487
1488        /* Single characters are shared when using this constructor.
1489           Restrict to ASCII, since the input must be UTF-8. */
1490        if (size == 1 && Py_CHARMASK(*u) < 128)
1491            return get_latin1_char(Py_CHARMASK(*u));
1492
1493        return PyUnicode_DecodeUTF8(u, size, NULL);
1494    }
1495
1496    unicode = _PyUnicode_New(size);
1497    if (!unicode)
1498        return NULL;
1499
1500    return (PyObject *)unicode;
1501}
1502
1503PyObject *
1504PyUnicode_FromString(const char *u)
1505{
1506    size_t size = strlen(u);
1507    if (size > PY_SSIZE_T_MAX) {
1508        PyErr_SetString(PyExc_OverflowError, "input too long");
1509        return NULL;
1510    }
1511
1512    return PyUnicode_FromStringAndSize(u, size);
1513}
1514
1515static PyObject*
1516unicode_fromascii(const unsigned char* u, Py_ssize_t size)
1517{
1518    PyObject *res = PyUnicode_New(size, 127);
1519    if (!res)
1520        return NULL;
1521    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1522    return res;
1523}
1524
1525static PyObject*
1526_PyUnicode_FromUCS1(const unsigned char* u, Py_ssize_t size)
1527{
1528    PyObject *res;
1529    unsigned char max_char = 127;
1530    Py_ssize_t i;
1531
1532    assert(size >= 0);
1533    for (i = 0; i < size; i++) {
1534        if (u[i] & 0x80) {
1535            max_char = 255;
1536            break;
1537        }
1538    }
1539    res = PyUnicode_New(size, max_char);
1540    if (!res)
1541        return NULL;
1542    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1543    return res;
1544}
1545
1546static PyObject*
1547_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1548{
1549    PyObject *res;
1550    Py_UCS2 max_char = 0;
1551    Py_ssize_t i;
1552
1553    assert(size >= 0);
1554    for (i = 0; i < size; i++) {
1555        if (u[i] > max_char) {
1556            max_char = u[i];
1557            if (max_char >= 256)
1558                break;
1559        }
1560    }
1561    res = PyUnicode_New(size, max_char);
1562    if (!res)
1563        return NULL;
1564    if (max_char >= 256)
1565        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1566    else
1567        for (i = 0; i < size; i++)
1568            PyUnicode_1BYTE_DATA(res)[i] = (Py_UCS1)u[i];
1569    return res;
1570}
1571
1572static PyObject*
1573_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1574{
1575    PyObject *res;
1576    Py_UCS4 max_char = 0;
1577    Py_ssize_t i;
1578
1579    assert(size >= 0);
1580    for (i = 0; i < size; i++) {
1581        if (u[i] > max_char) {
1582            max_char = u[i];
1583            if (max_char >= 0x10000)
1584                break;
1585        }
1586    }
1587    res = PyUnicode_New(size, max_char);
1588    if (!res)
1589        return NULL;
1590    if (max_char >= 0x10000)
1591        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
1592    else {
1593        int kind = PyUnicode_KIND(res);
1594        void *data = PyUnicode_DATA(res);
1595        for (i = 0; i < size; i++)
1596            PyUnicode_WRITE(kind, data, i, u[i]);
1597    }
1598    return res;
1599}
1600
1601PyObject*
1602PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
1603{
1604    switch(kind) {
1605    case PyUnicode_1BYTE_KIND:
1606        return _PyUnicode_FromUCS1(buffer, size);
1607    case PyUnicode_2BYTE_KIND:
1608        return _PyUnicode_FromUCS2(buffer, size);
1609    case PyUnicode_4BYTE_KIND:
1610        return _PyUnicode_FromUCS4(buffer, size);
1611    default:
1612        assert(0 && "invalid kind");
1613        PyErr_SetString(PyExc_SystemError, "invalid kind");
1614        return NULL;
1615    }
1616}
1617
1618PyObject*
1619PyUnicode_Copy(PyObject *unicode)
1620{
1621    Py_ssize_t size;
1622    PyObject *copy;
1623    void *data;
1624
1625    if (!PyUnicode_Check(unicode)) {
1626        PyErr_BadInternalCall();
1627        return NULL;
1628    }
1629    if (PyUnicode_READY(unicode))
1630        return NULL;
1631
1632    size = PyUnicode_GET_LENGTH(unicode);
1633    copy = PyUnicode_New(size, PyUnicode_MAX_CHAR_VALUE(unicode));
1634    if (!copy)
1635        return NULL;
1636    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
1637
1638    data = PyUnicode_DATA(unicode);
1639    switch (PyUnicode_KIND(unicode))
1640    {
1641    case PyUnicode_1BYTE_KIND:
1642        memcpy(PyUnicode_1BYTE_DATA(copy), data, size);
1643        break;
1644    case PyUnicode_2BYTE_KIND:
1645        memcpy(PyUnicode_2BYTE_DATA(copy), data, sizeof(Py_UCS2) * size);
1646        break;
1647    case PyUnicode_4BYTE_KIND:
1648        memcpy(PyUnicode_4BYTE_DATA(copy), data, sizeof(Py_UCS4) * size);
1649        break;
1650    default:
1651        assert(0);
1652        break;
1653    }
1654    return copy;
1655}
1656
1657
1658/* Widen Unicode objects to larger buffers. Don't write terminating null
1659   character. Return NULL on error. */
1660
1661void*
1662_PyUnicode_AsKind(PyObject *s, unsigned int kind)
1663{
1664    Py_ssize_t len;
1665    void *result;
1666    unsigned int skind;
1667
1668    if (PyUnicode_READY(s))
1669        return NULL;
1670
1671    len = PyUnicode_GET_LENGTH(s);
1672    skind = PyUnicode_KIND(s);
1673    if (skind >= kind) {
1674        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
1675        return NULL;
1676    }
1677    switch(kind) {
1678    case PyUnicode_2BYTE_KIND:
1679        result = PyMem_Malloc(len * sizeof(Py_UCS2));
1680        if (!result)
1681            return PyErr_NoMemory();
1682        assert(skind == PyUnicode_1BYTE_KIND);
1683        _PyUnicode_CONVERT_BYTES(
1684            Py_UCS1, Py_UCS2,
1685            PyUnicode_1BYTE_DATA(s),
1686            PyUnicode_1BYTE_DATA(s) + len,
1687            result);
1688        return result;
1689    case PyUnicode_4BYTE_KIND:
1690        result = PyMem_Malloc(len * sizeof(Py_UCS4));
1691        if (!result)
1692            return PyErr_NoMemory();
1693        if (skind == PyUnicode_2BYTE_KIND) {
1694            _PyUnicode_CONVERT_BYTES(
1695                Py_UCS2, Py_UCS4,
1696                PyUnicode_2BYTE_DATA(s),
1697                PyUnicode_2BYTE_DATA(s) + len,
1698                result);
1699        }
1700        else {
1701            assert(skind == PyUnicode_1BYTE_KIND);
1702            _PyUnicode_CONVERT_BYTES(
1703                Py_UCS1, Py_UCS4,
1704                PyUnicode_1BYTE_DATA(s),
1705                PyUnicode_1BYTE_DATA(s) + len,
1706                result);
1707        }
1708        return result;
1709    default:
1710        break;
1711    }
1712    PyErr_SetString(PyExc_SystemError, "invalid kind");
1713    return NULL;
1714}
1715
1716static Py_UCS4*
1717as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1718        int copy_null)
1719{
1720    int kind;
1721    void *data;
1722    Py_ssize_t len, targetlen;
1723    if (PyUnicode_READY(string) == -1)
1724        return NULL;
1725    kind = PyUnicode_KIND(string);
1726    data = PyUnicode_DATA(string);
1727    len = PyUnicode_GET_LENGTH(string);
1728    targetlen = len;
1729    if (copy_null)
1730        targetlen++;
1731    if (!target) {
1732        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
1733            PyErr_NoMemory();
1734            return NULL;
1735        }
1736        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
1737        if (!target) {
1738            PyErr_NoMemory();
1739            return NULL;
1740        }
1741    }
1742    else {
1743        if (targetsize < targetlen) {
1744            PyErr_Format(PyExc_SystemError,
1745                         "string is longer than the buffer");
1746            if (copy_null && 0 < targetsize)
1747                target[0] = 0;
1748            return NULL;
1749        }
1750    }
1751    if (kind != PyUnicode_4BYTE_KIND) {
1752        Py_ssize_t i;
1753        for (i = 0; i < len; i++)
1754            target[i] = PyUnicode_READ(kind, data, i);
1755    }
1756    else
1757        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
1758    if (copy_null)
1759        target[len] = 0;
1760    return target;
1761}
1762
1763Py_UCS4*
1764PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
1765                 int copy_null)
1766{
1767    if (target == NULL || targetsize < 1) {
1768        PyErr_BadInternalCall();
1769        return NULL;
1770    }
1771    return as_ucs4(string, target, targetsize, copy_null);
1772}
1773
1774Py_UCS4*
1775PyUnicode_AsUCS4Copy(PyObject *string)
1776{
1777    return as_ucs4(string, NULL, 0, 1);
1778}
1779
1780#ifdef HAVE_WCHAR_H
1781
1782PyObject *
1783PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
1784{
1785    if (w == NULL) {
1786        if (size == 0)
1787            return PyUnicode_New(0, 0);
1788        PyErr_BadInternalCall();
1789        return NULL;
1790    }
1791
1792    if (size == -1) {
1793        size = wcslen(w);
1794    }
1795
1796    return PyUnicode_FromUnicode(w, size);
1797}
1798
1799#endif /* HAVE_WCHAR_H */
1800
1801static void
1802makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
1803        int zeropad, int width, int precision, char c)
1804{
1805    *fmt++ = '%';
1806    if (width) {
1807        if (zeropad)
1808            *fmt++ = '0';
1809        fmt += sprintf(fmt, "%d", width);
1810    }
1811    if (precision)
1812        fmt += sprintf(fmt, ".%d", precision);
1813    if (longflag)
1814        *fmt++ = 'l';
1815    else if (longlongflag) {
1816        /* longlongflag should only ever be nonzero on machines with
1817           HAVE_LONG_LONG defined */
1818#ifdef HAVE_LONG_LONG
1819        char *f = PY_FORMAT_LONG_LONG;
1820        while (*f)
1821            *fmt++ = *f++;
1822#else
1823        /* we shouldn't ever get here */
1824        assert(0);
1825        *fmt++ = 'l';
1826#endif
1827    }
1828    else if (size_tflag) {
1829        char *f = PY_FORMAT_SIZE_T;
1830        while (*f)
1831            *fmt++ = *f++;
1832    }
1833    *fmt++ = c;
1834    *fmt = '\0';
1835}
1836
/* helper for PyUnicode_FromFormatV()

   Parse the part of a format specification that follows the character `f`
   points to: an optional width, an optional ".precision", and an optional
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z") which
   is only recognised when directly followed by 'd', 'u' or 'i'.

   Each non-NULL output pointer receives the parsed value (0 when absent).
   The returned pointer is the position reached after parsing; when a
   length modifier was recognised it points to the conversion character
   that follows it. */

static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    int width = 0;
    int precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    f++;
    for (; Py_ISDIGIT((unsigned)*f); f++)
        width = (width*10) + *f - '0';
    if (*f == '.') {
        f++;
        for (; Py_ISDIGIT((unsigned)*f); f++)
            precision = (precision*10) + *f - '0';
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.1" => go backward, f points to "1" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            is_long = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            is_longlong = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        is_size_t = 1;
        ++f;
    }

    /* report the results the caller asked for */
    if (p_width != NULL)
        *p_width = width;
    if (p_precision != NULL)
        *p_precision = precision;
    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return f;
}
1901
1902/* maximum number of characters required for output of %ld.  21 characters
1903   allows for 64-bit integers (in decimal) and an optional sign. */
1904#define MAX_LONG_CHARS 21
1905/* maximum number of characters required for output of %lld.
1906   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
1907   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
1908#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
1909
1910PyObject *
1911PyUnicode_FromFormatV(const char *format, va_list vargs)
1912{
1913    va_list count;
1914    Py_ssize_t callcount = 0;
1915    PyObject **callresults = NULL;
1916    PyObject **callresult = NULL;
1917    Py_ssize_t n = 0;
1918    int width = 0;
1919    int precision = 0;
1920    int zeropad;
1921    const char* f;
1922    PyUnicodeObject *string;
1923    /* used by sprintf */
1924    char fmt[61]; /* should be enough for %0width.precisionlld */
1925    Py_UCS4 maxchar = 127; /* result is ASCII by default */
1926    Py_UCS4 argmaxchar;
1927    Py_ssize_t numbersize = 0;
1928    char *numberresults = NULL;
1929    char *numberresult = NULL;
1930    Py_ssize_t i;
1931    int kind;
1932    void *data;
1933
1934    Py_VA_COPY(count, vargs);
1935    /* step 1: count the number of %S/%R/%A/%s format specifications
1936     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
1937     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
1938     * result in an array)
1939     * also esimate a upper bound for all the number formats in the string,
1940     * numbers will be formated in step 3 and be keept in a '\0'-separated
1941     * buffer before putting everything together. */
1942    for (f = format; *f; f++) {
1943        if (*f == '%') {
1944            int longlongflag;
1945            /* skip width or width.precision (eg. "1.2" of "%1.2f") */
1946            f = parse_format_flags(f, &width, NULL, NULL, &longlongflag, NULL);
1947            if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
1948                ++callcount;
1949
1950            else if (*f == 'd' || *f=='u' || *f=='i' || *f=='x' || *f=='p') {
1951#ifdef HAVE_LONG_LONG
1952                if (longlongflag) {
1953                    if (width < MAX_LONG_LONG_CHARS)
1954                        width = MAX_LONG_LONG_CHARS;
1955                }
1956                else
1957#endif
1958                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
1959                       including sign.  Decimal takes the most space.  This
1960                       isn't enough for octal.  If a width is specified we
1961                       need more (which we allocate later). */
1962                    if (width < MAX_LONG_CHARS)
1963                        width = MAX_LONG_CHARS;
1964
1965                /* account for the size + '\0' to separate numbers
1966                   inside of the numberresults buffer */
1967                numbersize += (width + 1);
1968            }
1969        }
1970        else if ((unsigned char)*f > 127) {
1971            PyErr_Format(PyExc_ValueError,
1972                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
1973                "string, got a non-ASCII byte: 0x%02x",
1974                (unsigned char)*f);
1975            return NULL;
1976        }
1977    }
1978    /* step 2: allocate memory for the results of
1979     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
1980    if (callcount) {
1981        callresults = PyObject_Malloc(sizeof(PyObject *) * callcount);
1982        if (!callresults) {
1983            PyErr_NoMemory();
1984            return NULL;
1985        }
1986        callresult = callresults;
1987    }
1988    /* step 2.5: allocate memory for the results of formating numbers */
1989    if (numbersize) {
1990        numberresults = PyObject_Malloc(numbersize);
1991        if (!numberresults) {
1992            PyErr_NoMemory();
1993            goto fail;
1994        }
1995        numberresult = numberresults;
1996    }
1997
1998    /* step 3: format numbers and figure out how large a buffer we need */
1999    for (f = format; *f; f++) {
2000        if (*f == '%') {
2001            const char* p;
2002            int longflag;
2003            int longlongflag;
2004            int size_tflag;
2005            int numprinted;
2006
2007            p = f;
2008            zeropad = (f[1] == '0');
2009            f = parse_format_flags(f, &width, &precision,
2010                                   &longflag, &longlongflag, &size_tflag);
2011            switch (*f) {
2012            case 'c':
2013            {
2014                Py_UCS4 ordinal = va_arg(count, int);
2015                maxchar = Py_MAX(maxchar, ordinal);
2016                n++;
2017                break;
2018            }
2019            case '%':
2020                n++;
2021                break;
2022            case 'i':
2023            case 'd':
2024                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2025                        width, precision, *f);
2026                if (longflag)
2027                    numprinted = sprintf(numberresult, fmt,
2028                                         va_arg(count, long));
2029#ifdef HAVE_LONG_LONG
2030                else if (longlongflag)
2031                    numprinted = sprintf(numberresult, fmt,
2032                                         va_arg(count, PY_LONG_LONG));
2033#endif
2034                else if (size_tflag)
2035                    numprinted = sprintf(numberresult, fmt,
2036                                         va_arg(count, Py_ssize_t));
2037                else
2038                    numprinted = sprintf(numberresult, fmt,
2039                                         va_arg(count, int));
2040                n += numprinted;
2041                /* advance by +1 to skip over the '\0' */
2042                numberresult += (numprinted + 1);
2043                assert(*(numberresult - 1) == '\0');
2044                assert(*(numberresult - 2) != '\0');
2045                assert(numprinted >= 0);
2046                assert(numberresult <= numberresults + numbersize);
2047                break;
2048            case 'u':
2049                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
2050                        width, precision, 'u');
2051                if (longflag)
2052                    numprinted = sprintf(numberresult, fmt,
2053                                         va_arg(count, unsigned long));
2054#ifdef HAVE_LONG_LONG
2055                else if (longlongflag)
2056                    numprinted = sprintf(numberresult, fmt,
2057                                         va_arg(count, unsigned PY_LONG_LONG));
2058#endif
2059                else if (size_tflag)
2060                    numprinted = sprintf(numberresult, fmt,
2061                                         va_arg(count, size_t));
2062                else
2063                    numprinted = sprintf(numberresult, fmt,
2064                                         va_arg(count, unsigned int));
2065                n += numprinted;
2066                numberresult += (numprinted + 1);
2067                assert(*(numberresult - 1) == '\0');
2068                assert(*(numberresult - 2) != '\0');
2069                assert(numprinted >= 0);
2070                assert(numberresult <= numberresults + numbersize);
2071                break;
2072            case 'x':
2073                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
2074                numprinted = sprintf(numberresult, fmt, va_arg(count, int));
2075                n += numprinted;
2076                numberresult += (numprinted + 1);
2077                assert(*(numberresult - 1) == '\0');
2078                assert(*(numberresult - 2) != '\0');
2079                assert(numprinted >= 0);
2080                assert(numberresult <= numberresults + numbersize);
2081                break;
2082            case 'p':
2083                numprinted = sprintf(numberresult, "%p", va_arg(count, void*));
2084                /* %p is ill-defined:  ensure leading 0x. */
2085                if (numberresult[1] == 'X')
2086                    numberresult[1] = 'x';
2087                else if (numberresult[1] != 'x') {
2088                    memmove(numberresult + 2, numberresult,
2089                            strlen(numberresult) + 1);
2090                    numberresult[0] = '0';
2091                    numberresult[1] = 'x';
2092                    numprinted += 2;
2093                }
2094                n += numprinted;
2095                numberresult += (numprinted + 1);
2096                assert(*(numberresult - 1) == '\0');
2097                assert(*(numberresult - 2) != '\0');
2098                assert(numprinted >= 0);
2099                assert(numberresult <= numberresults + numbersize);
2100                break;
2101            case 's':
2102            {
2103                /* UTF-8 */
2104                const char *s = va_arg(count, const char*);
2105                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
2106                if (!str)
2107                    goto fail;
2108                /* since PyUnicode_DecodeUTF8 returns already flexible
2109                   unicode objects, there is no need to call ready on them */
2110                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2111                maxchar = Py_MAX(maxchar, argmaxchar);
2112                n += PyUnicode_GET_LENGTH(str);
2113                /* Remember the str and switch to the next slot */
2114                *callresult++ = str;
2115                break;
2116            }
2117            case 'U':
2118            {
2119                PyObject *obj = va_arg(count, PyObject *);
2120                assert(obj && _PyUnicode_CHECK(obj));
2121                if (PyUnicode_READY(obj) == -1)
2122                    goto fail;
2123                argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2124                maxchar = Py_MAX(maxchar, argmaxchar);
2125                n += PyUnicode_GET_LENGTH(obj);
2126                break;
2127            }
2128            case 'V':
2129            {
2130                PyObject *obj = va_arg(count, PyObject *);
2131                const char *str = va_arg(count, const char *);
2132                PyObject *str_obj;
2133                assert(obj || str);
2134                assert(!obj || _PyUnicode_CHECK(obj));
2135                if (obj) {
2136                    if (PyUnicode_READY(obj) == -1)
2137                        goto fail;
2138                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(obj);
2139                    maxchar = Py_MAX(maxchar, argmaxchar);
2140                    n += PyUnicode_GET_LENGTH(obj);
2141                    *callresult++ = NULL;
2142                }
2143                else {
2144                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
2145                    if (!str_obj)
2146                        goto fail;
2147                    if (PyUnicode_READY(str_obj)) {
2148                        Py_DECREF(str_obj);
2149                        goto fail;
2150                    }
2151                    argmaxchar = PyUnicode_MAX_CHAR_VALUE(str_obj);
2152                    maxchar = Py_MAX(maxchar, argmaxchar);
2153                    n += PyUnicode_GET_LENGTH(str_obj);
2154                    *callresult++ = str_obj;
2155                }
2156                break;
2157            }
2158            case 'S':
2159            {
2160                PyObject *obj = va_arg(count, PyObject *);
2161                PyObject *str;
2162                assert(obj);
2163                str = PyObject_Str(obj);
2164                if (!str || PyUnicode_READY(str) == -1)
2165                    goto fail;
2166                argmaxchar = PyUnicode_MAX_CHAR_VALUE(str);
2167                maxchar = Py_MAX(maxchar, argmaxchar);
2168                n += PyUnicode_GET_LENGTH(str);
2169                /* Remember the str and switch to the next slot */
2170                *callresult++ = str;
2171                break;
2172            }
2173            case 'R':
2174            {
2175                PyObject *obj = va_arg(count, PyObject *);
2176                PyObject *repr;
2177                assert(obj);
2178                repr = PyObject_Repr(obj);
2179                if (!repr || PyUnicode_READY(repr) == -1)
2180                    goto fail;
2181                argmaxchar = PyUnicode_MAX_CHAR_VALUE(repr);
2182                maxchar = Py_MAX(maxchar, argmaxchar);
2183                n += PyUnicode_GET_LENGTH(repr);
2184                /* Remember the repr and switch to the next slot */
2185                *callresult++ = repr;
2186                break;
2187            }
2188            case 'A':
2189            {
2190                PyObject *obj = va_arg(count, PyObject *);
2191                PyObject *ascii;
2192                assert(obj);
2193                ascii = PyObject_ASCII(obj);
2194                if (!ascii || PyUnicode_READY(ascii) == -1)
2195                    goto fail;
2196                argmaxchar = PyUnicode_MAX_CHAR_VALUE(ascii);
2197                maxchar = Py_MAX(maxchar, argmaxchar);
2198                n += PyUnicode_GET_LENGTH(ascii);
2199                /* Remember the repr and switch to the next slot */
2200                *callresult++ = ascii;
2201                break;
2202            }
2203            default:
2204                /* if we stumble upon an unknown
2205                   formatting code, copy the rest of
2206                   the format string to the output
2207                   string. (we cannot just skip the
2208                   code, since there's no way to know
2209                   what's in the argument list) */
2210                n += strlen(p);
2211                goto expand;
2212            }
2213        } else
2214            n++;
2215    }
2216  expand:
2217    /* step 4: fill the buffer */
2218    /* Since we've analyzed how much space we need,
2219       we don't have to resize the string.
2220       There can be no errors beyond this point. */
2221    string = (PyUnicodeObject *)PyUnicode_New(n, maxchar);
2222    if (!string)
2223        goto fail;
2224    kind = PyUnicode_KIND(string);
2225    data = PyUnicode_DATA(string);
2226    callresult = callresults;
2227    numberresult = numberresults;
2228
2229    for (i = 0, f = format; *f; f++) {
2230        if (*f == '%') {
2231            const char* p;
2232
2233            p = f;
2234            f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
2235            /* checking for == because the last argument could be a empty
2236               string, which causes i to point to end, the assert at the end of
2237               the loop */
2238            assert(i <= PyUnicode_GET_LENGTH(string));
2239
2240            switch (*f) {
2241            case 'c':
2242            {
2243                const int ordinal = va_arg(vargs, int);
2244                PyUnicode_WRITE(kind, data, i++, ordinal);
2245                break;
2246            }
2247            case 'i':
2248            case 'd':
2249            case 'u':
2250            case 'x':
2251            case 'p':
2252                /* unused, since we already have the result */
2253                if (*f == 'p')
2254                    (void) va_arg(vargs, void *);
2255                else
2256                    (void) va_arg(vargs, int);
2257                /* extract the result from numberresults and append. */
2258                for (; *numberresult; ++i, ++numberresult)
2259                    PyUnicode_WRITE(kind, data, i, *numberresult);
2260                /* skip over the separating '\0' */
2261                assert(*numberresult == '\0');
2262                numberresult++;
2263                assert(numberresult <= numberresults + numbersize);
2264                break;
2265            case 's':
2266            {
2267                /* unused, since we already have the result */
2268                Py_ssize_t size;
2269                (void) va_arg(vargs, char *);
2270                size = PyUnicode_GET_LENGTH(*callresult);
2271                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2272                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2273                                             *callresult, 0,
2274                                             size) < 0)
2275                    goto fail;
2276                i += size;
2277                /* We're done with the unicode()/repr() => forget it */
2278                Py_DECREF(*callresult);
2279                /* switch to next unicode()/repr() result */
2280                ++callresult;
2281                break;
2282            }
2283            case 'U':
2284            {
2285                PyObject *obj = va_arg(vargs, PyObject *);
2286                Py_ssize_t size;
2287                assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2288                size = PyUnicode_GET_LENGTH(obj);
2289                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2290                                             obj, 0,
2291                                             size) < 0)
2292                    goto fail;
2293                i += size;
2294                break;
2295            }
2296            case 'V':
2297            {
2298                Py_ssize_t size;
2299                PyObject *obj = va_arg(vargs, PyObject *);
2300                va_arg(vargs, const char *);
2301                if (obj) {
2302                    size = PyUnicode_GET_LENGTH(obj);
2303                    assert(PyUnicode_KIND(obj) <= PyUnicode_KIND(string));
2304                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2305                                                 obj, 0,
2306                                                 size) < 0)
2307                        goto fail;
2308                    i += size;
2309                } else {
2310                    size = PyUnicode_GET_LENGTH(*callresult);
2311                    assert(PyUnicode_KIND(*callresult) <=
2312                           PyUnicode_KIND(string));
2313                    if (PyUnicode_CopyCharacters((PyObject*)string, i,
2314                                                 *callresult,
2315                                                 0, size) < 0)
2316                        goto fail;
2317                    i += size;
2318                    Py_DECREF(*callresult);
2319                }
2320                ++callresult;
2321                break;
2322            }
2323            case 'S':
2324            case 'R':
2325            case 'A':
2326            {
2327                /* unused, since we already have the result */
2328                (void) va_arg(vargs, PyObject *);
2329                assert(PyUnicode_KIND(*callresult) <= PyUnicode_KIND(string));
2330                if (PyUnicode_CopyCharacters((PyObject*)string, i,
2331                                             *callresult, 0,
2332                                             PyUnicode_GET_LENGTH(*callresult)) < 0)
2333                    goto fail;
2334                i += PyUnicode_GET_LENGTH(*callresult);
2335                /* We're done with the unicode()/repr() => forget it */
2336                Py_DECREF(*callresult);
2337                /* switch to next unicode()/repr() result */
2338                ++callresult;
2339                break;
2340            }
2341            case '%':
2342                PyUnicode_WRITE(kind, data, i++, '%');
2343                break;
2344            default:
2345                for (; *p; ++p, ++i)
2346                    PyUnicode_WRITE(kind, data, i, *p);
2347                assert(i == PyUnicode_GET_LENGTH(string));
2348                goto end;
2349            }
2350        }
2351        else {
2352            assert(i < PyUnicode_GET_LENGTH(string));
2353            PyUnicode_WRITE(kind, data, i++, *f);
2354        }
2355    }
2356    assert(i == PyUnicode_GET_LENGTH(string));
2357
2358  end:
2359    if (callresults)
2360        PyObject_Free(callresults);
2361    if (numberresults)
2362        PyObject_Free(numberresults);
2363    return (PyObject *)string;
2364  fail:
2365    if (callresults) {
2366        PyObject **callresult2 = callresults;
2367        while (callresult2 < callresult) {
2368            Py_XDECREF(*callresult2);
2369            ++callresult2;
2370        }
2371        PyObject_Free(callresults);
2372    }
2373    if (numberresults)
2374        PyObject_Free(numberresults);
2375    return NULL;
2376}
2377
2378PyObject *
2379PyUnicode_FromFormat(const char *format, ...)
2380{
2381    PyObject* ret;
2382    va_list vargs;
2383
2384#ifdef HAVE_STDARG_PROTOTYPES
2385    va_start(vargs, format);
2386#else
2387    va_start(vargs);
2388#endif
2389    ret = PyUnicode_FromFormatV(format, vargs);
2390    va_end(vargs);
2391    return ret;
2392}
2393
2394#ifdef HAVE_WCHAR_H
2395
2396/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2397   convert a Unicode object to a wide character string.
2398
2399   - If w is NULL: return the number of wide characters (including the null
2400     character) required to convert the unicode object. Ignore size argument.
2401
2402   - Otherwise: return the number of wide characters (excluding the null
2403     character) written into w. Write at most size wide characters (including
2404     the null character). */
2405static Py_ssize_t
2406unicode_aswidechar(PyUnicodeObject *unicode,
2407                   wchar_t *w,
2408                   Py_ssize_t size)
2409{
2410    Py_ssize_t res;
2411    const wchar_t *wstr;
2412
2413    wstr = PyUnicode_AsUnicodeAndSize((PyObject *)unicode, &res);
2414    if (wstr == NULL)
2415        return -1;
2416
2417    if (w != NULL) {
2418        if (size > res)
2419            size = res + 1;
2420        else
2421            res = size;
2422        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2423        return res;
2424    }
2425    else
2426        return res + 1;
2427}
2428
2429Py_ssize_t
2430PyUnicode_AsWideChar(PyObject *unicode,
2431                     wchar_t *w,
2432                     Py_ssize_t size)
2433{
2434    if (unicode == NULL) {
2435        PyErr_BadInternalCall();
2436        return -1;
2437    }
2438    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
2439}
2440
2441wchar_t*
2442PyUnicode_AsWideCharString(PyObject *unicode,
2443                           Py_ssize_t *size)
2444{
2445    wchar_t* buffer;
2446    Py_ssize_t buflen;
2447
2448    if (unicode == NULL) {
2449        PyErr_BadInternalCall();
2450        return NULL;
2451    }
2452
2453    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
2454    if (buflen == -1)
2455        return NULL;
2456    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2457        PyErr_NoMemory();
2458        return NULL;
2459    }
2460
2461    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2462    if (buffer == NULL) {
2463        PyErr_NoMemory();
2464        return NULL;
2465    }
2466    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
2467    if (buflen == -1)
2468        return NULL;
2469    if (size != NULL)
2470        *size = buflen;
2471    return buffer;
2472}
2473
2474#endif /* HAVE_WCHAR_H */
2475
2476PyObject *
2477PyUnicode_FromOrdinal(int ordinal)
2478{
2479    PyObject *v;
2480    if (ordinal < 0 || ordinal > 0x10ffff) {
2481        PyErr_SetString(PyExc_ValueError,
2482                        "chr() arg not in range(0x110000)");
2483        return NULL;
2484    }
2485
2486    if (ordinal < 256)
2487        return get_latin1_char(ordinal);
2488
2489    v = PyUnicode_New(1, ordinal);
2490    if (v == NULL)
2491        return NULL;
2492    PyUnicode_WRITE(PyUnicode_KIND(v), PyUnicode_DATA(v), 0, ordinal);
2493    return v;
2494}
2495
2496PyObject *
2497PyUnicode_FromObject(register PyObject *obj)
2498{
2499    /* XXX Perhaps we should make this API an alias of
2500       PyObject_Str() instead ?! */
2501    if (PyUnicode_CheckExact(obj)) {
2502        if (PyUnicode_READY(obj))
2503            return NULL;
2504        Py_INCREF(obj);
2505        return obj;
2506    }
2507    if (PyUnicode_Check(obj)) {
2508        /* For a Unicode subtype that's not a Unicode object,
2509           return a true Unicode object with the same data. */
2510        return PyUnicode_Copy(obj);
2511    }
2512    PyErr_Format(PyExc_TypeError,
2513                 "Can't convert '%.100s' object to str implicitly",
2514                 Py_TYPE(obj)->tp_name);
2515    return NULL;
2516}
2517
2518PyObject *
2519PyUnicode_FromEncodedObject(register PyObject *obj,
2520                            const char *encoding,
2521                            const char *errors)
2522{
2523    Py_buffer buffer;
2524    PyObject *v;
2525
2526    if (obj == NULL) {
2527        PyErr_BadInternalCall();
2528        return NULL;
2529    }
2530
2531    /* Decoding bytes objects is the most common case and should be fast */
2532    if (PyBytes_Check(obj)) {
2533        if (PyBytes_GET_SIZE(obj) == 0) {
2534            Py_INCREF(unicode_empty);
2535            v = unicode_empty;
2536        }
2537        else {
2538            v = PyUnicode_Decode(
2539                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2540                    encoding, errors);
2541        }
2542        return v;
2543    }
2544
2545    if (PyUnicode_Check(obj)) {
2546        PyErr_SetString(PyExc_TypeError,
2547                        "decoding str is not supported");
2548        return NULL;
2549    }
2550
2551    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2552    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2553        PyErr_Format(PyExc_TypeError,
2554                     "coercing to str: need bytes, bytearray "
2555                     "or buffer-like object, %.80s found",
2556                     Py_TYPE(obj)->tp_name);
2557        return NULL;
2558    }
2559
2560    if (buffer.len == 0) {
2561        Py_INCREF(unicode_empty);
2562        v = unicode_empty;
2563    }
2564    else
2565        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2566
2567    PyBuffer_Release(&buffer);
2568    return v;
2569}
2570
/* Copy encoding into lower, converting upper case letters to lower case
   and '_' to '-' (so that e.g. "UTF_8" is recognized as "utf-8").

   lower has room for lower_len characters including the terminating '\0'.
   Return 1 on success, 0 if encoding is longer than lower_len-1
   characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* last slot of the buffer, reserved for the terminating '\0' */
    char *last = &lower[lower_len - 1];

    for (src = encoding; *src; src++) {
        if (dst == last)
            return 0;
        if (Py_ISUPPER(*src))
            *dst = Py_TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
2603
2604PyObject *
2605PyUnicode_Decode(const char *s,
2606                 Py_ssize_t size,
2607                 const char *encoding,
2608                 const char *errors)
2609{
2610    PyObject *buffer = NULL, *unicode;
2611    Py_buffer info;
2612    char lower[11];  /* Enough for any encoding shortcut */
2613
2614    if (encoding == NULL)
2615        return PyUnicode_DecodeUTF8(s, size, errors);
2616
2617    /* Shortcuts for common default encodings */
2618    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2619        if ((strcmp(lower, "utf-8") == 0) ||
2620            (strcmp(lower, "utf8") == 0))
2621            return PyUnicode_DecodeUTF8(s, size, errors);
2622        else if ((strcmp(lower, "latin-1") == 0) ||
2623                 (strcmp(lower, "latin1") == 0) ||
2624                 (strcmp(lower, "iso-8859-1") == 0))
2625            return PyUnicode_DecodeLatin1(s, size, errors);
2626#ifdef HAVE_MBCS
2627        else if (strcmp(lower, "mbcs") == 0)
2628            return PyUnicode_DecodeMBCS(s, size, errors);
2629#endif
2630        else if (strcmp(lower, "ascii") == 0)
2631            return PyUnicode_DecodeASCII(s, size, errors);
2632        else if (strcmp(lower, "utf-16") == 0)
2633            return PyUnicode_DecodeUTF16(s, size, errors, 0);
2634        else if (strcmp(lower, "utf-32") == 0)
2635            return PyUnicode_DecodeUTF32(s, size, errors, 0);
2636    }
2637
2638    /* Decode via the codec registry */
2639    buffer = NULL;
2640    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
2641        goto onError;
2642    buffer = PyMemoryView_FromBuffer(&info);
2643    if (buffer == NULL)
2644        goto onError;
2645    unicode = PyCodec_Decode(buffer, encoding, errors);
2646    if (unicode == NULL)
2647        goto onError;
2648    if (!PyUnicode_Check(unicode)) {
2649        PyErr_Format(PyExc_TypeError,
2650                     "decoder did not return a str object (type=%.400s)",
2651                     Py_TYPE(unicode)->tp_name);
2652        Py_DECREF(unicode);
2653        goto onError;
2654    }
2655    Py_DECREF(buffer);
2656#ifndef DONT_MAKE_RESULT_READY
2657    if (_PyUnicode_READY_REPLACE(&unicode)) {
2658        Py_DECREF(unicode);
2659        return NULL;
2660    }
2661#endif
2662    return unicode;
2663
2664  onError:
2665    Py_XDECREF(buffer);
2666    return NULL;
2667}
2668
2669PyObject *
2670PyUnicode_AsDecodedObject(PyObject *unicode,
2671                          const char *encoding,
2672                          const char *errors)
2673{
2674    PyObject *v;
2675
2676    if (!PyUnicode_Check(unicode)) {
2677        PyErr_BadArgument();
2678        goto onError;
2679    }
2680
2681    if (encoding == NULL)
2682        encoding = PyUnicode_GetDefaultEncoding();
2683
2684    /* Decode via the codec registry */
2685    v = PyCodec_Decode(unicode, encoding, errors);
2686    if (v == NULL)
2687        goto onError;
2688    return v;
2689
2690  onError:
2691    return NULL;
2692}
2693
2694PyObject *
2695PyUnicode_AsDecodedUnicode(PyObject *unicode,
2696                           const char *encoding,
2697                           const char *errors)
2698{
2699    PyObject *v;
2700
2701    if (!PyUnicode_Check(unicode)) {
2702        PyErr_BadArgument();
2703        goto onError;
2704    }
2705
2706    if (encoding == NULL)
2707        encoding = PyUnicode_GetDefaultEncoding();
2708
2709    /* Decode via the codec registry */
2710    v = PyCodec_Decode(unicode, encoding, errors);
2711    if (v == NULL)
2712        goto onError;
2713    if (!PyUnicode_Check(v)) {
2714        PyErr_Format(PyExc_TypeError,
2715                     "decoder did not return a str object (type=%.400s)",
2716                     Py_TYPE(v)->tp_name);
2717        Py_DECREF(v);
2718        goto onError;
2719    }
2720    return v;
2721
2722  onError:
2723    return NULL;
2724}
2725
2726PyObject *
2727PyUnicode_Encode(const Py_UNICODE *s,
2728                 Py_ssize_t size,
2729                 const char *encoding,
2730                 const char *errors)
2731{
2732    PyObject *v, *unicode;
2733
2734    unicode = PyUnicode_FromUnicode(s, size);
2735    if (unicode == NULL)
2736        return NULL;
2737    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
2738    Py_DECREF(unicode);
2739    return v;
2740}
2741
2742PyObject *
2743PyUnicode_AsEncodedObject(PyObject *unicode,
2744                          const char *encoding,
2745                          const char *errors)
2746{
2747    PyObject *v;
2748
2749    if (!PyUnicode_Check(unicode)) {
2750        PyErr_BadArgument();
2751        goto onError;
2752    }
2753
2754    if (encoding == NULL)
2755        encoding = PyUnicode_GetDefaultEncoding();
2756
2757    /* Encode via the codec registry */
2758    v = PyCodec_Encode(unicode, encoding, errors);
2759    if (v == NULL)
2760        goto onError;
2761    return v;
2762
2763  onError:
2764    return NULL;
2765}
2766
2767PyObject *
2768PyUnicode_EncodeFSDefault(PyObject *unicode)
2769{
2770#ifdef HAVE_MBCS
2771    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2772                                PyUnicode_GET_SIZE(unicode),
2773                                NULL);
2774#elif defined(__APPLE__)
2775    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
2776#else
2777    PyInterpreterState *interp = PyThreadState_GET()->interp;
2778    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2779       cannot use it to encode and decode filenames before it is loaded. Load
2780       the Python codec requires to encode at least its own filename. Use the C
2781       version of the locale codec until the codec registry is initialized and
2782       the Python codec is loaded.
2783
2784       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2785       cannot only rely on it: check also interp->fscodec_initialized for
2786       subinterpreters. */
2787    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2788        return PyUnicode_AsEncodedString(unicode,
2789                                         Py_FileSystemDefaultEncoding,
2790                                         "surrogateescape");
2791    }
2792    else {
2793        /* locale encoding with surrogateescape */
2794        wchar_t *wchar;
2795        char *bytes;
2796        PyObject *bytes_obj;
2797        size_t error_pos;
2798
2799        wchar = PyUnicode_AsWideCharString(unicode, NULL);
2800        if (wchar == NULL)
2801            return NULL;
2802        bytes = _Py_wchar2char(wchar, &error_pos);
2803        if (bytes == NULL) {
2804            if (error_pos != (size_t)-1) {
2805                char *errmsg = strerror(errno);
2806                PyObject *exc = NULL;
2807                if (errmsg == NULL)
2808                    errmsg = "Py_wchar2char() failed";
2809                raise_encode_exception(&exc,
2810                    "filesystemencoding",
2811                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
2812                    error_pos, error_pos+1,
2813                    errmsg);
2814                Py_XDECREF(exc);
2815            }
2816            else
2817                PyErr_NoMemory();
2818            PyMem_Free(wchar);
2819            return NULL;
2820        }
2821        PyMem_Free(wchar);
2822
2823        bytes_obj = PyBytes_FromString(bytes);
2824        PyMem_Free(bytes);
2825        return bytes_obj;
2826    }
2827#endif
2828}
2829
2830PyObject *
2831PyUnicode_AsEncodedString(PyObject *unicode,
2832                          const char *encoding,
2833                          const char *errors)
2834{
2835    PyObject *v;
2836    char lower[11];  /* Enough for any encoding shortcut */
2837
2838    if (!PyUnicode_Check(unicode)) {
2839        PyErr_BadArgument();
2840        return NULL;
2841    }
2842
2843    if (encoding == NULL) {
2844        if (errors == NULL || strcmp(errors, "strict") == 0)
2845            return _PyUnicode_AsUTF8String(unicode, NULL);
2846        else
2847            return _PyUnicode_AsUTF8String(unicode, errors);
2848    }
2849
2850    /* Shortcuts for common default encodings */
2851    if (normalize_encoding(encoding, lower, sizeof(lower))) {
2852        if ((strcmp(lower, "utf-8") == 0) ||
2853            (strcmp(lower, "utf8") == 0))
2854        {
2855            if (errors == NULL || strcmp(errors, "strict") == 0)
2856                return _PyUnicode_AsUTF8String(unicode, NULL);
2857            else
2858                return _PyUnicode_AsUTF8String(unicode, errors);
2859        }
2860        else if ((strcmp(lower, "latin-1") == 0) ||
2861                 (strcmp(lower, "latin1") == 0) ||
2862                 (strcmp(lower, "iso-8859-1") == 0))
2863            return _PyUnicode_AsLatin1String(unicode, errors);
2864#ifdef HAVE_MBCS
2865        else if (strcmp(lower, "mbcs") == 0)
2866            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
2867                                        PyUnicode_GET_SIZE(unicode),
2868                                        errors);
2869#endif
2870        else if (strcmp(lower, "ascii") == 0)
2871            return _PyUnicode_AsASCIIString(unicode, errors);
2872    }
2873
2874    /* Encode via the codec registry */
2875    v = PyCodec_Encode(unicode, encoding, errors);
2876    if (v == NULL)
2877        return NULL;
2878
2879    /* The normal path */
2880    if (PyBytes_Check(v))
2881        return v;
2882
2883    /* If the codec returns a buffer, raise a warning and convert to bytes */
2884    if (PyByteArray_Check(v)) {
2885        int error;
2886        PyObject *b;
2887
2888        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
2889            "encoder %s returned bytearray instead of bytes",
2890            encoding);
2891        if (error) {
2892            Py_DECREF(v);
2893            return NULL;
2894        }
2895
2896        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
2897        Py_DECREF(v);
2898        return b;
2899    }
2900
2901    PyErr_Format(PyExc_TypeError,
2902                 "encoder did not return a bytes object (type=%.400s)",
2903                 Py_TYPE(v)->tp_name);
2904    Py_DECREF(v);
2905    return NULL;
2906}
2907
2908PyObject *
2909PyUnicode_AsEncodedUnicode(PyObject *unicode,
2910                           const char *encoding,
2911                           const char *errors)
2912{
2913    PyObject *v;
2914
2915    if (!PyUnicode_Check(unicode)) {
2916        PyErr_BadArgument();
2917        goto onError;
2918    }
2919
2920    if (encoding == NULL)
2921        encoding = PyUnicode_GetDefaultEncoding();
2922
2923    /* Encode via the codec registry */
2924    v = PyCodec_Encode(unicode, encoding, errors);
2925    if (v == NULL)
2926        goto onError;
2927    if (!PyUnicode_Check(v)) {
2928        PyErr_Format(PyExc_TypeError,
2929                     "encoder did not return an str object (type=%.400s)",
2930                     Py_TYPE(v)->tp_name);
2931        Py_DECREF(v);
2932        goto onError;
2933    }
2934    return v;
2935
2936  onError:
2937    return NULL;
2938}
2939
2940PyObject*
2941PyUnicode_DecodeFSDefault(const char *s) {
2942    Py_ssize_t size = (Py_ssize_t)strlen(s);
2943    return PyUnicode_DecodeFSDefaultAndSize(s, size);
2944}
2945
2946PyObject*
2947PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
2948{
2949#ifdef HAVE_MBCS
2950    return PyUnicode_DecodeMBCS(s, size, NULL);
2951#elif defined(__APPLE__)
2952    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
2953#else
2954    PyInterpreterState *interp = PyThreadState_GET()->interp;
2955    /* Bootstrap check: if the filesystem codec is implemented in Python, we
2956       cannot use it to encode and decode filenames before it is loaded. Load
2957       the Python codec requires to encode at least its own filename. Use the C
2958       version of the locale codec until the codec registry is initialized and
2959       the Python codec is loaded.
2960
2961       Py_FileSystemDefaultEncoding is shared between all interpreters, we
2962       cannot only rely on it: check also interp->fscodec_initialized for
2963       subinterpreters. */
2964    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
2965        return PyUnicode_Decode(s, size,
2966                                Py_FileSystemDefaultEncoding,
2967                                "surrogateescape");
2968    }
2969    else {
2970        /* locale encoding with surrogateescape */
2971        wchar_t *wchar;
2972        PyObject *unicode;
2973        size_t len;
2974
2975        if (s[size] != '\0' || size != strlen(s)) {
2976            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
2977            return NULL;
2978        }
2979
2980        wchar = _Py_char2wchar(s, &len);
2981        if (wchar == NULL)
2982            return PyErr_NoMemory();
2983
2984        unicode = PyUnicode_FromWideChar(wchar, len);
2985        PyMem_Free(wchar);
2986        return unicode;
2987    }
2988#endif
2989}
2990
2991
2992int
2993PyUnicode_FSConverter(PyObject* arg, void* addr)
2994{
2995    PyObject *output = NULL;
2996    Py_ssize_t size;
2997    void *data;
2998    if (arg == NULL) {
2999        Py_DECREF(*(PyObject**)addr);
3000        return 1;
3001    }
3002    if (PyBytes_Check(arg)) {
3003        output = arg;
3004        Py_INCREF(output);
3005    }
3006    else {
3007        arg = PyUnicode_FromObject(arg);
3008        if (!arg)
3009            return 0;
3010        output = PyUnicode_EncodeFSDefault(arg);
3011        Py_DECREF(arg);
3012        if (!output)
3013            return 0;
3014        if (!PyBytes_Check(output)) {
3015            Py_DECREF(output);
3016            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3017            return 0;
3018        }
3019    }
3020    size = PyBytes_GET_SIZE(output);
3021    data = PyBytes_AS_STRING(output);
3022    if (size != strlen(data)) {
3023        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3024        Py_DECREF(output);
3025        return 0;
3026    }
3027    *(PyObject**)addr = output;
3028    return Py_CLEANUP_SUPPORTED;
3029}
3030
3031
3032int
3033PyUnicode_FSDecoder(PyObject* arg, void* addr)
3034{
3035    PyObject *output = NULL;
3036    if (arg == NULL) {
3037        Py_DECREF(*(PyObject**)addr);
3038        return 1;
3039    }
3040    if (PyUnicode_Check(arg)) {
3041        if (PyUnicode_READY(arg))
3042            return 0;
3043        output = arg;
3044        Py_INCREF(output);
3045    }
3046    else {
3047        arg = PyBytes_FromObject(arg);
3048        if (!arg)
3049            return 0;
3050        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3051                                                  PyBytes_GET_SIZE(arg));
3052        Py_DECREF(arg);
3053        if (!output)
3054            return 0;
3055        if (!PyUnicode_Check(output)) {
3056            Py_DECREF(output);
3057            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3058            return 0;
3059        }
3060    }
3061    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3062                 PyUnicode_GET_LENGTH(output), 0, 1)) {
3063        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3064        Py_DECREF(output);
3065        return 0;
3066    }
3067    *(PyObject**)addr = output;
3068    return Py_CLEANUP_SUPPORTED;
3069}
3070
3071
3072char*
3073PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3074{
3075    PyObject *bytes;
3076    PyUnicodeObject *u = (PyUnicodeObject *)unicode;
3077
3078    if (!PyUnicode_Check(unicode)) {
3079        PyErr_BadArgument();
3080        return NULL;
3081    }
3082    if (PyUnicode_READY(u) == -1)
3083        return NULL;
3084
3085    if (PyUnicode_UTF8(unicode) == NULL) {
3086        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3087        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3088        if (bytes == NULL)
3089            return NULL;
3090        _PyUnicode_UTF8(u) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3091        if (_PyUnicode_UTF8(u) == NULL) {
3092            Py_DECREF(bytes);
3093            return NULL;
3094        }
3095        _PyUnicode_UTF8_LENGTH(u) = PyBytes_GET_SIZE(bytes);
3096        Py_MEMCPY(_PyUnicode_UTF8(u), PyBytes_AS_STRING(bytes), _PyUnicode_UTF8_LENGTH(u) + 1);
3097        Py_DECREF(bytes);
3098    }
3099
3100    if (psize)
3101        *psize = PyUnicode_UTF8_LENGTH(unicode);
3102    return PyUnicode_UTF8(unicode);
3103}
3104
3105char*
3106PyUnicode_AsUTF8(PyObject *unicode)
3107{
3108    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3109}
3110
3111#ifdef Py_DEBUG
3112int unicode_as_unicode_calls = 0;
3113#endif
3114
3115
3116Py_UNICODE *
3117PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3118{
3119    PyUnicodeObject *u;
3120    const unsigned char *one_byte;
3121#if SIZEOF_WCHAR_T == 4
3122    const Py_UCS2 *two_bytes;
3123#else
3124    const Py_UCS4 *four_bytes;
3125    const Py_UCS4 *ucs4_end;
3126    Py_ssize_t num_surrogates;
3127#endif
3128    wchar_t *w;
3129    wchar_t *wchar_end;
3130
3131    if (!PyUnicode_Check(unicode)) {
3132        PyErr_BadArgument();
3133        return NULL;
3134    }
3135    u = (PyUnicodeObject*)unicode;
3136    if (_PyUnicode_WSTR(u) == NULL) {
3137        /* Non-ASCII compact unicode object */
3138        assert(_PyUnicode_KIND(u) != 0);
3139        assert(PyUnicode_IS_READY(u));
3140
3141#ifdef Py_DEBUG
3142        ++unicode_as_unicode_calls;
3143#endif
3144
3145        if (PyUnicode_KIND(u) == PyUnicode_4BYTE_KIND) {
3146#if SIZEOF_WCHAR_T == 2
3147            four_bytes = PyUnicode_4BYTE_DATA(u);
3148            ucs4_end = four_bytes + _PyUnicode_LENGTH(u);
3149            num_surrogates = 0;
3150
3151            for (; four_bytes < ucs4_end; ++four_bytes) {
3152                if (*four_bytes > 0xFFFF)
3153                    ++num_surrogates;
3154            }
3155
3156            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(
3157                    sizeof(wchar_t) * (_PyUnicode_LENGTH(u) + 1 + num_surrogates));
3158            if (!_PyUnicode_WSTR(u)) {
3159                PyErr_NoMemory();
3160                return NULL;
3161            }
3162            _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u) + num_surrogates;
3163
3164            w = _PyUnicode_WSTR(u);
3165            wchar_end = w + _PyUnicode_WSTR_LENGTH(u);
3166            four_bytes = PyUnicode_4BYTE_DATA(u);
3167            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3168                if (*four_bytes > 0xFFFF) {
3169                    /* encode surrogate pair in this case */
3170                    *w++ = 0xD800 | ((*four_bytes - 0x10000) >> 10);
3171                    *w   = 0xDC00 | ((*four_bytes - 0x10000) & 0x3FF);
3172                }
3173                else
3174                    *w = *four_bytes;
3175
3176                if (w > wchar_end) {
3177                    assert(0 && "Miscalculated string end");
3178                }
3179            }
3180            *w = 0;
3181#else
3182            /* sizeof(wchar_t) == 4 */
3183            Py_FatalError("Impossible unicode object state, wstr and str "
3184                          "should share memory already.");
3185            return NULL;
3186#endif
3187        }
3188        else {
3189            _PyUnicode_WSTR(u) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3190                                                  (_PyUnicode_LENGTH(u) + 1));
3191            if (!_PyUnicode_WSTR(u)) {
3192                PyErr_NoMemory();
3193                return NULL;
3194            }
3195            if (!PyUnicode_IS_COMPACT_ASCII(u))
3196                _PyUnicode_WSTR_LENGTH(u) = _PyUnicode_LENGTH(u);
3197            w = _PyUnicode_WSTR(u);
3198            wchar_end = w + _PyUnicode_LENGTH(u);
3199
3200            if (PyUnicode_KIND(u) == PyUnicode_1BYTE_KIND) {
3201                one_byte = PyUnicode_1BYTE_DATA(u);
3202                for (; w < wchar_end; ++one_byte, ++w)
3203                    *w = *one_byte;
3204                /* null-terminate the wstr */
3205                *w = 0;
3206            }
3207            else if (PyUnicode_KIND(u) == PyUnicode_2BYTE_KIND) {
3208#if SIZEOF_WCHAR_T == 4
3209                two_bytes = PyUnicode_2BYTE_DATA(u);
3210                for (; w < wchar_end; ++two_bytes, ++w)
3211                    *w = *two_bytes;
3212                /* null-terminate the wstr */
3213                *w = 0;
3214#else
3215                /* sizeof(wchar_t) == 2 */
3216                PyObject_FREE(_PyUnicode_WSTR(u));
3217                _PyUnicode_WSTR(u) = NULL;
3218                Py_FatalError("Impossible unicode object state, wstr "
3219                              "and str should share memory already.");
3220                return NULL;
3221#endif
3222            }
3223            else {
3224                assert(0 && "This should never happen.");
3225            }
3226        }
3227    }
3228    if (size != NULL)
3229        *size = PyUnicode_WSTR_LENGTH(u);
3230    return _PyUnicode_WSTR(u);
3231}
3232
3233Py_UNICODE *
3234PyUnicode_AsUnicode(PyObject *unicode)
3235{
3236    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3237}
3238
3239
3240Py_ssize_t
3241PyUnicode_GetSize(PyObject *unicode)
3242{
3243    if (!PyUnicode_Check(unicode)) {
3244        PyErr_BadArgument();
3245        goto onError;
3246    }
3247    return PyUnicode_GET_SIZE(unicode);
3248
3249  onError:
3250    return -1;
3251}
3252
3253Py_ssize_t
3254PyUnicode_GetLength(PyObject *unicode)
3255{
3256    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3257        PyErr_BadArgument();
3258        return -1;
3259    }
3260
3261    return PyUnicode_GET_LENGTH(unicode);
3262}
3263
3264Py_UCS4
3265PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3266{
3267    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3268        PyErr_BadArgument();
3269        return (Py_UCS4)-1;
3270    }
3271    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3272        PyErr_SetString(PyExc_IndexError, "string index out of range");
3273        return (Py_UCS4)-1;
3274    }
3275    return PyUnicode_READ_CHAR(unicode, index);
3276}
3277
3278int
3279PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3280{
3281    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3282        PyErr_BadArgument();
3283        return -1;
3284    }
3285    if (index < 0 || index >= _PyUnicode_LENGTH(unicode)) {
3286        PyErr_SetString(PyExc_IndexError, "string index out of range");
3287        return -1;
3288    }
3289    if (_PyUnicode_Dirty(unicode))
3290        return -1;
3291    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3292                    index, ch);
3293    return 0;
3294}
3295
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3301
3302/* create or adjust a UnicodeDecodeError */
3303static void
3304make_decode_exception(PyObject **exceptionObject,
3305                      const char *encoding,
3306                      const char *input, Py_ssize_t length,
3307                      Py_ssize_t startpos, Py_ssize_t endpos,
3308                      const char *reason)
3309{
3310    if (*exceptionObject == NULL) {
3311        *exceptionObject = PyUnicodeDecodeError_Create(
3312            encoding, input, length, startpos, endpos, reason);
3313    }
3314    else {
3315        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3316            goto onError;
3317        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3318            goto onError;
3319        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3320            goto onError;
3321    }
3322    return;
3323
3324onError:
3325    Py_DECREF(*exceptionObject);
3326    *exceptionObject = NULL;
3327}
3328
3329/* error handling callback helper:
3330   build arguments, call the callback and check the arguments,
3331   if no exception occurred, copy the replacement to the output
3332   and adjust various state variables.
3333   return 0 on success, -1 on error
3334*/
3335
3336static int
3337unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
3338                                 const char *encoding, const char *reason,
3339                                 const char **input, const char **inend, Py_ssize_t *startinpos,
3340                                 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3341                                 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
3342{
3343    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
3344
3345    PyObject *restuple = NULL;
3346    PyObject *repunicode = NULL;
3347    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
3348    Py_ssize_t insize;
3349    Py_ssize_t requiredsize;
3350    Py_ssize_t newpos;
3351    const Py_UNICODE *repptr;
3352    PyObject *inputobj = NULL;
3353    Py_ssize_t repsize;
3354    int res = -1;
3355
3356    if (*errorHandler == NULL) {
3357        *errorHandler = PyCodec_LookupError(errors);
3358        if (*errorHandler == NULL)
3359            goto onError;
3360    }
3361
3362    make_decode_exception(exceptionObject,
3363        encoding,
3364        *input, *inend - *input,
3365        *startinpos, *endinpos,
3366        reason);
3367    if (*exceptionObject == NULL)
3368        goto onError;
3369
3370    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
3371    if (restuple == NULL)
3372        goto onError;
3373    if (!PyTuple_Check(restuple)) {
3374        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3375        goto onError;
3376    }
3377    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
3378        goto onError;
3379
3380    /* Copy back the bytes variables, which might have been modified by the
3381       callback */
3382    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
3383    if (!inputobj)
3384        goto onError;
3385    if (!PyBytes_Check(inputobj)) {
3386        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
3387    }
3388    *input = PyBytes_AS_STRING(inputobj);
3389    insize = PyBytes_GET_SIZE(inputobj);
3390    *inend = *input + insize;
3391    /* we can DECREF safely, as the exception has another reference,
3392       so the object won't go away. */
3393    Py_DECREF(inputobj);
3394
3395    if (newpos<0)
3396        newpos = insize+newpos;
3397    if (newpos<0 || newpos>insize) {
3398        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
3399        goto onError;
3400    }
3401
3402    /* need more space? (at least enough for what we
3403       have+the replacement+the rest of the string (starting
3404       at the new input position), so we won't have to check space
3405       when there are no errors in the rest of the string) */
3406    repptr = PyUnicode_AS_UNICODE(repunicode);
3407    repsize = PyUnicode_GET_SIZE(repunicode);
3408    requiredsize = *outpos + repsize + insize-newpos;
3409    if (requiredsize > outsize) {
3410        if (requiredsize<2*outsize)
3411            requiredsize = 2*outsize;
3412        if (PyUnicode_Resize((PyObject**)output, requiredsize) < 0)
3413            goto onError;
3414        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
3415    }
3416    *endinpos = newpos;
3417    *inptr = *input + newpos;
3418    Py_UNICODE_COPY(*outptr, repptr, repsize);
3419    *outptr += repsize;
3420    *outpos += repsize;
3421
3422    /* we made it! */
3423    res = 0;
3424
3425  onError:
3426    Py_XDECREF(restuple);
3427    return res;
3428}
3429
/* --- UTF-7 Codec -------------------------------------------------------- */

/* See RFC2152 for details.  We encode conservatively and decode liberally. */

/* Helper macros for the modified base-64 alphabet used by UTF-7.  Each of
   them may evaluate its argument more than once. */

/* True if c is one of the 64 base-64 characters. */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ((c) >= '0' && (c) <= '9') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= 'A' && (c) <= 'Z'))

/* Value (0..63) of the base-64 character c.  Only meaningful when
   IS_BASE64(c) holds: anything unrecognised falls through to 63. */

#define FROM_BASE64(c)                                                  \
    (((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     (c) == '+' ? 62 : 63)

/* Base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if this byte, met in a UTF-7 string outside a
 * base-64 section, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which begins a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) <= 127)
3464
/* Classification of the 128 ASCII characters for the UTF-7 encoder, as
 * defined by RFC2152.  utf7_category[c] is one of:
 *
 *   0  "Set D":      alphanumerics and ' ( ) , - . / : ?
 *   1  "Set O":      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2  whitespace:   ht nl cr sp
 *   3  special:      everything else, i.e. + \ ~ and the non-printing
 *                    codes 0-8, 11-12, 14-31 and 127; these must be
 *                    base-64 encoded
 */

static char utf7_category[128] = {
    /* 0x00-0x0f: control codes; ht (9), nl (10) and cr (13) are whitespace */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
    /* 0x10-0x1f: control codes */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
    /* 0x20-0x2f: sp ! " # $ % & ' ( ) * + , - . / */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
    /* 0x30-0x3f: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
    /* 0x40-0x4f: @ A B C D E F G H I J K L M N O */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x50-0x5f: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
    /* 0x60-0x6f: ` a b c d e f g h i j k l m n o */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
    /* 0x70-0x7f: p q r s t u v w x y z { | } ~ del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: true if character c should be encoded as itself.
 * Set D characters always are; Set O characters only when directO is
 * true and whitespace only when directWS is true.  RFC2152 leaves both
 * choices to the application, hence the two flags.  NUL and everything
 * from 128 up are never encoded directly.  The macro may evaluate c more
 * than once. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
3510
3511PyObject *
3512PyUnicode_DecodeUTF7(const char *s,
3513                     Py_ssize_t size,
3514                     const char *errors)
3515{
3516    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
3517}
3518
3519/* The decoder.  The only state we preserve is our read position,
3520 * i.e. how many characters we have consumed.  So if we end in the
3521 * middle of a shift sequence we have to back off the read position
3522 * and the output to the beginning of the sequence, otherwise we lose
3523 * all the shift state (seen bits, number of bits seen, high
3524 * surrogate). */
3525
3526PyObject *
3527PyUnicode_DecodeUTF7Stateful(const char *s,
3528                             Py_ssize_t size,
3529                             const char *errors,
3530                             Py_ssize_t *consumed)
3531{
3532    const char *starts = s;
3533    Py_ssize_t startinpos;
3534    Py_ssize_t endinpos;
3535    Py_ssize_t outpos;
3536    const char *e;
3537    PyUnicodeObject *unicode;
3538    Py_UNICODE *p;
3539    const char *errmsg = "";
3540    int inShift = 0;
3541    Py_UNICODE *shiftOutStart;
3542    unsigned int base64bits = 0;
3543    unsigned long base64buffer = 0;
3544    Py_UNICODE surrogate = 0;
3545    PyObject *errorHandler = NULL;
3546    PyObject *exc = NULL;
3547
3548    unicode = _PyUnicode_New(size);
3549    if (!unicode)
3550        return NULL;
3551    if (size == 0) {
3552        if (consumed)
3553            *consumed = 0;
3554        return (PyObject *)unicode;
3555    }
3556
3557    p = PyUnicode_AS_UNICODE(unicode);
3558    shiftOutStart = p;
3559    e = s + size;
3560
3561    while (s < e) {
3562        Py_UNICODE ch;
3563      restart:
3564        ch = (unsigned char) *s;
3565
3566        if (inShift) { /* in a base-64 section */
3567            if (IS_BASE64(ch)) { /* consume a base-64 character */
3568                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
3569                base64bits += 6;
3570                s++;
3571                if (base64bits >= 16) {
3572                    /* we have enough bits for a UTF-16 value */
3573                    Py_UNICODE outCh = (Py_UNICODE)
3574                                       (base64buffer >> (base64bits-16));
3575                    base64bits -= 16;
3576                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
3577                    if (surrogate) {
3578                        /* expecting a second surrogate */
3579                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3580#ifdef Py_UNICODE_WIDE
3581                            *p++ = (((surrogate & 0x3FF)<<10)
3582                                    | (outCh & 0x3FF)) + 0x10000;
3583#else
3584                            *p++ = surrogate;
3585                            *p++ = outCh;
3586#endif
3587                            surrogate = 0;
3588                        }
3589                        else {
3590                            surrogate = 0;
3591                            errmsg = "second surrogate missing";
3592                            goto utf7Error;
3593                        }
3594                    }
3595                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
3596                        /* first surrogate */
3597                        surrogate = outCh;
3598                    }
3599                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
3600                        errmsg = "unexpected second surrogate";
3601                        goto utf7Error;
3602                    }
3603                    else {
3604                        *p++ = outCh;
3605                    }
3606                }
3607            }
3608            else { /* now leaving a base-64 section */
3609                inShift = 0;
3610                s++;
3611                if (surrogate) {
3612                    errmsg = "second surrogate missing at end of shift sequence";
3613                    goto utf7Error;
3614                }
3615                if (base64bits > 0) { /* left-over bits */
3616                    if (base64bits >= 6) {
3617                        /* We've seen at least one base-64 character */
3618                        errmsg = "partial character in shift sequence";
3619                        goto utf7Error;
3620                    }
3621                    else {
3622                        /* Some bits remain; they should be zero */
3623                        if (base64buffer != 0) {
3624                            errmsg = "non-zero padding bits in shift sequence";
3625                            goto utf7Error;
3626                        }
3627                    }
3628                }
3629                if (ch != '-') {
3630                    /* '-' is absorbed; other terminating
3631                       characters are preserved */
3632                    *p++ = ch;
3633                }
3634            }
3635        }
3636        else if ( ch == '+' ) {
3637            startinpos = s-starts;
3638            s++; /* consume '+' */
3639            if (s < e && *s == '-') { /* '+-' encodes '+' */
3640                s++;
3641                *p++ = '+';
3642            }
3643            else { /* begin base64-encoded section */
3644                inShift = 1;
3645                shiftOutStart = p;
3646                base64bits = 0;
3647            }
3648        }
3649        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
3650            *p++ = ch;
3651            s++;
3652        }
3653        else {
3654            startinpos = s-starts;
3655            s++;
3656            errmsg = "unexpected special character";
3657            goto utf7Error;
3658        }
3659        continue;
3660utf7Error:
3661        outpos = p-PyUnicode_AS_UNICODE(unicode);
3662        endinpos = s-starts;
3663        if (unicode_decode_call_errorhandler(
3664                errors, &errorHandler,
3665                "utf7", errmsg,
3666                &starts, &e, &startinpos, &endinpos, &exc, &s,
3667                &unicode, &outpos, &p))
3668            goto onError;
3669    }
3670
3671    /* end of string */
3672
3673    if (inShift && !consumed) { /* in shift sequence, no more to follow */
3674        /* if we're in an inconsistent state, that's an error */
3675        if (surrogate ||
3676                (base64bits >= 6) ||
3677                (base64bits > 0 && base64buffer != 0)) {
3678            outpos = p-PyUnicode_AS_UNICODE(unicode);
3679            endinpos = size;
3680            if (unicode_decode_call_errorhandler(
3681                    errors, &errorHandler,
3682                    "utf7", "unterminated shift sequence",
3683                    &starts, &e, &startinpos, &endinpos, &exc, &s,
3684                    &unicode, &outpos, &p))
3685                goto onError;
3686            if (s < e)
3687                goto restart;
3688        }
3689    }
3690
3691    /* return state */
3692    if (consumed) {
3693        if (inShift) {
3694            p = shiftOutStart; /* back off output */
3695            *consumed = startinpos;
3696        }
3697        else {
3698            *consumed = s-starts;
3699        }
3700    }
3701
3702    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
3703        goto onError;
3704
3705    Py_XDECREF(errorHandler);
3706    Py_XDECREF(exc);
3707#ifndef DONT_MAKE_RESULT_READY
3708    if (_PyUnicode_READY_REPLACE(&unicode)) {
3709        Py_DECREF(unicode);
3710        return NULL;
3711    }
3712#endif
3713    return (PyObject *)unicode;
3714
3715  onError:
3716    Py_XDECREF(errorHandler);
3717    Py_XDECREF(exc);
3718    Py_DECREF(unicode);
3719    return NULL;
3720}
3721
3722
3723PyObject *
3724PyUnicode_EncodeUTF7(const Py_UNICODE *s,
3725                     Py_ssize_t size,
3726                     int base64SetO,
3727                     int base64WhiteSpace,
3728                     const char *errors)
3729{
3730    PyObject *v;
3731    /* It might be possible to tighten this worst case */
3732    Py_ssize_t allocated = 8 * size;
3733    int inShift = 0;
3734    Py_ssize_t i = 0;
3735    unsigned int base64bits = 0;
3736    unsigned long base64buffer = 0;
3737    char * out;
3738    char * start;
3739
3740    if (size == 0)
3741        return PyBytes_FromStringAndSize(NULL, 0);
3742
3743    if (allocated / 8 != size)
3744        return PyErr_NoMemory();
3745
3746    v = PyBytes_FromStringAndSize(NULL, allocated);
3747    if (v == NULL)
3748        return NULL;
3749
3750    start = out = PyBytes_AS_STRING(v);
3751    for (;i < size; ++i) {
3752        Py_UNICODE ch = s[i];
3753
3754        if (inShift) {
3755            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3756                /* shifting out */
3757                if (base64bits) { /* output remaining bits */
3758                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
3759                    base64buffer = 0;
3760                    base64bits = 0;
3761                }
3762                inShift = 0;
3763                /* Characters not in the BASE64 set implicitly unshift the sequence
3764                   so no '-' is required, except if the character is itself a '-' */
3765                if (IS_BASE64(ch) || ch == '-') {
3766                    *out++ = '-';
3767                }
3768                *out++ = (char) ch;
3769            }
3770            else {
3771                goto encode_char;
3772            }
3773        }
3774        else { /* not in a shift sequence */
3775            if (ch == '+') {
3776                *out++ = '+';
3777                        *out++ = '-';
3778            }
3779            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
3780                *out++ = (char) ch;
3781            }
3782            else {
3783                *out++ = '+';
3784                inShift = 1;
3785                goto encode_char;
3786            }
3787        }
3788        continue;
3789encode_char:
3790#ifdef Py_UNICODE_WIDE
3791        if (ch >= 0x10000) {
3792            /* code first surrogate */
3793            base64bits += 16;
3794            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
3795            while (base64bits >= 6) {
3796                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3797                base64bits -= 6;
3798            }
3799            /* prepare second surrogate */
3800            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
3801        }
3802#endif
3803        base64bits += 16;
3804        base64buffer = (base64buffer << 16) | ch;
3805        while (base64bits >= 6) {
3806            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
3807            base64bits -= 6;
3808        }
3809    }
3810    if (base64bits)
3811        *out++= TO_BASE64(base64buffer << (6-base64bits) );
3812    if (inShift)
3813        *out++ = '-';
3814    if (_PyBytes_Resize(&v, out - start) < 0)
3815        return NULL;
3816    return v;
3817}
3818
3819#undef IS_BASE64
3820#undef FROM_BASE64
3821#undef TO_BASE64
3822#undef DECODE_DIRECT
3823#undef ENCODE_DIRECT
3824
3825/* --- UTF-8 Codec -------------------------------------------------------- */
3826
/* Map a UTF-8 encoded prefix (first) byte to the total length in bytes of
   the sequence it starts.  Zero means illegal prefix: continuation bytes
   (80-BF), the overlong prefixes C0-C1, and F5-FF.  See RFC 3629 for
   details.  The table is only ever read, hence const. */
static const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
3848
3849PyObject *
3850PyUnicode_DecodeUTF8(const char *s,
3851                     Py_ssize_t size,
3852                     const char *errors)
3853{
3854    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3855}
3856
3857/* Mask to check or force alignment of a pointer to C 'long' boundaries */
3858#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
3859
3860/* Mask to quickly check whether a C 'long' contains a
3861   non-ASCII, UTF8-encoded char. */
3862#if (SIZEOF_LONG == 8)
3863# define ASCII_CHAR_MASK 0x8080808080808080L
3864#elif (SIZEOF_LONG == 4)
3865# define ASCII_CHAR_MASK 0x80808080L
3866#else
3867# error C 'long' size should be either 4 or 8!
3868#endif
3869
3870/* Scans a UTF-8 string and returns the maximum character to be expected,
3871   the size of the decoded unicode string and if any major errors were
3872   encountered.
3873
3874   This function does check basic UTF-8 sanity, it does however NOT CHECK
3875   if the string contains surrogates, and if all continuation bytes are
3876   within the correct ranges, these checks are performed in
3877   PyUnicode_DecodeUTF8Stateful.
3878
3879   If it sets has_errors to 1, it means the value of unicode_size and max_char
3880   will be bogus and you should not rely on useful information in them.
3881   */
3882static Py_UCS4
3883utf8_max_char_size_and_has_errors(const char *s, Py_ssize_t string_size,
3884                                  Py_ssize_t *unicode_size, Py_ssize_t* consumed,
3885                                  int *has_errors)
3886{
3887    Py_ssize_t n;
3888    Py_ssize_t char_count = 0;
3889    Py_UCS4 max_char = 127, new_max;
3890    Py_UCS4 upper_bound;
3891    const unsigned char *p = (const unsigned char *)s;
3892    const unsigned char *end = p + string_size;
3893    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
3894    int err = 0;
3895
3896    for (; p < end && !err; ++p, ++char_count) {
3897        /* Only check value if it's not a ASCII char... */
3898        if (*p < 0x80) {
3899            /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
3900               an explanation. */
3901            if (!((size_t) p & LONG_PTR_MASK)) {
3902                /* Help register allocation */
3903                register const unsigned char *_p = p;
3904                while (_p < aligned_end) {
3905                    unsigned long value = *(unsigned long *) _p;
3906                    if (value & ASCII_CHAR_MASK)
3907                        break;
3908                    _p += SIZEOF_LONG;
3909                    char_count += SIZEOF_LONG;
3910                }
3911                p = _p;
3912                if (p == end)
3913                    break;
3914            }
3915        }
3916        if (*p >= 0x80) {
3917            n = utf8_code_length[*p];
3918            new_max = max_char;
3919            switch (n) {
3920            /* invalid start byte */
3921            case 0:
3922                err = 1;
3923                break;
3924            case 2:
3925                /* Code points between 0x00FF and 0x07FF inclusive.
3926                   Approximate the upper bound of the code point,
3927                   if this flips over 255 we can be sure it will be more
3928                   than 255 and the string will need 2 bytes per code coint,
3929                   if it stays under or equal to 255, we can be sure 1 byte
3930                   is enough.
3931                   ((*p & 0b00011111) << 6) | 0b00111111 */
3932                upper_bound = ((*p & 0x1F) << 6) | 0x3F;
3933                if (max_char < upper_bound)
3934                    new_max = upper_bound;
3935                /* Ensure we track at least that we left ASCII space. */
3936                if (new_max < 128)
3937                    new_max = 128;
3938                break;
3939            case 3:
3940                /* Between 0x0FFF and 0xFFFF inclusive, so values are
3941                   always > 255 and <= 65535 and will always need 2 bytes. */
3942                if (max_char < 65535)
3943                    new_max = 65535;
3944                break;
3945            case 4:
3946                /* Code point will be above 0xFFFF for sure in this case. */
3947                new_max = 65537;
3948                break;
3949            /* Internal error, this should be caught by the first if */
3950            case 1:
3951            default:
3952                assert(0 && "Impossible case in utf8_max_char_and_size");
3953                err = 1;
3954            }
3955            /* Instead of number of overall bytes for this code point,
3956               n containts the number of following bytes: */
3957            --n;
3958            /* Check if the follow up chars are all valid continuation bytes */
3959            if (n >= 1) {
3960                const unsigned char *cont;
3961                if ((p + n) >= end) {
3962                    if (consumed == 0)
3963                        /* incomplete data, non-incremental decoding */
3964                        err = 1;
3965                    break;
3966                }
3967                for (cont = p + 1; cont < (p + n); ++cont) {
3968                    if ((*cont & 0xc0) != 0x80) {
3969                        err = 1;
3970                        break;
3971                    }
3972                }
3973                p += n;
3974            }
3975            else
3976                err = 1;
3977            max_char = new_max;
3978        }
3979    }
3980
3981    if (unicode_size)
3982        *unicode_size = char_count;
3983    if (has_errors)
3984        *has_errors = err;
3985    return max_char;
3986}
3987
/* Store `value` at position `index` of the character buffer `buf`,
   converting it to the element type selected by `kind`.  Works like
   PyUnicode_WRITE but additionally accepts PyUnicode_WCHAR_KIND, in which
   case `buf` is treated as a Py_UNICODE array (the wstr field of the legacy
   unicode representation).  Any kind other than the WCHAR, 1BYTE and 2BYTE
   ones is written as Py_UCS4.  `kind` is evaluated exactly once; `buf`,
   `index` and `value` are evaluated once, in the selected branch. */
#define WRITE_FLEXIBLE_OR_WSTR(kind, buf, index, value) \
    do { \
        const int write_kind_ = (kind); \
        switch (write_kind_) { \
        case PyUnicode_WCHAR_KIND: \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
            break; \
        case PyUnicode_1BYTE_KIND: \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
            break; \
        case PyUnicode_2BYTE_KIND: \
            ((Py_UCS2 *)(buf))[(index)] = (Py_UCS2)(value); \
            break; \
        default: \
            ((Py_UCS4 *)(buf))[(index)] = (Py_UCS4)(value); \
            break; \
        } \
    } while (0)
4002
4003PyObject *
4004PyUnicode_DecodeUTF8Stateful(const char *s,
4005                             Py_ssize_t size,
4006                             const char *errors,
4007                             Py_ssize_t *consumed)
4008{
4009    const char *starts = s;
4010    int n;
4011    int k;
4012    Py_ssize_t startinpos;
4013    Py_ssize_t endinpos;
4014    const char *e, *aligned_end;
4015    PyUnicodeObject *unicode;
4016    const char *errmsg = "";
4017    PyObject *errorHandler = NULL;
4018    PyObject *exc = NULL;
4019    Py_UCS4 maxchar = 0;
4020    Py_ssize_t unicode_size;
4021    Py_ssize_t i;
4022    int kind;
4023    void *data;
4024    int has_errors;
4025    Py_UNICODE *error_outptr;
4026#if SIZEOF_WCHAR_T == 2
4027    Py_ssize_t wchar_offset = 0;
4028#endif
4029
4030    if (size == 0) {
4031        if (consumed)
4032            *consumed = 0;
4033        return (PyObject *)PyUnicode_New(0, 0);
4034    }
4035    maxchar = utf8_max_char_size_and_has_errors(s, size, &unicode_size,
4036                                                consumed, &has_errors);
4037    if (has_errors) {
4038        unicode = _PyUnicode_New(size);
4039        if (!unicode)
4040            return NULL;
4041        kind = PyUnicode_WCHAR_KIND;
4042        data = PyUnicode_AS_UNICODE(unicode);
4043        assert(data != NULL);
4044    }
4045    else {
4046        unicode = (PyUnicodeObject *)PyUnicode_New(unicode_size, maxchar);
4047        if (!unicode)
4048            return NULL;
4049        /* When the string is ASCII only, just use memcpy and return.
4050           unicode_size may be != size if there is an incomplete UTF-8
4051           sequence at the end of the ASCII block.  */
4052        if (maxchar < 128 && size == unicode_size) {
4053            Py_MEMCPY(PyUnicode_1BYTE_DATA(unicode), s, unicode_size);
4054            return (PyObject *)unicode;
4055        }
4056        kind = PyUnicode_KIND(unicode);
4057        data = PyUnicode_DATA(unicode);
4058    }
4059    /* Unpack UTF-8 encoded data */
4060    i = 0;
4061    e = s + size;
4062    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4063
4064    while (s < e) {
4065        Py_UCS4 ch = (unsigned char)*s;
4066
4067        if (ch < 0x80) {
4068            /* Fast path for runs of ASCII characters. Given that common UTF-8
4069               input will consist of an overwhelming majority of ASCII
4070               characters, we try to optimize for this case by checking
4071               as many characters as a C 'long' can contain.
4072               First, check if we can do an aligned read, as most CPUs have
4073               a penalty for unaligned reads.
4074            */
4075            if (!((size_t) s & LONG_PTR_MASK)) {
4076                /* Help register allocation */
4077                register const char *_s = s;
4078                register Py_ssize_t _i = i;
4079                while (_s < aligned_end) {
4080                    /* Read a whole long at a time (either 4 or 8 bytes),
4081                       and do a fast unrolled copy if it only contains ASCII
4082                       characters. */
4083                    unsigned long value = *(unsigned long *) _s;
4084                    if (value & ASCII_CHAR_MASK)
4085                        break;
4086                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+0, _s[0]);
4087                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+1, _s[1]);
4088                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+2, _s[2]);
4089                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+3, _s[3]);
4090#if (SIZEOF_LONG == 8)
4091                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+4, _s[4]);
4092                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+5, _s[5]);
4093                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+6, _s[6]);
4094                    WRITE_FLEXIBLE_OR_WSTR(kind, data, _i+7, _s[7]);
4095#endif
4096                    _s += SIZEOF_LONG;
4097                    _i += SIZEOF_LONG;
4098                }
4099                s = _s;
4100                i = _i;
4101                if (s == e)
4102                    break;
4103                ch = (unsigned char)*s;
4104            }
4105        }
4106
4107        if (ch < 0x80) {
4108            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4109            s++;
4110            continue;
4111        }
4112
4113        n = utf8_code_length[ch];
4114
4115        if (s + n > e) {
4116            if (consumed)
4117                break;
4118            else {
4119                errmsg = "unexpected end of data";
4120                startinpos = s-starts;
4121                endinpos = startinpos+1;
4122                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
4123                    endinpos++;
4124                goto utf8Error;
4125            }
4126        }
4127
4128        switch (n) {
4129
4130        case 0:
4131            errmsg = "invalid start byte";
4132            startinpos = s-starts;
4133            endinpos = startinpos+1;
4134            goto utf8Error;
4135
4136        case 1:
4137            errmsg = "internal error";
4138            startinpos = s-starts;
4139            endinpos = startinpos+1;
4140            goto utf8Error;
4141
4142        case 2:
4143            if ((s[1] & 0xc0) != 0x80) {
4144                errmsg = "invalid continuation byte";
4145                startinpos = s-starts;
4146                endinpos = startinpos + 1;
4147                goto utf8Error;
4148            }
4149            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
4150            assert ((ch > 0x007F) && (ch <= 0x07FF));
4151            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4152            break;
4153
4154        case 3:
4155            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
4156               will result in surrogates in range d800-dfff. Surrogates are
4157               not valid UTF-8 so they are rejected.
4158               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
4159               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
4160            if ((s[1] & 0xc0) != 0x80 ||
4161                (s[2] & 0xc0) != 0x80 ||
4162                ((unsigned char)s[0] == 0xE0 &&
4163                 (unsigned char)s[1] < 0xA0) ||
4164                ((unsigned char)s[0] == 0xED &&
4165                 (unsigned char)s[1] > 0x9F)) {
4166                errmsg = "invalid continuation byte";
4167                startinpos = s-starts;
4168                endinpos = startinpos + 1;
4169
4170                /* if s[1] first two bits are 1 and 0, then the invalid
4171                   continuation byte is s[2], so increment endinpos by 1,
4172                   if not, s[1] is invalid and endinpos doesn't need to
4173                   be incremented. */
4174                if ((s[1] & 0xC0) == 0x80)
4175                    endinpos++;
4176                goto utf8Error;
4177            }
4178            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
4179            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
4180            WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4181            break;
4182
4183        case 4:
4184            if ((s[1] & 0xc0) != 0x80 ||
4185                (s[2] & 0xc0) != 0x80 ||
4186                (s[3] & 0xc0) != 0x80 ||
4187                ((unsigned char)s[0] == 0xF0 &&
4188                 (unsigned char)s[1] < 0x90) ||
4189                ((unsigned char)s[0] == 0xF4 &&
4190                 (unsigned char)s[1] > 0x8F)) {
4191                errmsg = "invalid continuation byte";
4192                startinpos = s-starts;
4193                endinpos = startinpos + 1;
4194                if ((s[1] & 0xC0) == 0x80) {
4195                    endinpos++;
4196                    if ((s[2] & 0xC0) == 0x80)
4197                        endinpos++;
4198                }
4199                goto utf8Error;
4200            }
4201            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
4202                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
4203            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
4204
4205            /* If the string is flexible or we have native UCS-4, write
4206               directly.. */
4207            if (sizeof(Py_UNICODE) > 2 || kind != PyUnicode_WCHAR_KIND)
4208                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++, ch);
4209
4210            else {
4211                /* compute and append the two surrogates: */
4212
4213                /* translate from 10000..10FFFF to 0..FFFF */
4214                ch -= 0x10000;
4215
4216                /* high surrogate = top 10 bits added to D800 */
4217                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4218                                       (Py_UNICODE)(0xD800 + (ch >> 10)));
4219
4220                /* low surrogate = bottom 10 bits added to DC00 */
4221                WRITE_FLEXIBLE_OR_WSTR(kind, data, i++,
4222                                       (Py_UNICODE)(0xDC00 + (ch & 0x03FF)));
4223            }
4224#if SIZEOF_WCHAR_T == 2
4225            wchar_offset++;
4226#endif
4227            break;
4228        }
4229        s += n;
4230        continue;
4231
4232      utf8Error:
4233        /* If this is not yet a resizable string, make it one.. */
4234        if (kind != PyUnicode_WCHAR_KIND) {
4235            const Py_UNICODE *u;
4236            PyUnicodeObject *new_unicode = _PyUnicode_New(size);
4237            if (!new_unicode)
4238                goto onError;
4239            u = PyUnicode_AsUnicode((PyObject *)unicode);
4240            if (!u)
4241                goto onError;
4242#if SIZEOF_WCHAR_T == 2
4243            i += wchar_offset;
4244#endif
4245            Py_UNICODE_COPY(PyUnicode_AS_UNICODE(new_unicode), u, i);
4246            Py_DECREF(unicode);
4247            unicode = new_unicode;
4248            kind = 0;
4249            data = PyUnicode_AS_UNICODE(new_unicode);
4250            assert(data != NULL);
4251        }
4252        error_outptr = PyUnicode_AS_UNICODE(unicode) + i;
4253        if (unicode_decode_call_errorhandler(
4254                errors, &errorHandler,
4255                "utf8", errmsg,
4256                &starts, &e, &startinpos, &endinpos, &exc, &s,
4257                &unicode, &i, &error_outptr))
4258            goto onError;
4259        /* Update data because unicode_decode_call_errorhandler might have
4260           re-created or resized the unicode object. */
4261        data = PyUnicode_AS_UNICODE(unicode);
4262        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
4263    }
4264    /* Ensure the unicode_size calculation above was correct: */
4265    assert(kind == PyUnicode_WCHAR_KIND || i == unicode_size);
4266
4267    if (consumed)
4268        *consumed = s-starts;
4269
4270    /* Adjust length and ready string when it contained errors and
4271       is of the old resizable kind. */
4272    if (kind == PyUnicode_WCHAR_KIND) {
4273        if (PyUnicode_Resize((PyObject**)&unicode, i) < 0)
4274            goto onError;
4275    }
4276
4277    Py_XDECREF(errorHandler);
4278    Py_XDECREF(exc);
4279#ifndef DONT_MAKE_RESULT_READY
4280    if (_PyUnicode_READY_REPLACE(&unicode)) {
4281        Py_DECREF(unicode);
4282        return NULL;
4283    }
4284#endif
4285    return (PyObject *)unicode;
4286
4287  onError:
4288    Py_XDECREF(errorHandler);
4289    Py_XDECREF(exc);
4290    Py_DECREF(unicode);
4291    return NULL;
4292}
4293
4294#undef WRITE_FLEXIBLE_OR_WSTR
4295
#ifdef __APPLE__

/* Simplified UTF-8 decoder with built-in surrogateescape behaviour, used to
   decode the command line arguments on Mac OS X.

   Decodes the `size` bytes at `s` into a NUL-terminated wchar_t array
   obtained from PyMem_Malloc().  A byte that does not start a well-formed
   sequence is never an error: it is stored as 0xDC00 + byte and decoding
   continues with the following byte.

   Returns NULL if the output size does not fit (after calling
   PyErr_NoMemory()) or if the allocation itself fails. */

wchar_t*
_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
{
    const char *end;
    wchar_t *result;
    Py_ssize_t outpos = 0;

    /* Note: size will always be longer than the resulting Unicode
       character count, so size + 1 slots hold the text plus terminator */
    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
        PyErr_NoMemory();
        return NULL;
    }
    result = PyMem_Malloc((size + 1) * sizeof(wchar_t));
    if (!result)
        return NULL;

    /* Unpack UTF-8 encoded data */
    end = s + size;
    while (s < end) {
        const Py_UCS4 lead = (unsigned char)*s;
        Py_UCS4 ch;
        int seqlen;

        /* ASCII maps to itself */
        if (lead < 0x80) {
            result[outpos++] = (wchar_t)lead;
            s++;
            continue;
        }

        seqlen = utf8_code_length[lead];
        if (s + seqlen > end) {
            /* sequence is cut off by the end of the input */
            goto escape_byte;
        }

        if (seqlen == 0 || seqlen == 1) {
            /* not a valid lead byte */
            goto escape_byte;
        }
        else if (seqlen == 2) {
            if ((s[1] & 0xc0) != 0x80)
                goto escape_byte;
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            result[outpos++] = (wchar_t)ch;
        }
        else if (seqlen == 3) {
            const unsigned char second = (unsigned char)s[1];

            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
               will result in surrogates in range d800-dfff. Surrogates are
               not valid UTF-8 so they are rejected.
               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (lead == 0xE0 && second < 0xA0) ||
                (lead == 0xED && second > 0x9F))
                goto escape_byte;
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            result[outpos++] = (wchar_t)ch;
        }
        else if (seqlen == 4) {
            const unsigned char second = (unsigned char)s[1];

            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                (lead == 0xF0 && second < 0x90) ||
                (lead == 0xF4 && second > 0x8F))
                goto escape_byte;
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#if SIZEOF_WCHAR_T == 4
            result[outpos++] = (wchar_t)ch;
#else
            /*  narrow wchar_t: store the code point as a surrogate pair */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            result[outpos++] = (wchar_t)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            result[outpos++] = (wchar_t)(0xDC00 + (ch & 0x03FF));
#endif
        }
        s += seqlen;
        continue;

      escape_byte:
        /* smuggle the offending byte through as 0xDC00 + byte and resume
           with the next input byte */
        result[outpos++] = 0xDC00 + lead;
        s++;
    }
    result[outpos] = L'\0';
    return result;
}

#endif /* __APPLE__ */
4410
4411/* Primary internal function which creates utf8 encoded bytes objects.
4412
4413   Allocation strategy:  if the string is short, convert into a stack buffer
4414   and allocate exactly as much space needed at the end.  Else allocate the
4415   maximum possible needed (4 result bytes per Unicode character), and return
4416   the excess memory at the end.
4417*/
4418PyObject *
4419_PyUnicode_AsUTF8String(PyObject *obj, const char *errors)
4420{
4421#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
4422
4423    Py_ssize_t i;                /* index into s of next input byte */
4424    PyObject *result;            /* result string object */
4425    char *p;                     /* next free byte in output buffer */
4426    Py_ssize_t nallocated;      /* number of result bytes allocated */
4427    Py_ssize_t nneeded;            /* number of result bytes needed */
4428    char stackbuf[MAX_SHORT_UNICHARS * 4];
4429    PyObject *errorHandler = NULL;
4430    PyObject *exc = NULL;
4431    int kind;
4432    void *data;
4433    Py_ssize_t size;
4434    PyUnicodeObject *unicode = (PyUnicodeObject *)obj;
4435#if SIZEOF_WCHAR_T == 2
4436    Py_ssize_t wchar_offset = 0;
4437#endif
4438
4439    if (!PyUnicode_Check(unicode)) {
4440        PyErr_BadArgument();
4441        return NULL;
4442    }
4443
4444    if (PyUnicode_READY(unicode) == -1)
4445        return NULL;
4446
4447    if (PyUnicode_UTF8(unicode))
4448        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4449                                         PyUnicode_UTF8_LENGTH(unicode));
4450
4451    kind = PyUnicode_KIND(unicode);
4452    data = PyUnicode_DATA(unicode);
4453    size = PyUnicode_GET_LENGTH(unicode);
4454
4455    assert(size >= 0);
4456
4457    if (size <= MAX_SHORT_UNICHARS) {
4458        /* Write into the stack buffer; nallocated can't overflow.
4459         * At the end, we'll allocate exactly as much heap space as it
4460         * turns out we need.
4461         */
4462        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
4463        result = NULL;   /* will allocate after we're done */
4464        p = stackbuf;
4465    }
4466    else {
4467        /* Overallocate on the heap, and give the excess back at the end. */
4468        nallocated = size * 4;
4469        if (nallocated / 4 != size)  /* overflow! */
4470            return PyErr_NoMemory();
4471        result = PyBytes_FromStringAndSize(NULL, nallocated);
4472        if (result == NULL)
4473            return NULL;
4474        p = PyBytes_AS_STRING(result);
4475    }
4476
4477    for (i = 0; i < size;) {
4478        Py_UCS4 ch = PyUnicode_READ(kind, data, i++);
4479
4480        if (ch < 0x80)
4481            /* Encode ASCII */
4482            *p++ = (char) ch;
4483
4484        else if (ch < 0x0800) {
4485            /* Encode Latin-1 */
4486            *p++ = (char)(0xc0 | (ch >> 6));
4487            *p++ = (char)(0x80 | (ch & 0x3f));
4488        } else if (0xD800 <= ch && ch <= 0xDFFF) {
4489            Py_ssize_t newpos;
4490            PyObject *rep;
4491            Py_ssize_t repsize, k, startpos;
4492            startpos = i-1;
4493#if SIZEOF_WCHAR_T == 2
4494            startpos += wchar_offset;
4495#endif
4496            rep = unicode_encode_call_errorhandler(
4497                  errors, &errorHandler, "utf-8", "surrogates not allowed",
4498                  PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
4499                  &exc, startpos, startpos+1, &newpos);
4500            if (!rep)
4501                goto error;
4502
4503            if (PyBytes_Check(rep))
4504                repsize = PyBytes_GET_SIZE(rep);
4505            else
4506                repsize = PyUnicode_GET_SIZE(rep);
4507
4508            if (repsize > 4) {
4509                Py_ssize_t offset;
4510
4511                if (result == NULL)
4512                    offset = p - stackbuf;
4513                else
4514                    offset = p - PyBytes_AS_STRING(result);
4515
4516                if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
4517                    /* integer overflow */
4518                    PyErr_NoMemory();
4519                    goto error;
4520                }
4521                nallocated += repsize - 4;
4522                if (result != NULL) {
4523                    if (_PyBytes_Resize(&result, nallocated) < 0)
4524                        goto error;
4525                } else {
4526                    result = PyBytes_FromStringAndSize(NULL, nallocated);
4527                    if (result == NULL)
4528                        goto error;
4529                    Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
4530                }
4531                p = PyBytes_AS_STRING(result) + offset;
4532            }
4533
4534            if (PyBytes_Check(rep)) {
4535                char *prep = PyBytes_AS_STRING(rep);
4536                for(k = repsize; k > 0; k--)
4537                    *p++ = *prep++;
4538            } else /* rep is unicode */ {
4539                const Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
4540                Py_UNICODE c;
4541
4542                for(k=0; k<repsize; k++) {
4543                    c = prep[k];
4544                    if (0x80 <= c) {
4545                        raise_encode_exception(&exc, "utf-8",
4546                                               PyUnicode_AS_UNICODE(unicode),
4547                                               size, i-1, i,
4548                                               "surrogates not allowed");
4549                        goto error;
4550                    }
4551                    *p++ = (char)prep[k];
4552                }
4553            }
4554            Py_DECREF(rep);
4555        } else if (ch < 0x10000) {
4556            *p++ = (char)(0xe0 | (ch >> 12));
4557            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4558            *p++ = (char)(0x80 | (ch & 0x3f));
4559        } else /* ch >= 0x10000 */ {
4560            /* Encode UCS4 Unicode ordinals */
4561            *p++ = (char)(0xf0 | (ch >> 18));
4562            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
4563            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
4564            *p++ = (char)(0x80 | (ch & 0x3f));
4565#if SIZEOF_WCHAR_T == 2
4566            wchar_offset++;
4567#endif
4568        }
4569    }
4570
4571    if (result == NULL) {
4572        /* This was stack allocated. */
4573        nneeded = p - stackbuf;
4574        assert(nneeded <= nallocated);
4575        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
4576    }
4577    else {
4578        /* Cut back to size actually needed. */
4579        nneeded = p - PyBytes_AS_STRING(result);
4580        assert(nneeded <= nallocated);
4581        _PyBytes_Resize(&result, nneeded);
4582    }
4583
4584    Py_XDECREF(errorHandler);
4585    Py_XDECREF(exc);
4586    return result;
4587 error:
4588    Py_XDECREF(errorHandler);
4589    Py_XDECREF(exc);
4590    Py_XDECREF(result);
4591    return NULL;
4592
4593#undef MAX_SHORT_UNICHARS
4594}
4595
4596PyObject *
4597PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4598                     Py_ssize_t size,
4599                     const char *errors)
4600{
4601    PyObject *v, *unicode;
4602
4603    unicode = PyUnicode_FromUnicode(s, size);
4604    if (unicode == NULL)
4605        return NULL;
4606    v = _PyUnicode_AsUTF8String(unicode, errors);
4607    Py_DECREF(unicode);
4608    return v;
4609}
4610
4611PyObject *
4612PyUnicode_AsUTF8String(PyObject *unicode)
4613{
4614    return _PyUnicode_AsUTF8String(unicode, NULL);
4615}
4616
4617/* --- UTF-32 Codec ------------------------------------------------------- */
4618
4619PyObject *
4620PyUnicode_DecodeUTF32(const char *s,
4621                      Py_ssize_t size,
4622                      const char *errors,
4623                      int *byteorder)
4624{
4625    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4626}
4627
4628PyObject *
4629PyUnicode_DecodeUTF32Stateful(const char *s,
4630                              Py_ssize_t size,
4631                              const char *errors,
4632                              int *byteorder,
4633                              Py_ssize_t *consumed)
4634{
4635    const char *starts = s;
4636    Py_ssize_t startinpos;
4637    Py_ssize_t endinpos;
4638    Py_ssize_t outpos;
4639    PyUnicodeObject *unicode;
4640    Py_UNICODE *p;
4641#ifndef Py_UNICODE_WIDE
4642    int pairs = 0;
4643    const unsigned char *qq;
4644#else
4645    const int pairs = 0;
4646#endif
4647    const unsigned char *q, *e;
4648    int bo = 0;       /* assume native ordering by default */
4649    const char *errmsg = "";
4650    /* Offsets from q for retrieving bytes in the right order. */
4651#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4652    int iorder[] = {0, 1, 2, 3};
4653#else
4654    int iorder[] = {3, 2, 1, 0};
4655#endif
4656    PyObject *errorHandler = NULL;
4657    PyObject *exc = NULL;
4658
4659    q = (unsigned char *)s;
4660    e = q + size;
4661
4662    if (byteorder)
4663        bo = *byteorder;
4664
4665    /* Check for BOM marks (U+FEFF) in the input and adjust current
4666       byte order setting accordingly. In native mode, the leading BOM
4667       mark is skipped, in all other modes, it is copied to the output
4668       stream as-is (giving a ZWNBSP character). */
4669    if (bo == 0) {
4670        if (size >= 4) {
4671            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4672                (q[iorder[1]] << 8) | q[iorder[0]];
4673#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4674            if (bom == 0x0000FEFF) {
4675                q += 4;
4676                bo = -1;
4677            }
4678            else if (bom == 0xFFFE0000) {
4679                q += 4;
4680                bo = 1;
4681            }
4682#else
4683            if (bom == 0x0000FEFF) {
4684                q += 4;
4685                bo = 1;
4686            }
4687            else if (bom == 0xFFFE0000) {
4688                q += 4;
4689                bo = -1;
4690            }
4691#endif
4692        }
4693    }
4694
4695    if (bo == -1) {
4696        /* force LE */
4697        iorder[0] = 0;
4698        iorder[1] = 1;
4699        iorder[2] = 2;
4700        iorder[3] = 3;
4701    }
4702    else if (bo == 1) {
4703        /* force BE */
4704        iorder[0] = 3;
4705        iorder[1] = 2;
4706        iorder[2] = 1;
4707        iorder[3] = 0;
4708    }
4709
4710    /* On narrow builds we split characters outside the BMP into two
4711       codepoints => count how much extra space we need. */
4712#ifndef Py_UNICODE_WIDE
4713    for (qq = q; qq < e; qq += 4)
4714        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
4715            pairs++;
4716#endif
4717
4718    /* This might be one to much, because of a BOM */
4719    unicode = _PyUnicode_New((size+3)/4+pairs);
4720    if (!unicode)
4721        return NULL;
4722    if (size == 0)
4723        return (PyObject *)unicode;
4724
4725    /* Unpack UTF-32 encoded data */
4726    p = PyUnicode_AS_UNICODE(unicode);
4727
4728    while (q < e) {
4729        Py_UCS4 ch;
4730        /* remaining bytes at the end? (size should be divisible by 4) */
4731        if (e-q<4) {
4732            if (consumed)
4733                break;
4734            errmsg = "truncated data";
4735            startinpos = ((const char *)q)-starts;
4736            endinpos = ((const char *)e)-starts;
4737            goto utf32Error;
4738            /* The remaining input chars are ignored if the callback
4739               chooses to skip the input */
4740        }
4741        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
4742            (q[iorder[1]] << 8) | q[iorder[0]];
4743
4744        if (ch >= 0x110000)
4745        {
4746            errmsg = "codepoint not in range(0x110000)";
4747            startinpos = ((const char *)q)-starts;
4748            endinpos = startinpos+4;
4749            goto utf32Error;
4750        }
4751#ifndef Py_UNICODE_WIDE
4752        if (ch >= 0x10000)
4753        {
4754            *p++ = 0xD800 | ((ch-0x10000) >> 10);
4755            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
4756        }
4757        else
4758#endif
4759            *p++ = ch;
4760        q += 4;
4761        continue;
4762      utf32Error:
4763        outpos = p-PyUnicode_AS_UNICODE(unicode);
4764        if (unicode_decode_call_errorhandler(
4765                errors, &errorHandler,
4766                "utf32", errmsg,
4767                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
4768                &unicode, &outpos, &p))
4769            goto onError;
4770    }
4771
4772    if (byteorder)
4773        *byteorder = bo;
4774
4775    if (consumed)
4776        *consumed = (const char *)q-starts;
4777
4778    /* Adjust length */
4779    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
4780        goto onError;
4781
4782    Py_XDECREF(errorHandler);
4783    Py_XDECREF(exc);
4784#ifndef DONT_MAKE_RESULT_READY
4785    if (_PyUnicode_READY_REPLACE(&unicode)) {
4786        Py_DECREF(unicode);
4787        return NULL;
4788    }
4789#endif
4790    return (PyObject *)unicode;
4791
4792  onError:
4793    Py_DECREF(unicode);
4794    Py_XDECREF(errorHandler);
4795    Py_XDECREF(exc);
4796    return NULL;
4797}
4798
4799PyObject *
4800PyUnicode_EncodeUTF32(const Py_UNICODE *s,
4801                      Py_ssize_t size,
4802                      const char *errors,
4803                      int byteorder)
4804{
4805    PyObject *v;
4806    unsigned char *p;
4807    Py_ssize_t nsize, bytesize;
4808#ifndef Py_UNICODE_WIDE
4809    Py_ssize_t i, pairs;
4810#else
4811    const int pairs = 0;
4812#endif
4813    /* Offsets from p for storing byte pairs in the right order. */
4814#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4815    int iorder[] = {0, 1, 2, 3};
4816#else
4817    int iorder[] = {3, 2, 1, 0};
4818#endif
4819
4820#define STORECHAR(CH)                           \
4821    do {                                        \
4822        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
4823        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
4824        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
4825        p[iorder[0]] = (CH) & 0xff;             \
4826        p += 4;                                 \
4827    } while(0)
4828
4829    /* In narrow builds we can output surrogate pairs as one codepoint,
4830       so we need less space. */
4831#ifndef Py_UNICODE_WIDE
4832    for (i = pairs = 0; i < size-1; i++)
4833        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
4834            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
4835            pairs++;
4836#endif
4837    nsize = (size - pairs + (byteorder == 0));
4838    bytesize = nsize * 4;
4839    if (bytesize / 4 != nsize)
4840        return PyErr_NoMemory();
4841    v = PyBytes_FromStringAndSize(NULL, bytesize);
4842    if (v == NULL)
4843        return NULL;
4844
4845    p = (unsigned char *)PyBytes_AS_STRING(v);
4846    if (byteorder == 0)
4847        STORECHAR(0xFEFF);
4848    if (size == 0)
4849        goto done;
4850
4851    if (byteorder == -1) {
4852        /* force LE */
4853        iorder[0] = 0;
4854        iorder[1] = 1;
4855        iorder[2] = 2;
4856        iorder[3] = 3;
4857    }
4858    else if (byteorder == 1) {
4859        /* force BE */
4860        iorder[0] = 3;
4861        iorder[1] = 2;
4862        iorder[2] = 1;
4863        iorder[3] = 0;
4864    }
4865
4866    while (size-- > 0) {
4867        Py_UCS4 ch = *s++;
4868#ifndef Py_UNICODE_WIDE
4869        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
4870            Py_UCS4 ch2 = *s;
4871            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
4872                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
4873                s++;
4874                size--;
4875            }
4876        }
4877#endif
4878        STORECHAR(ch);
4879    }
4880
4881  done:
4882    return v;
4883#undef STORECHAR
4884}
4885
4886PyObject *
4887PyUnicode_AsUTF32String(PyObject *unicode)
4888{
4889    if (!PyUnicode_Check(unicode)) {
4890        PyErr_BadArgument();
4891        return NULL;
4892    }
4893    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
4894                                 PyUnicode_GET_SIZE(unicode),
4895                                 NULL,
4896                                 0);
4897}
4898
4899/* --- UTF-16 Codec ------------------------------------------------------- */
4900
4901PyObject *
4902PyUnicode_DecodeUTF16(const char *s,
4903                      Py_ssize_t size,
4904                      const char *errors,
4905                      int *byteorder)
4906{
4907    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
4908}
4909
4910/* Two masks for fast checking of whether a C 'long' may contain
4911   UTF16-encoded surrogate characters. This is an efficient heuristic,
4912   assuming that non-surrogate characters with a code point >= 0x8000 are
4913   rare in most input.
4914   FAST_CHAR_MASK is used when the input is in native byte ordering,
4915   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
4916*/
4917#if (SIZEOF_LONG == 8)
4918# define FAST_CHAR_MASK         0x8000800080008000L
4919# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
4920#elif (SIZEOF_LONG == 4)
4921# define FAST_CHAR_MASK         0x80008000L
4922# define SWAPPED_FAST_CHAR_MASK 0x00800080L
4923#else
4924# error C 'long' size should be either 4 or 8!
4925#endif
4926
4927PyObject *
4928PyUnicode_DecodeUTF16Stateful(const char *s,
4929                              Py_ssize_t size,
4930                              const char *errors,
4931                              int *byteorder,
4932                              Py_ssize_t *consumed)
4933{
4934    const char *starts = s;
4935    Py_ssize_t startinpos;
4936    Py_ssize_t endinpos;
4937    Py_ssize_t outpos;
4938    PyUnicodeObject *unicode;
4939    Py_UNICODE *p;
4940    const unsigned char *q, *e, *aligned_end;
4941    int bo = 0;       /* assume native ordering by default */
4942    int native_ordering = 0;
4943    const char *errmsg = "";
4944    /* Offsets from q for retrieving byte pairs in the right order. */
4945#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4946    int ihi = 1, ilo = 0;
4947#else
4948    int ihi = 0, ilo = 1;
4949#endif
4950    PyObject *errorHandler = NULL;
4951    PyObject *exc = NULL;
4952
4953    /* Note: size will always be longer than the resulting Unicode
4954       character count */
4955    unicode = _PyUnicode_New(size);
4956    if (!unicode)
4957        return NULL;
4958    if (size == 0)
4959        return (PyObject *)unicode;
4960
4961    /* Unpack UTF-16 encoded data */
4962    p = PyUnicode_AS_UNICODE(unicode);
4963    q = (unsigned char *)s;
4964    e = q + size - 1;
4965
4966    if (byteorder)
4967        bo = *byteorder;
4968
4969    /* Check for BOM marks (U+FEFF) in the input and adjust current
4970       byte order setting accordingly. In native mode, the leading BOM
4971       mark is skipped, in all other modes, it is copied to the output
4972       stream as-is (giving a ZWNBSP character). */
4973    if (bo == 0) {
4974        if (size >= 2) {
4975            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
4976#ifdef BYTEORDER_IS_LITTLE_ENDIAN
4977            if (bom == 0xFEFF) {
4978                q += 2;
4979                bo = -1;
4980            }
4981            else if (bom == 0xFFFE) {
4982                q += 2;
4983                bo = 1;
4984            }
4985#else
4986            if (bom == 0xFEFF) {
4987                q += 2;
4988                bo = 1;
4989            }
4990            else if (bom == 0xFFFE) {
4991                q += 2;
4992                bo = -1;
4993            }
4994#endif
4995        }
4996    }
4997
4998    if (bo == -1) {
4999        /* force LE */
5000        ihi = 1;
5001        ilo = 0;
5002    }
5003    else if (bo == 1) {
5004        /* force BE */
5005        ihi = 0;
5006        ilo = 1;
5007    }
5008#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5009    native_ordering = ilo < ihi;
5010#else
5011    native_ordering = ilo > ihi;
5012#endif
5013
5014    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
5015    while (q < e) {
5016        Py_UNICODE ch;
5017        /* First check for possible aligned read of a C 'long'. Unaligned
5018           reads are more expensive, better to defer to another iteration. */
5019        if (!((size_t) q & LONG_PTR_MASK)) {
5020            /* Fast path for runs of non-surrogate chars. */
5021            register const unsigned char *_q = q;
5022            Py_UNICODE *_p = p;
5023            if (native_ordering) {
5024                /* Native ordering is simple: as long as the input cannot
5025                   possibly contain a surrogate char, do an unrolled copy
5026                   of several 16-bit code points to the target object.
5027                   The non-surrogate check is done on several input bytes
5028                   at a time (as many as a C 'long' can contain). */
5029                while (_q < aligned_end) {
5030                    unsigned long data = * (unsigned long *) _q;
5031                    if (data & FAST_CHAR_MASK)
5032                        break;
5033                    _p[0] = ((unsigned short *) _q)[0];
5034                    _p[1] = ((unsigned short *) _q)[1];
5035#if (SIZEOF_LONG == 8)
5036                    _p[2] = ((unsigned short *) _q)[2];
5037                    _p[3] = ((unsigned short *) _q)[3];
5038#endif
5039                    _q += SIZEOF_LONG;
5040                    _p += SIZEOF_LONG / 2;
5041                }
5042            }
5043            else {
5044                /* Byteswapped ordering is similar, but we must decompose
5045                   the copy bytewise, and take care of zero'ing out the
5046                   upper bytes if the target object is in 32-bit units
5047                   (that is, in UCS-4 builds). */
5048                while (_q < aligned_end) {
5049                    unsigned long data = * (unsigned long *) _q;
5050                    if (data & SWAPPED_FAST_CHAR_MASK)
5051                        break;
5052                    /* Zero upper bytes in UCS-4 builds */
5053#if (Py_UNICODE_SIZE > 2)
5054                    _p[0] = 0;
5055                    _p[1] = 0;
5056#if (SIZEOF_LONG == 8)
5057                    _p[2] = 0;
5058                    _p[3] = 0;
5059#endif
5060#endif
5061                    /* Issue #4916; UCS-4 builds on big endian machines must
5062                       fill the two last bytes of each 4-byte unit. */
5063#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
5064# define OFF 2
5065#else
5066# define OFF 0
5067#endif
5068                    ((unsigned char *) _p)[OFF + 1] = _q[0];
5069                    ((unsigned char *) _p)[OFF + 0] = _q[1];
5070                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
5071                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
5072#if (SIZEOF_LONG == 8)
5073                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
5074                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
5075                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
5076                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
5077#endif
5078#undef OFF
5079                    _q += SIZEOF_LONG;
5080                    _p += SIZEOF_LONG / 2;
5081                }
5082            }
5083            p = _p;
5084            q = _q;
5085            if (q >= e)
5086                break;
5087        }
5088        ch = (q[ihi] << 8) | q[ilo];
5089
5090        q += 2;
5091
5092        if (ch < 0xD800 || ch > 0xDFFF) {
5093            *p++ = ch;
5094            continue;
5095        }
5096
5097        /* UTF-16 code pair: */
5098        if (q > e) {
5099            errmsg = "unexpected end of data";
5100            startinpos = (((const char *)q) - 2) - starts;
5101            endinpos = ((const char *)e) + 1 - starts;
5102            goto utf16Error;
5103        }
5104        if (0xD800 <= ch && ch <= 0xDBFF) {
5105            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
5106            q += 2;
5107            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
5108#ifndef Py_UNICODE_WIDE
5109                *p++ = ch;
5110                *p++ = ch2;
5111#else
5112                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
5113#endif
5114                continue;
5115            }
5116            else {
5117                errmsg = "illegal UTF-16 surrogate";
5118                startinpos = (((const char *)q)-4)-starts;
5119                endinpos = startinpos+2;
5120                goto utf16Error;
5121            }
5122
5123        }
5124        errmsg = "illegal encoding";
5125        startinpos = (((const char *)q)-2)-starts;
5126        endinpos = startinpos+2;
5127        /* Fall through to report the error */
5128
5129      utf16Error:
5130        outpos = p - PyUnicode_AS_UNICODE(unicode);
5131        if (unicode_decode_call_errorhandler(
5132                errors,
5133                &errorHandler,
5134                "utf16", errmsg,
5135                &starts,
5136                (const char **)&e,
5137                &startinpos,
5138                &endinpos,
5139                &exc,
5140                (const char **)&q,
5141                &unicode,
5142                &outpos,
5143                &p))
5144            goto onError;
5145    }
5146    /* remaining byte at the end? (size should be even) */
5147    if (e == q) {
5148        if (!consumed) {
5149            errmsg = "truncated data";
5150            startinpos = ((const char *)q) - starts;
5151            endinpos = ((const char *)e) + 1 - starts;
5152            outpos = p - PyUnicode_AS_UNICODE(unicode);
5153            if (unicode_decode_call_errorhandler(
5154                    errors,
5155                    &errorHandler,
5156                    "utf16", errmsg,
5157                    &starts,
5158                    (const char **)&e,
5159                    &startinpos,
5160                    &endinpos,
5161                    &exc,
5162                    (const char **)&q,
5163                    &unicode,
5164                    &outpos,
5165                    &p))
5166                goto onError;
5167            /* The remaining input chars are ignored if the callback
5168               chooses to skip the input */
5169        }
5170    }
5171
5172    if (byteorder)
5173        *byteorder = bo;
5174
5175    if (consumed)
5176        *consumed = (const char *)q-starts;
5177
5178    /* Adjust length */
5179    if (PyUnicode_Resize((PyObject**)&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
5180        goto onError;
5181
5182    Py_XDECREF(errorHandler);
5183    Py_XDECREF(exc);
5184#ifndef DONT_MAKE_RESULT_READY
5185    if (_PyUnicode_READY_REPLACE(&unicode)) {
5186        Py_DECREF(unicode);
5187        return NULL;
5188    }
5189#endif
5190    return (PyObject *)unicode;
5191
5192  onError:
5193    Py_DECREF(unicode);
5194    Py_XDECREF(errorHandler);
5195    Py_XDECREF(exc);
5196    return NULL;
5197}
5198
5199#undef FAST_CHAR_MASK
5200#undef SWAPPED_FAST_CHAR_MASK
5201
5202PyObject *
5203PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5204                      Py_ssize_t size,
5205                      const char *errors,
5206                      int byteorder)
5207{
5208    PyObject *v;
5209    unsigned char *p;
5210    Py_ssize_t nsize, bytesize;
5211#ifdef Py_UNICODE_WIDE
5212    Py_ssize_t i, pairs;
5213#else
5214    const int pairs = 0;
5215#endif
5216    /* Offsets from p for storing byte pairs in the right order. */
5217#ifdef BYTEORDER_IS_LITTLE_ENDIAN
5218    int ihi = 1, ilo = 0;
5219#else
5220    int ihi = 0, ilo = 1;
5221#endif
5222
5223#define STORECHAR(CH)                           \
5224    do {                                        \
5225        p[ihi] = ((CH) >> 8) & 0xff;            \
5226        p[ilo] = (CH) & 0xff;                   \
5227        p += 2;                                 \
5228    } while(0)
5229
5230#ifdef Py_UNICODE_WIDE
5231    for (i = pairs = 0; i < size; i++)
5232        if (s[i] >= 0x10000)
5233            pairs++;
5234#endif
5235    /* 2 * (size + pairs + (byteorder == 0)) */
5236    if (size > PY_SSIZE_T_MAX ||
5237        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
5238        return PyErr_NoMemory();
5239    nsize = size + pairs + (byteorder == 0);
5240    bytesize = nsize * 2;
5241    if (bytesize / 2 != nsize)
5242        return PyErr_NoMemory();
5243    v = PyBytes_FromStringAndSize(NULL, bytesize);
5244    if (v == NULL)
5245        return NULL;
5246
5247    p = (unsigned char *)PyBytes_AS_STRING(v);
5248    if (byteorder == 0)
5249        STORECHAR(0xFEFF);
5250    if (size == 0)
5251        goto done;
5252
5253    if (byteorder == -1) {
5254        /* force LE */
5255        ihi = 1;
5256        ilo = 0;
5257    }
5258    else if (byteorder == 1) {
5259        /* force BE */
5260        ihi = 0;
5261        ilo = 1;
5262    }
5263
5264    while (size-- > 0) {
5265        Py_UNICODE ch = *s++;
5266        Py_UNICODE ch2 = 0;
5267#ifdef Py_UNICODE_WIDE
5268        if (ch >= 0x10000) {
5269            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
5270            ch  = 0xD800 | ((ch-0x10000) >> 10);
5271        }
5272#endif
5273        STORECHAR(ch);
5274        if (ch2)
5275            STORECHAR(ch2);
5276    }
5277
5278  done:
5279    return v;
5280#undef STORECHAR
5281}
5282
5283PyObject *
5284PyUnicode_AsUTF16String(PyObject *unicode)
5285{
5286    if (!PyUnicode_Check(unicode)) {
5287        PyErr_BadArgument();
5288        return NULL;
5289    }
5290    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
5291                                 PyUnicode_GET_SIZE(unicode),
5292                                 NULL,
5293                                 0);
5294}
5295
5296/* --- Unicode Escape Codec ----------------------------------------------- */
5297
/* Helper for PyUnicode_DecodeUnicodeEscape: pre-scan an escaped string.

   Returns the number of characters the decoded string will contain when
   the input is pure ASCII and uses only escapes whose result is known to
   be ASCII.  Returns -1 when that cannot be guaranteed: a negative size,
   a byte above 127, a backslash at the very end, or an octal, \x, \u, \U
   or \N escape. */
Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t pos = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (pos < size) {
        unsigned char cur = bytes[pos++];

        if (cur > 127)
            return -1;              /* non-ASCII input byte */

        if (cur != '\\') {
            count++;                /* ordinary character */
            continue;
        }

        /* A backslash: the escape is identified by the following byte,
           which must exist and be ASCII as well. */
        if (pos >= size)
            return -1;
        cur = bytes[pos++];
        if (cur > 127)
            return -1;

        switch (cur) {
        case '\n':
            /* backslash-newline contributes no character at all */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* simple escape: exactly one character */
            count++;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* numeric and named escapes may leave the ASCII range */
            return -1;
        default:
            /* unknown escape: both the backslash and the byte are kept */
            count += 2;
            break;
        }
    }
    return count;
}
5352
/* Similar to PyUnicode_WRITE but either write into wstr field
   or treat string as ASCII.

   kind  - PyUnicode_WCHAR_KIND selects a Py_UNICODE buffer; any other
           value selects an unsigned char buffer.
   buf   - destination buffer.
   index - element index to store at.
   value - value to store, cast to the element type. */
#define WRITE_ASCII_OR_WSTR(kind, buf, index, value) \
    do { \
        if ((kind) != PyUnicode_WCHAR_KIND) \
            ((unsigned char *)(buf))[(index)] = (unsigned char)(value); \
        else \
            ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value); \
    } while (0)

/* Store `value` at `index` of a Py_UNICODE buffer.  Asserts on a variable
   named `kind` that must be in scope at the point of use.  The expansion
   is a single parenthesized comma expression, so it stays one unit when
   it is embedded in a larger expression. */
#define WRITE_WSTR(buf, index, value) \
    (assert(kind == PyUnicode_WCHAR_KIND), \
     ((Py_UNICODE *)(buf))[(index)] = (Py_UNICODE)(value))
5366
5367
5368static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5369
5370PyObject *
5371PyUnicode_DecodeUnicodeEscape(const char *s,
5372                              Py_ssize_t size,
5373                              const char *errors)
5374{
5375    const char *starts = s;
5376    Py_ssize_t startinpos;
5377    Py_ssize_t endinpos;
5378    int j;
5379    PyUnicodeObject *v;
5380    Py_UNICODE *p;
5381    const char *end;
5382    char* message;
5383    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5384    PyObject *errorHandler = NULL;
5385    PyObject *exc = NULL;
5386    Py_ssize_t ascii_length;
5387    Py_ssize_t i;
5388    int kind;
5389    void *data;
5390
5391    ascii_length = length_of_escaped_ascii_string(s, size);
5392
5393    /* After length_of_escaped_ascii_string() there are two alternatives,
5394       either the string is pure ASCII with named escapes like \n, etc.
5395       and we determined it's exact size (common case)
5396       or it contains \x, \u, ... escape sequences.  then we create a
5397       legacy wchar string and resize it at the end of this function. */
5398    if (ascii_length >= 0) {
5399        v = (PyUnicodeObject *)PyUnicode_New(ascii_length, 127);
5400        if (!v)
5401            goto onError;
5402        assert(PyUnicode_KIND(v) == PyUnicode_1BYTE_KIND);
5403        kind = PyUnicode_1BYTE_KIND;
5404        data = PyUnicode_DATA(v);
5405    }
5406    else {
5407        /* Escaped strings will always be longer than the resulting
5408           Unicode string, so we start with size here and then reduce the
5409           length after conversion to the true value.
5410           (but if the error callback returns a long replacement string
5411           we'll have to allocate more space) */
5412        v = _PyUnicode_New(size);
5413        if (!v)
5414            goto onError;
5415        kind = PyUnicode_WCHAR_KIND;
5416        data = PyUnicode_AS_UNICODE(v);
5417    }
5418
5419    if (size == 0)
5420        return (PyObject *)v;
5421    i = 0;
5422    end = s + size;
5423
5424    while (s < end) {
5425        unsigned char c;
5426        Py_UNICODE x;
5427        int digits;
5428
5429        if (kind == PyUnicode_WCHAR_KIND) {
5430            assert(i < _PyUnicode_WSTR_LENGTH(v));
5431        }
5432        else {
5433            /* The only case in which i == ascii_length is a backslash
5434               followed by a newline. */
5435            assert(i <= ascii_length);
5436        }
5437
5438        /* Non-escape characters are interpreted as Unicode ordinals */
5439        if (*s != '\\') {
5440            WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char) *s++);
5441            continue;
5442        }
5443
5444        startinpos = s-starts;
5445        /* \ - Escapes */
5446        s++;
5447        c = *s++;
5448        if (s > end)
5449            c = '\0'; /* Invalid after \ */
5450
5451        if (kind == PyUnicode_WCHAR_KIND) {
5452            assert(i < _PyUnicode_WSTR_LENGTH(v));
5453        }
5454        else {
5455            /* The only case in which i == ascii_length is a backslash
5456               followed by a newline. */
5457            assert(i < ascii_length || (i == ascii_length && c == '\n'));
5458        }
5459
5460        switch (c) {
5461
5462            /* \x escapes */
5463        case '\n': break;
5464        case '\\': WRITE_ASCII_OR_WSTR(kind, data, i++, '\\'); break;
5465        case '\'': WRITE_ASCII_OR_WSTR(kind, data, i++, '\''); break;
5466        case '\"': WRITE_ASCII_OR_WSTR(kind, data, i++, '\"'); break;
5467        case 'b': WRITE_ASCII_OR_WSTR(kind, data, i++, '\b'); break;
5468        /* FF */
5469        case 'f': WRITE_ASCII_OR_WSTR(kind, data, i++, '\014'); break;
5470        case 't': WRITE_ASCII_OR_WSTR(kind, data, i++, '\t'); break;
5471        case 'n': WRITE_ASCII_OR_WSTR(kind, data, i++, '\n'); break;
5472        case 'r': WRITE_ASCII_OR_WSTR(kind, data, i++, '\r'); break;
5473        /* VT */
5474        case 'v': WRITE_ASCII_OR_WSTR(kind, data, i++, '\013'); break;
5475        /* BEL, not classic C */
5476        case 'a': WRITE_ASCII_OR_WSTR(kind, data, i++, '\007'); break;
5477
5478            /* \OOO (octal) escapes */
5479        case '0': case '1': case '2': case '3':
5480        case '4': case '5': case '6': case '7':
5481            x = s[-1] - '0';
5482            if (s < end && '0' <= *s && *s <= '7') {
5483                x = (x<<3) + *s++ - '0';
5484                if (s < end && '0' <= *s && *s <= '7')
5485                    x = (x<<3) + *s++ - '0';
5486            }
5487            WRITE_WSTR(data, i++, x);
5488            break;
5489
5490            /* hex escapes */
5491            /* \xXX */
5492        case 'x':
5493            digits = 2;
5494            message = "truncated \\xXX escape";
5495            goto hexescape;
5496
5497            /* \uXXXX */
5498        case 'u':
5499            digits = 4;
5500            message = "truncated \\uXXXX escape";
5501            goto hexescape;
5502
5503            /* \UXXXXXXXX */
5504        case 'U':
5505            digits = 8;
5506            message = "truncated \\UXXXXXXXX escape";
5507        hexescape:
5508            chr = 0;
5509            p = PyUnicode_AS_UNICODE(v) + i;
5510            if (s+digits>end) {
5511                endinpos = size;
5512                if (unicode_decode_call_errorhandler(
5513                        errors, &errorHandler,
5514                        "unicodeescape", "end of string in escape sequence",
5515                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5516                        &v, &i, &p))
5517                    goto onError;
5518                data = PyUnicode_AS_UNICODE(v);
5519                goto nextByte;
5520            }
5521            for (j = 0; j < digits; ++j) {
5522                c = (unsigned char) s[j];
5523                if (!Py_ISXDIGIT(c)) {
5524                    endinpos = (s+j+1)-starts;
5525                    p = PyUnicode_AS_UNICODE(v) + i;
5526                    if (unicode_decode_call_errorhandler(
5527                            errors, &errorHandler,
5528                            "unicodeescape", message,
5529                            &starts, &end, &startinpos, &endinpos, &exc, &s,
5530                            &v, &i, &p))
5531                        goto onError;
5532                    data = PyUnicode_AS_UNICODE(v);
5533                    goto nextByte;
5534                }
5535                chr = (chr<<4) & ~0xF;
5536                if (c >= '0' && c <= '9')
5537                    chr += c - '0';
5538                else if (c >= 'a' && c <= 'f')
5539                    chr += 10 + c - 'a';
5540                else
5541                    chr += 10 + c - 'A';
5542            }
5543            s += j;
5544            if (chr == 0xffffffff && PyErr_Occurred())
5545                /* _decoding_error will have already written into the
5546                   target buffer. */
5547                break;
5548        store:
5549            /* when we get here, chr is a 32-bit unicode character */
5550            if (chr <= 0xffff)
5551                /* UCS-2 character */
5552                WRITE_WSTR(data, i++, chr);
5553            else if (chr <= 0x10ffff) {
5554                /* UCS-4 character. Either store directly, or as
5555                   surrogate pair. */
5556#ifdef Py_UNICODE_WIDE
5557                WRITE_WSTR(data, i++, chr);
5558#else
5559                chr -= 0x10000L;
5560                WRITE_WSTR(data, i++, 0xD800 + (Py_UNICODE) (chr >> 10));
5561                WRITE_WSTR(data, i++, 0xDC00 + (Py_UNICODE) (chr & 0x03FF));
5562#endif
5563            } else {
5564                endinpos = s-starts;
5565                p = PyUnicode_AS_UNICODE(v) + i;
5566                if (unicode_decode_call_errorhandler(
5567                        errors, &errorHandler,
5568                        "unicodeescape", "illegal Unicode character",
5569                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5570                        &v, &i, &p))
5571                    goto onError;
5572                data = PyUnicode_AS_UNICODE(v);
5573            }
5574            break;
5575
5576            /* \N{name} */
5577        case 'N':
5578            message = "malformed \\N character escape";
5579            if (ucnhash_CAPI == NULL) {
5580                /* load the unicode data module */
5581                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5582                                                PyUnicodeData_CAPSULE_NAME, 1);
5583                if (ucnhash_CAPI == NULL)
5584                    goto ucnhashError;
5585            }
5586            if (*s == '{') {
5587                const char *start = s+1;
5588                /* look for the closing brace */
5589                while (*s != '}' && s < end)
5590                    s++;
5591                if (s > start && s < end && *s == '}') {
5592                    /* found a name.  look it up in the unicode database */
5593                    message = "unknown Unicode character name";
5594                    s++;
5595                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5596                                              &chr))
5597                        goto store;
5598                }
5599            }
5600            endinpos = s-starts;
5601            p = PyUnicode_AS_UNICODE(v) + i;
5602            if (unicode_decode_call_errorhandler(
5603                    errors, &errorHandler,
5604                    "unicodeescape", message,
5605                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5606                    &v, &i, &p))
5607                goto onError;
5608            data = PyUnicode_AS_UNICODE(v);
5609            break;
5610
5611        default:
5612            if (s > end) {
5613                assert(kind == PyUnicode_WCHAR_KIND);
5614                message = "\\ at end of string";
5615                s--;
5616                endinpos = s-starts;
5617                p = PyUnicode_AS_UNICODE(v) + i;
5618                if (unicode_decode_call_errorhandler(
5619                        errors, &errorHandler,
5620                        "unicodeescape", message,
5621                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5622                        &v, &i, &p))
5623                    goto onError;
5624                data = PyUnicode_AS_UNICODE(v);
5625            }
5626            else {
5627                WRITE_ASCII_OR_WSTR(kind, data, i++, '\\');
5628                WRITE_ASCII_OR_WSTR(kind, data, i++, (unsigned char)s[-1]);
5629            }
5630            break;
5631        }
5632      nextByte:
5633        ;
5634    }
5635    /* Ensure the length prediction worked in case of ASCII strings */
5636    assert(kind == PyUnicode_WCHAR_KIND || i == ascii_length);
5637
5638    if (kind == PyUnicode_WCHAR_KIND)
5639    {
5640        if (PyUnicode_Resize((PyObject**)&v, i) < 0)
5641            goto onError;
5642    }
5643    Py_XDECREF(errorHandler);
5644    Py_XDECREF(exc);
5645#ifndef DONT_MAKE_RESULT_READY
5646    if (_PyUnicode_READY_REPLACE(&v)) {
5647        Py_DECREF(v);
5648        return NULL;
5649    }
5650#endif
5651    return (PyObject *)v;
5652
5653  ucnhashError:
5654    PyErr_SetString(
5655        PyExc_UnicodeError,
5656        "\\N escapes not supported (can't load unicodedata module)"
5657        );
5658    Py_XDECREF(v);
5659    Py_XDECREF(errorHandler);
5660    Py_XDECREF(exc);
5661    return NULL;
5662
5663  onError:
5664    Py_XDECREF(v);
5665    Py_XDECREF(errorHandler);
5666    Py_XDECREF(exc);
5667    return NULL;
5668}
5669
5670#undef WRITE_ASCII_OR_WSTR
5671#undef WRITE_WSTR
5672
5673/* Return a Unicode-Escape string version of the Unicode object.
5674
5675   If quotes is true, the string is enclosed in u"" or u'' quotes as
5676   appropriate.
5677
5678*/
5679
5680static const char *hexdigits = "0123456789abcdef";
5681
5682PyObject *
5683PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5684                              Py_ssize_t size)
5685{
5686    PyObject *repr;
5687    char *p;
5688
5689#ifdef Py_UNICODE_WIDE
5690    const Py_ssize_t expandsize = 10;
5691#else
5692    const Py_ssize_t expandsize = 6;
5693#endif
5694
5695    /* XXX(nnorwitz): rather than over-allocating, it would be
5696       better to choose a different scheme.  Perhaps scan the
5697       first N-chars of the string and allocate based on that size.
5698    */
5699    /* Initial allocation is based on the longest-possible unichr
5700       escape.
5701
5702       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
5703       unichr, so in this case it's the longest unichr escape. In
5704       narrow (UTF-16) builds this is five chars per source unichr
5705       since there are two unichrs in the surrogate pair, so in narrow
5706       (UTF-16) builds it's not the longest unichr escape.
5707
5708       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
5709       so in the narrow (UTF-16) build case it's the longest unichr
5710       escape.
5711    */
5712
5713    if (size == 0)
5714        return PyBytes_FromStringAndSize(NULL, 0);
5715
5716    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5717        return PyErr_NoMemory();
5718
5719    repr = PyBytes_FromStringAndSize(NULL,
5720                                     2
5721                                     + expandsize*size
5722                                     + 1);
5723    if (repr == NULL)
5724        return NULL;
5725
5726    p = PyBytes_AS_STRING(repr);
5727
5728    while (size-- > 0) {
5729        Py_UNICODE ch = *s++;
5730
5731        /* Escape backslashes */
5732        if (ch == '\\') {
5733            *p++ = '\\';
5734            *p++ = (char) ch;
5735            continue;
5736        }
5737
5738#ifdef Py_UNICODE_WIDE
5739        /* Map 21-bit characters to '\U00xxxxxx' */
5740        else if (ch >= 0x10000) {
5741            *p++ = '\\';
5742            *p++ = 'U';
5743            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
5744            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
5745            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
5746            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
5747            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
5748            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
5749            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
5750            *p++ = hexdigits[ch & 0x0000000F];
5751            continue;
5752        }
5753#else
5754        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
5755        else if (ch >= 0xD800 && ch < 0xDC00) {
5756            Py_UNICODE ch2;
5757            Py_UCS4 ucs;
5758
5759            ch2 = *s++;
5760            size--;
5761            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
5762                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
5763                *p++ = '\\';
5764                *p++ = 'U';
5765                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
5766                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
5767                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
5768                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
5769                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
5770                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
5771                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
5772                *p++ = hexdigits[ucs & 0x0000000F];
5773                continue;
5774            }
5775            /* Fall through: isolated surrogates are copied as-is */
5776            s--;
5777            size++;
5778        }
5779#endif
5780
5781        /* Map 16-bit characters to '\uxxxx' */
5782        if (ch >= 256) {
5783            *p++ = '\\';
5784            *p++ = 'u';
5785            *p++ = hexdigits[(ch >> 12) & 0x000F];
5786            *p++ = hexdigits[(ch >> 8) & 0x000F];
5787            *p++ = hexdigits[(ch >> 4) & 0x000F];
5788            *p++ = hexdigits[ch & 0x000F];
5789        }
5790
5791        /* Map special whitespace to '\t', \n', '\r' */
5792        else if (ch == '\t') {
5793            *p++ = '\\';
5794            *p++ = 't';
5795        }
5796        else if (ch == '\n') {
5797            *p++ = '\\';
5798            *p++ = 'n';
5799        }
5800        else if (ch == '\r') {
5801            *p++ = '\\';
5802            *p++ = 'r';
5803        }
5804
5805        /* Map non-printable US ASCII to '\xhh' */
5806        else if (ch < ' ' || ch >= 0x7F) {
5807            *p++ = '\\';
5808            *p++ = 'x';
5809            *p++ = hexdigits[(ch >> 4) & 0x000F];
5810            *p++ = hexdigits[ch & 0x000F];
5811        }
5812
5813        /* Copy everything else as-is */
5814        else
5815            *p++ = (char) ch;
5816    }
5817
5818    assert(p - PyBytes_AS_STRING(repr) > 0);
5819    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5820        return NULL;
5821    return repr;
5822}
5823
5824PyObject *
5825PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5826{
5827    PyObject *s;
5828    if (!PyUnicode_Check(unicode)) {
5829        PyErr_BadArgument();
5830        return NULL;
5831    }
5832    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
5833                                      PyUnicode_GET_SIZE(unicode));
5834    return s;
5835}
5836
5837/* --- Raw Unicode Escape Codec ------------------------------------------- */
5838
5839PyObject *
5840PyUnicode_DecodeRawUnicodeEscape(const char *s,
5841                                 Py_ssize_t size,
5842                                 const char *errors)
5843{
5844    const char *starts = s;
5845    Py_ssize_t startinpos;
5846    Py_ssize_t endinpos;
5847    Py_ssize_t outpos;
5848    PyUnicodeObject *v;
5849    Py_UNICODE *p;
5850    const char *end;
5851    const char *bs;
5852    PyObject *errorHandler = NULL;
5853    PyObject *exc = NULL;
5854
5855    /* Escaped strings will always be longer than the resulting
5856       Unicode string, so we start with size here and then reduce the
5857       length after conversion to the true value. (But decoding error
5858       handler might have to resize the string) */
5859    v = _PyUnicode_New(size);
5860    if (v == NULL)
5861        goto onError;
5862    if (size == 0)
5863        return (PyObject *)v;
5864    p = PyUnicode_AS_UNICODE(v);
5865    end = s + size;
5866    while (s < end) {
5867        unsigned char c;
5868        Py_UCS4 x;
5869        int i;
5870        int count;
5871
5872        /* Non-escape characters are interpreted as Unicode ordinals */
5873        if (*s != '\\') {
5874            *p++ = (unsigned char)*s++;
5875            continue;
5876        }
5877        startinpos = s-starts;
5878
5879        /* \u-escapes are only interpreted iff the number of leading
5880           backslashes if odd */
5881        bs = s;
5882        for (;s < end;) {
5883            if (*s != '\\')
5884                break;
5885            *p++ = (unsigned char)*s++;
5886        }
5887        if (((s - bs) & 1) == 0 ||
5888            s >= end ||
5889            (*s != 'u' && *s != 'U')) {
5890            continue;
5891        }
5892        p--;
5893        count = *s=='u' ? 4 : 8;
5894        s++;
5895
5896        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5897        outpos = p-PyUnicode_AS_UNICODE(v);
5898        for (x = 0, i = 0; i < count; ++i, ++s) {
5899            c = (unsigned char)*s;
5900            if (!Py_ISXDIGIT(c)) {
5901                endinpos = s-starts;
5902                if (unicode_decode_call_errorhandler(
5903                        errors, &errorHandler,
5904                        "rawunicodeescape", "truncated \\uXXXX",
5905                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5906                        &v, &outpos, &p))
5907                    goto onError;
5908                goto nextByte;
5909            }
5910            x = (x<<4) & ~0xF;
5911            if (c >= '0' && c <= '9')
5912                x += c - '0';
5913            else if (c >= 'a' && c <= 'f')
5914                x += 10 + c - 'a';
5915            else
5916                x += 10 + c - 'A';
5917        }
5918        if (x <= 0xffff)
5919            /* UCS-2 character */
5920            *p++ = (Py_UNICODE) x;
5921        else if (x <= 0x10ffff) {
5922            /* UCS-4 character. Either store directly, or as
5923               surrogate pair. */
5924#ifdef Py_UNICODE_WIDE
5925            *p++ = (Py_UNICODE) x;
5926#else
5927            x -= 0x10000L;
5928            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
5929            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
5930#endif
5931        } else {
5932            endinpos = s-starts;
5933            outpos = p-PyUnicode_AS_UNICODE(v);
5934            if (unicode_decode_call_errorhandler(
5935                    errors, &errorHandler,
5936                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5937                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5938                    &v, &outpos, &p))
5939                goto onError;
5940        }
5941      nextByte:
5942        ;
5943    }
5944    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5945        goto onError;
5946    Py_XDECREF(errorHandler);
5947    Py_XDECREF(exc);
5948#ifndef DONT_MAKE_RESULT_READY
5949    if (_PyUnicode_READY_REPLACE(&v)) {
5950        Py_DECREF(v);
5951        return NULL;
5952    }
5953#endif
5954    return (PyObject *)v;
5955
5956  onError:
5957    Py_XDECREF(v);
5958    Py_XDECREF(errorHandler);
5959    Py_XDECREF(exc);
5960    return NULL;
5961}
5962
5963PyObject *
5964PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
5965                                 Py_ssize_t size)
5966{
5967    PyObject *repr;
5968    char *p;
5969    char *q;
5970
5971#ifdef Py_UNICODE_WIDE
5972    const Py_ssize_t expandsize = 10;
5973#else
5974    const Py_ssize_t expandsize = 6;
5975#endif
5976
5977    if (size > PY_SSIZE_T_MAX / expandsize)
5978        return PyErr_NoMemory();
5979
5980    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
5981    if (repr == NULL)
5982        return NULL;
5983    if (size == 0)
5984        return repr;
5985
5986    p = q = PyBytes_AS_STRING(repr);
5987    while (size-- > 0) {
5988        Py_UNICODE ch = *s++;
5989#ifdef Py_UNICODE_WIDE
5990        /* Map 32-bit characters to '\Uxxxxxxxx' */
5991        if (ch >= 0x10000) {
5992            *p++ = '\\';
5993            *p++ = 'U';
5994            *p++ = hexdigits[(ch >> 28) & 0xf];
5995            *p++ = hexdigits[(ch >> 24) & 0xf];
5996            *p++ = hexdigits[(ch >> 20) & 0xf];
5997            *p++ = hexdigits[(ch >> 16) & 0xf];
5998            *p++ = hexdigits[(ch >> 12) & 0xf];
5999            *p++ = hexdigits[(ch >> 8) & 0xf];
6000            *p++ = hexdigits[(ch >> 4) & 0xf];
6001            *p++ = hexdigits[ch & 15];
6002        }
6003        else
6004#else
6005            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
6006            if (ch >= 0xD800 && ch < 0xDC00) {
6007                Py_UNICODE ch2;
6008                Py_UCS4 ucs;
6009
6010                ch2 = *s++;
6011                size--;
6012                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
6013                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
6014                    *p++ = '\\';
6015                    *p++ = 'U';
6016                    *p++ = hexdigits[(ucs >> 28) & 0xf];
6017                    *p++ = hexdigits[(ucs >> 24) & 0xf];
6018                    *p++ = hexdigits[(ucs >> 20) & 0xf];
6019                    *p++ = hexdigits[(ucs >> 16) & 0xf];
6020                    *p++ = hexdigits[(ucs >> 12) & 0xf];
6021                    *p++ = hexdigits[(ucs >> 8) & 0xf];
6022                    *p++ = hexdigits[(ucs >> 4) & 0xf];
6023                    *p++ = hexdigits[ucs & 0xf];
6024                    continue;
6025                }
6026                /* Fall through: isolated surrogates are copied as-is */
6027                s--;
6028                size++;
6029            }
6030#endif
6031        /* Map 16-bit characters to '\uxxxx' */
6032        if (ch >= 256) {
6033            *p++ = '\\';
6034            *p++ = 'u';
6035            *p++ = hexdigits[(ch >> 12) & 0xf];
6036            *p++ = hexdigits[(ch >> 8) & 0xf];
6037            *p++ = hexdigits[(ch >> 4) & 0xf];
6038            *p++ = hexdigits[ch & 15];
6039        }
6040        /* Copy everything else as-is */
6041        else
6042            *p++ = (char) ch;
6043    }
6044    size = p - q;
6045
6046    assert(size > 0);
6047    if (_PyBytes_Resize(&repr, size) < 0)
6048        return NULL;
6049    return repr;
6050}
6051
6052PyObject *
6053PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6054{
6055    PyObject *s;
6056    if (!PyUnicode_Check(unicode)) {
6057        PyErr_BadArgument();
6058        return NULL;
6059    }
6060    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
6061                                         PyUnicode_GET_SIZE(unicode));
6062
6063    return s;
6064}
6065
6066/* --- Unicode Internal Codec ------------------------------------------- */
6067
6068PyObject *
6069_PyUnicode_DecodeUnicodeInternal(const char *s,
6070                                 Py_ssize_t size,
6071                                 const char *errors)
6072{
6073    const char *starts = s;
6074    Py_ssize_t startinpos;
6075    Py_ssize_t endinpos;
6076    Py_ssize_t outpos;
6077    PyUnicodeObject *v;
6078    Py_UNICODE *p;
6079    const char *end;
6080    const char *reason;
6081    PyObject *errorHandler = NULL;
6082    PyObject *exc = NULL;
6083
6084#ifdef Py_UNICODE_WIDE
6085    Py_UNICODE unimax = PyUnicode_GetMax();
6086#endif
6087
6088    /* XXX overflow detection missing */
6089    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
6090    if (v == NULL)
6091        goto onError;
6092    /* Intentionally PyUnicode_GET_SIZE instead of PyUnicode_GET_LENGTH
6093       as string was created with the old API. */
6094    if (PyUnicode_GET_SIZE(v) == 0)
6095        return (PyObject *)v;
6096    p = PyUnicode_AS_UNICODE(v);
6097    end = s + size;
6098
6099    while (s < end) {
6100        memcpy(p, s, sizeof(Py_UNICODE));
6101        /* We have to sanity check the raw data, otherwise doom looms for
6102           some malformed UCS-4 data. */
6103        if (
6104#ifdef Py_UNICODE_WIDE
6105            *p > unimax || *p < 0 ||
6106#endif
6107            end-s < Py_UNICODE_SIZE
6108            )
6109        {
6110            startinpos = s - starts;
6111            if (end-s < Py_UNICODE_SIZE) {
6112                endinpos = end-starts;
6113                reason = "truncated input";
6114            }
6115            else {
6116                endinpos = s - starts + Py_UNICODE_SIZE;
6117                reason = "illegal code point (> 0x10FFFF)";
6118            }
6119            outpos = p - PyUnicode_AS_UNICODE(v);
6120            if (unicode_decode_call_errorhandler(
6121                    errors, &errorHandler,
6122                    "unicode_internal", reason,
6123                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6124                    &v, &outpos, &p)) {
6125                goto onError;
6126            }
6127        }
6128        else {
6129            p++;
6130            s += Py_UNICODE_SIZE;
6131        }
6132    }
6133
6134    if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
6135        goto onError;
6136    Py_XDECREF(errorHandler);
6137    Py_XDECREF(exc);
6138#ifndef DONT_MAKE_RESULT_READY
6139    if (_PyUnicode_READY_REPLACE(&v)) {
6140        Py_DECREF(v);
6141        return NULL;
6142    }
6143#endif
6144    return (PyObject *)v;
6145
6146  onError:
6147    Py_XDECREF(v);
6148    Py_XDECREF(errorHandler);
6149    Py_XDECREF(exc);
6150    return NULL;
6151}
6152
6153/* --- Latin-1 Codec ------------------------------------------------------ */
6154
6155PyObject *
6156PyUnicode_DecodeLatin1(const char *s,
6157                       Py_ssize_t size,
6158                       const char *errors)
6159{
6160    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6161    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6162}
6163
6164/* create or adjust a UnicodeEncodeError */
6165static void
6166make_encode_exception(PyObject **exceptionObject,
6167                      const char *encoding,
6168                      const Py_UNICODE *unicode, Py_ssize_t size,
6169                      Py_ssize_t startpos, Py_ssize_t endpos,
6170                      const char *reason)
6171{
6172    if (*exceptionObject == NULL) {
6173        *exceptionObject = PyUnicodeEncodeError_Create(
6174            encoding, unicode, size, startpos, endpos, reason);
6175    }
6176    else {
6177        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6178            goto onError;
6179        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6180            goto onError;
6181        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6182            goto onError;
6183        return;
6184      onError:
6185        Py_DECREF(*exceptionObject);
6186        *exceptionObject = NULL;
6187    }
6188}
6189
6190/* raises a UnicodeEncodeError */
6191static void
6192raise_encode_exception(PyObject **exceptionObject,
6193                       const char *encoding,
6194                       const Py_UNICODE *unicode, Py_ssize_t size,
6195                       Py_ssize_t startpos, Py_ssize_t endpos,
6196                       const char *reason)
6197{
6198    make_encode_exception(exceptionObject,
6199                          encoding, unicode, size, startpos, endpos, reason);
6200    if (*exceptionObject != NULL)
6201        PyCodec_StrictErrors(*exceptionObject);
6202}
6203
6204/* error handling callback helper:
6205   build arguments, call the callback and check the arguments,
6206   put the result into newpos and return the replacement string, which
6207   has to be freed by the caller */
6208static PyObject *
6209unicode_encode_call_errorhandler(const char *errors,
6210                                 PyObject **errorHandler,
6211                                 const char *encoding, const char *reason,
6212                                 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
6213                                 Py_ssize_t startpos, Py_ssize_t endpos,
6214                                 Py_ssize_t *newpos)
6215{
6216    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6217
6218    PyObject *restuple;
6219    PyObject *resunicode;
6220
6221    if (*errorHandler == NULL) {
6222        *errorHandler = PyCodec_LookupError(errors);
6223        if (*errorHandler == NULL)
6224            return NULL;
6225    }
6226
6227    make_encode_exception(exceptionObject,
6228                          encoding, unicode, size, startpos, endpos, reason);
6229    if (*exceptionObject == NULL)
6230        return NULL;
6231
6232    restuple = PyObject_CallFunctionObjArgs(
6233        *errorHandler, *exceptionObject, NULL);
6234    if (restuple == NULL)
6235        return NULL;
6236    if (!PyTuple_Check(restuple)) {
6237        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6238        Py_DECREF(restuple);
6239        return NULL;
6240    }
6241    if (!PyArg_ParseTuple(restuple, argparse,
6242                          &resunicode, newpos)) {
6243        Py_DECREF(restuple);
6244        return NULL;
6245    }
6246    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6247        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6248        Py_DECREF(restuple);
6249        return NULL;
6250    }
6251    if (*newpos<0)
6252        *newpos = size+*newpos;
6253    if (*newpos<0 || *newpos>size) {
6254        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6255        Py_DECREF(restuple);
6256        return NULL;
6257    }
6258    Py_INCREF(resunicode);
6259    Py_DECREF(restuple);
6260    return resunicode;
6261}
6262
6263static PyObject *
6264unicode_encode_ucs1(const Py_UNICODE *p,
6265                    Py_ssize_t size,
6266                    const char *errors,
6267                    int limit)
6268{
6269    /* output object */
6270    PyObject *res;
6271    /* pointers to the beginning and end+1 of input */
6272    const Py_UNICODE *startp = p;
6273    const Py_UNICODE *endp = p + size;
6274    /* pointer to the beginning of the unencodable characters */
6275    /* const Py_UNICODE *badp = NULL; */
6276    /* pointer into the output */
6277    char *str;
6278    /* current output position */
6279    Py_ssize_t ressize;
6280    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6281    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6282    PyObject *errorHandler = NULL;
6283    PyObject *exc = NULL;
6284    /* the following variable is used for caching string comparisons
6285     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6286    int known_errorHandler = -1;
6287
6288    /* allocate enough for a simple encoding without
6289       replacements, if we need more, we'll resize */
6290    if (size == 0)
6291        return PyBytes_FromStringAndSize(NULL, 0);
6292    res = PyBytes_FromStringAndSize(NULL, size);
6293    if (res == NULL)
6294        return NULL;
6295    str = PyBytes_AS_STRING(res);
6296    ressize = size;
6297
6298    while (p<endp) {
6299        Py_UNICODE c = *p;
6300
6301        /* can we encode this? */
6302        if (c<limit) {
6303            /* no overflow check, because we know that the space is enough */
6304            *str++ = (char)c;
6305            ++p;
6306        }
6307        else {
6308            Py_ssize_t unicodepos = p-startp;
6309            Py_ssize_t requiredsize;
6310            PyObject *repunicode;
6311            Py_ssize_t repsize;
6312            Py_ssize_t newpos;
6313            Py_ssize_t respos;
6314            Py_UNICODE *uni2;
6315            /* startpos for collecting unencodable chars */
6316            const Py_UNICODE *collstart = p;
6317            const Py_UNICODE *collend = p;
6318            /* find all unecodable characters */
6319            while ((collend < endp) && ((*collend)>=limit))
6320                ++collend;
6321            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6322            if (known_errorHandler==-1) {
6323                if ((errors==NULL) || (!strcmp(errors, "strict")))
6324                    known_errorHandler = 1;
6325                else if (!strcmp(errors, "replace"))
6326                    known_errorHandler = 2;
6327                else if (!strcmp(errors, "ignore"))
6328                    known_errorHandler = 3;
6329                else if (!strcmp(errors, "xmlcharrefreplace"))
6330                    known_errorHandler = 4;
6331                else
6332                    known_errorHandler = 0;
6333            }
6334            switch (known_errorHandler) {
6335            case 1: /* strict */
6336                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
6337                goto onError;
6338            case 2: /* replace */
6339                while (collstart++<collend)
6340                    *str++ = '?'; /* fall through */
6341            case 3: /* ignore */
6342                p = collend;
6343                break;
6344            case 4: /* xmlcharrefreplace */
6345                respos = str - PyBytes_AS_STRING(res);
6346                /* determine replacement size (temporarily (mis)uses p) */
6347                for (p = collstart, repsize = 0; p < collend; ++p) {
6348                    if (*p<10)
6349                        repsize += 2+1+1;
6350                    else if (*p<100)
6351                        repsize += 2+2+1;
6352                    else if (*p<1000)
6353                        repsize += 2+3+1;
6354                    else if (*p<10000)
6355                        repsize += 2+4+1;
6356#ifndef Py_UNICODE_WIDE
6357                    else
6358                        repsize += 2+5+1;
6359#else
6360                    else if (*p<100000)
6361                        repsize += 2+5+1;
6362                    else if (*p<1000000)
6363                        repsize += 2+6+1;
6364                    else
6365                        repsize += 2+7+1;
6366#endif
6367                }
6368                requiredsize = respos+repsize+(endp-collend);
6369                if (requiredsize > ressize) {
6370                    if (requiredsize<2*ressize)
6371                        requiredsize = 2*ressize;
6372                    if (_PyBytes_Resize(&res, requiredsize))
6373                        goto onError;
6374                    str = PyBytes_AS_STRING(res) + respos;
6375                    ressize = requiredsize;
6376                }
6377                /* generate replacement (temporarily (mis)uses p) */
6378                for (p = collstart; p < collend; ++p) {
6379                    str += sprintf(str, "&#%d;", (int)*p);
6380                }
6381                p = collend;
6382                break;
6383            default:
6384                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6385                                                              encoding, reason, startp, size, &exc,
6386                                                              collstart-startp, collend-startp, &newpos);
6387                if (repunicode == NULL)
6388                    goto onError;
6389                if (PyBytes_Check(repunicode)) {
6390                    /* Directly copy bytes result to output. */
6391                    repsize = PyBytes_Size(repunicode);
6392                    if (repsize > 1) {
6393                        /* Make room for all additional bytes. */
6394                        respos = str - PyBytes_AS_STRING(res);
6395                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6396                            Py_DECREF(repunicode);
6397                            goto onError;
6398                        }
6399                        str = PyBytes_AS_STRING(res) + respos;
6400                        ressize += repsize-1;
6401                    }
6402                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6403                    str += repsize;
6404                    p = startp + newpos;
6405                    Py_DECREF(repunicode);
6406                    break;
6407                }
6408                /* need more space? (at least enough for what we
6409                   have+the replacement+the rest of the string, so
6410                   we won't have to check space for encodable characters) */
6411                respos = str - PyBytes_AS_STRING(res);
6412                repsize = PyUnicode_GET_SIZE(repunicode);
6413                requiredsize = respos+repsize+(endp-collend);
6414                if (requiredsize > ressize) {
6415                    if (requiredsize<2*ressize)
6416                        requiredsize = 2*ressize;
6417                    if (_PyBytes_Resize(&res, requiredsize)) {
6418                        Py_DECREF(repunicode);
6419                        goto onError;
6420                    }
6421                    str = PyBytes_AS_STRING(res) + respos;
6422                    ressize = requiredsize;
6423                }
6424                /* check if there is anything unencodable in the replacement
6425                   and copy it to the output */
6426                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
6427                    c = *uni2;
6428                    if (c >= limit) {
6429                        raise_encode_exception(&exc, encoding, startp, size,
6430                                               unicodepos, unicodepos+1, reason);
6431                        Py_DECREF(repunicode);
6432                        goto onError;
6433                    }
6434                    *str = (char)c;
6435                }
6436                p = startp + newpos;
6437                Py_DECREF(repunicode);
6438            }
6439        }
6440    }
6441    /* Resize if we allocated to much */
6442    size = str - PyBytes_AS_STRING(res);
6443    if (size < ressize) { /* If this falls res will be NULL */
6444        assert(size >= 0);
6445        if (_PyBytes_Resize(&res, size) < 0)
6446            goto onError;
6447    }
6448
6449    Py_XDECREF(errorHandler);
6450    Py_XDECREF(exc);
6451    return res;
6452
6453  onError:
6454    Py_XDECREF(res);
6455    Py_XDECREF(errorHandler);
6456    Py_XDECREF(exc);
6457    return NULL;
6458}
6459
6460PyObject *
6461PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6462                       Py_ssize_t size,
6463                       const char *errors)
6464{
6465    return unicode_encode_ucs1(p, size, errors, 256);
6466}
6467
6468PyObject *
6469_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6470{
6471    if (!PyUnicode_Check(unicode)) {
6472        PyErr_BadArgument();
6473        return NULL;
6474    }
6475    if (PyUnicode_READY(unicode) == -1)
6476        return NULL;
6477    /* Fast path: if it is a one-byte string, construct
6478       bytes object directly. */
6479    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6480        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6481                                         PyUnicode_GET_LENGTH(unicode));
6482    /* Non-Latin-1 characters present. Defer to above function to
6483       raise the exception. */
6484    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
6485                                  PyUnicode_GET_SIZE(unicode),
6486                                  errors);
6487}
6488
6489PyObject*
6490PyUnicode_AsLatin1String(PyObject *unicode)
6491{
6492    return _PyUnicode_AsLatin1String(unicode, NULL);
6493}
6494
6495/* --- 7-bit ASCII Codec -------------------------------------------------- */
6496
6497PyObject *
6498PyUnicode_DecodeASCII(const char *s,
6499                      Py_ssize_t size,
6500                      const char *errors)
6501{
6502    const char *starts = s;
6503    PyUnicodeObject *v;
6504    Py_UNICODE *u;
6505    Py_ssize_t startinpos;
6506    Py_ssize_t endinpos;
6507    Py_ssize_t outpos;
6508    const char *e;
6509    int has_error;
6510    const unsigned char *p = (const unsigned char *)s;
6511    const unsigned char *end = p + size;
6512    const unsigned char *aligned_end = (const unsigned char *) ((size_t) end & ~LONG_PTR_MASK);
6513    PyObject *errorHandler = NULL;
6514    PyObject *exc = NULL;
6515
6516    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6517    if (size == 1 && (unsigned char)s[0] < 128)
6518        return get_latin1_char((unsigned char)s[0]);
6519
6520    has_error = 0;
6521    while (p < end && !has_error) {
6522        /* Fast path, see below in PyUnicode_DecodeUTF8Stateful for
6523           an explanation. */
6524        if (!((size_t) p & LONG_PTR_MASK)) {
6525            /* Help register allocation */
6526            register const unsigned char *_p = p;
6527            while (_p < aligned_end) {
6528                unsigned long value = *(unsigned long *) _p;
6529                if (value & ASCII_CHAR_MASK) {
6530                    has_error = 1;
6531                    break;
6532                }
6533                _p += SIZEOF_LONG;
6534            }
6535            if (_p == end)
6536                break;
6537            if (has_error)
6538                break;
6539            p = _p;
6540        }
6541        if (*p & 0x80) {
6542            has_error = 1;
6543            break;
6544        }
6545        else {
6546            ++p;
6547        }
6548    }
6549    if (!has_error)
6550        return unicode_fromascii((const unsigned char *)s, size);
6551
6552    v = _PyUnicode_New(size);
6553    if (v == NULL)
6554        goto onError;
6555    if (size == 0)
6556        return (PyObject *)v;
6557    u = PyUnicode_AS_UNICODE(v);
6558    e = s + size;
6559    while (s < e) {
6560        register unsigned char c = (unsigned char)*s;
6561        if (c < 128) {
6562            *u++ = c;
6563            ++s;
6564        }
6565        else {
6566            startinpos = s-starts;
6567            endinpos = startinpos + 1;
6568            outpos = u - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
6569            if (unicode_decode_call_errorhandler(
6570                    errors, &errorHandler,
6571                    "ascii", "ordinal not in range(128)",
6572                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6573                    &v, &outpos, &u))
6574                goto onError;
6575        }
6576    }
6577    if (u - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
6578        if (PyUnicode_Resize((PyObject**)&v, u - PyUnicode_AS_UNICODE(v)) < 0)
6579            goto onError;
6580    Py_XDECREF(errorHandler);
6581    Py_XDECREF(exc);
6582#ifndef DONT_MAKE_RESULT_READY
6583    if (_PyUnicode_READY_REPLACE(&v)) {
6584        Py_DECREF(v);
6585        return NULL;
6586    }
6587#endif
6588    return (PyObject *)v;
6589
6590  onError:
6591    Py_XDECREF(v);
6592    Py_XDECREF(errorHandler);
6593    Py_XDECREF(exc);
6594    return NULL;
6595}
6596
6597PyObject *
6598PyUnicode_EncodeASCII(const Py_UNICODE *p,
6599                      Py_ssize_t size,
6600                      const char *errors)
6601{
6602    return unicode_encode_ucs1(p, size, errors, 128);
6603}
6604
6605PyObject *
6606_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6607{
6608    if (!PyUnicode_Check(unicode)) {
6609        PyErr_BadArgument();
6610        return NULL;
6611    }
6612    if (PyUnicode_READY(unicode) == -1)
6613        return NULL;
6614    /* Fast path: if it is an ASCII-only string, construct bytes object
6615       directly. Else defer to above function to raise the exception. */
6616    if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
6617        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6618                                         PyUnicode_GET_LENGTH(unicode));
6619    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
6620                                 PyUnicode_GET_SIZE(unicode),
6621                                 errors);
6622}
6623
6624PyObject *
6625PyUnicode_AsASCIIString(PyObject *unicode)
6626{
6627    return _PyUnicode_AsASCIIString(unicode, NULL);
6628}
6629
6630#ifdef HAVE_MBCS
6631
6632/* --- MBCS codecs for Windows -------------------------------------------- */
6633
6634#if SIZEOF_INT < SIZEOF_SIZE_T
6635#define NEED_RETRY
6636#endif
6637
6638/* XXX This code is limited to "true" double-byte encodings, as
6639   a) it assumes an incomplete character consists of a single byte, and
6640   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
6641   encodings, see IsDBCSLeadByteEx documentation. */
6642
/* Return 1 if the byte at s[offset] is a DBCS lead byte that starts a
   character (rather than being the trail byte of the previous one),
   0 otherwise. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* curr is the first character of the string */
    if (prev == curr)
        return 1;
    /* the preceding character does not begin with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the preceding character is a complete two-byte character */
    return curr - prev == 2;
}
6654
6655/*
6656 * Decode MBCS string into unicode object. If 'final' is set, converts
6657 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
6658 */
6659static int
6660decode_mbcs(PyUnicodeObject **v,
6661            const char *s, /* MBCS string */
6662            int size, /* sizeof MBCS string */
6663            int final,
6664            const char *errors)
6665{
6666    Py_UNICODE *p;
6667    Py_ssize_t n;
6668    DWORD usize;
6669    DWORD flags;
6670
6671    assert(size >= 0);
6672
6673    /* check and handle 'errors' arg */
6674    if (errors==NULL || strcmp(errors, "strict")==0)
6675        flags = MB_ERR_INVALID_CHARS;
6676    else if (strcmp(errors, "ignore")==0)
6677        flags = 0;
6678    else {
6679        PyErr_Format(PyExc_ValueError,
6680                     "mbcs encoding does not support errors='%s'",
6681                     errors);
6682        return -1;
6683    }
6684
6685    /* Skip trailing lead-byte unless 'final' is set */
6686    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
6687        --size;
6688
6689    /* First get the size of the result */
6690    if (size > 0) {
6691        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
6692        if (usize==0)
6693            goto mbcs_decode_error;
6694    } else
6695        usize = 0;
6696
6697    if (*v == NULL) {
6698        /* Create unicode object */
6699        *v = _PyUnicode_New(usize);
6700        if (*v == NULL)
6701            return -1;
6702        n = 0;
6703    }
6704    else {
6705        /* Extend unicode object */
6706        n = PyUnicode_GET_SIZE(*v);
6707        if (PyUnicode_Resize((PyObject**)v, n + usize) < 0)
6708            return -1;
6709    }
6710
6711    /* Do the conversion */
6712    if (usize > 0) {
6713        p = PyUnicode_AS_UNICODE(*v) + n;
6714        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
6715            goto mbcs_decode_error;
6716        }
6717    }
6718    return size;
6719
6720mbcs_decode_error:
6721    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
6722       we raise a UnicodeDecodeError - else it is a 'generic'
6723       windows error
6724     */
6725    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
6726        /* Ideally, we should get reason from FormatMessage - this
6727           is the Windows 2000 English version of the message
6728        */
6729        PyObject *exc = NULL;
6730        const char *reason = "No mapping for the Unicode character exists "
6731                             "in the target multi-byte code page.";
6732        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
6733        if (exc != NULL) {
6734            PyCodec_StrictErrors(exc);
6735            Py_DECREF(exc);
6736        }
6737    } else {
6738        PyErr_SetFromWindowsErrWithFilename(0, NULL);
6739    }
6740    return -1;
6741}
6742
6743PyObject *
6744PyUnicode_DecodeMBCSStateful(const char *s,
6745                             Py_ssize_t size,
6746                             const char *errors,
6747                             Py_ssize_t *consumed)
6748{
6749    PyUnicodeObject *v = NULL;
6750    int done;
6751
6752    if (consumed)
6753        *consumed = 0;
6754
6755#ifdef NEED_RETRY
6756  retry:
6757    if (size > INT_MAX)
6758        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
6759    else
6760#endif
6761        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
6762
6763    if (done < 0) {
6764        Py_XDECREF(v);
6765        return NULL;
6766    }
6767
6768    if (consumed)
6769        *consumed += done;
6770
6771#ifdef NEED_RETRY
6772    if (size > INT_MAX) {
6773        s += done;
6774        size -= done;
6775        goto retry;
6776    }
6777#endif
6778#ifndef DONT_MAKE_RESULT_READY
6779    if (_PyUnicode_READY_REPLACE(&v)) {
6780        Py_DECREF(v);
6781        return NULL;
6782    }
6783#endif
6784    return (PyObject *)v;
6785}
6786
6787PyObject *
6788PyUnicode_DecodeMBCS(const char *s,
6789                     Py_ssize_t size,
6790                     const char *errors)
6791{
6792    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6793}
6794
6795/*
6796 * Convert unicode into string object (MBCS).
6797 * Returns 0 if succeed, -1 otherwise.
6798 */
6799static int
6800encode_mbcs(PyObject **repr,
6801            const Py_UNICODE *p, /* unicode */
6802            int size, /* size of unicode */
6803            const char* errors)
6804{
6805    BOOL usedDefaultChar = FALSE;
6806    BOOL *pusedDefaultChar;
6807    int mbcssize;
6808    Py_ssize_t n;
6809    PyObject *exc = NULL;
6810    DWORD flags;
6811
6812    assert(size >= 0);
6813
6814    /* check and handle 'errors' arg */
6815    if (errors==NULL || strcmp(errors, "strict")==0) {
6816        flags = WC_NO_BEST_FIT_CHARS;
6817        pusedDefaultChar = &usedDefaultChar;
6818    } else if (strcmp(errors, "replace")==0) {
6819        flags = 0;
6820        pusedDefaultChar = NULL;
6821    } else {
6822         PyErr_Format(PyExc_ValueError,
6823                      "mbcs encoding does not support errors='%s'",
6824                      errors);
6825         return -1;
6826    }
6827
6828    /* First get the size of the result */
6829    if (size > 0) {
6830        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
6831                                       NULL, pusedDefaultChar);
6832        if (mbcssize == 0) {
6833            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6834            return -1;
6835        }
6836        /* If we used a default char, then we failed! */
6837        if (pusedDefaultChar && *pusedDefaultChar)
6838            goto mbcs_encode_error;
6839    } else {
6840        mbcssize = 0;
6841    }
6842
6843    if (*repr == NULL) {
6844        /* Create string object */
6845        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
6846        if (*repr == NULL)
6847            return -1;
6848        n = 0;
6849    }
6850    else {
6851        /* Extend string object */
6852        n = PyBytes_Size(*repr);
6853        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
6854            return -1;
6855    }
6856
6857    /* Do the conversion */
6858    if (size > 0) {
6859        char *s = PyBytes_AS_STRING(*repr) + n;
6860        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
6861                                     NULL, pusedDefaultChar)) {
6862            PyErr_SetFromWindowsErrWithFilename(0, NULL);
6863            return -1;
6864        }
6865        if (pusedDefaultChar && *pusedDefaultChar)
6866            goto mbcs_encode_error;
6867    }
6868    return 0;
6869
6870mbcs_encode_error:
6871    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
6872    Py_XDECREF(exc);
6873    return -1;
6874}
6875
6876PyObject *
6877PyUnicode_EncodeMBCS(const Py_UNICODE *p,
6878                     Py_ssize_t size,
6879                     const char *errors)
6880{
6881    PyObject *repr = NULL;
6882    int ret;
6883
6884#ifdef NEED_RETRY
6885  retry:
6886    if (size > INT_MAX)
6887        ret = encode_mbcs(&repr, p, INT_MAX, errors);
6888    else
6889#endif
6890        ret = encode_mbcs(&repr, p, (int)size, errors);
6891
6892    if (ret < 0) {
6893        Py_XDECREF(repr);
6894        return NULL;
6895    }
6896
6897#ifdef NEED_RETRY
6898    if (size > INT_MAX) {
6899        p += INT_MAX;
6900        size -= INT_MAX;
6901        goto retry;
6902    }
6903#endif
6904
6905    return repr;
6906}
6907
6908PyObject *
6909PyUnicode_AsMBCSString(PyObject *unicode)
6910{
6911    if (!PyUnicode_Check(unicode)) {
6912        PyErr_BadArgument();
6913        return NULL;
6914    }
6915    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
6916                                PyUnicode_GET_SIZE(unicode),
6917                                NULL);
6918}
6919
6920#undef NEED_RETRY
6921
6922#endif /* HAVE_MBCS */
6923
6924/* --- Character Mapping Codec -------------------------------------------- */
6925
6926PyObject *
6927PyUnicode_DecodeCharmap(const char *s,
6928                        Py_ssize_t size,
6929                        PyObject *mapping,
6930                        const char *errors)
6931{
6932    const char *starts = s;
6933    Py_ssize_t startinpos;
6934    Py_ssize_t endinpos;
6935    Py_ssize_t outpos;
6936    const char *e;
6937    PyUnicodeObject *v;
6938    Py_UNICODE *p;
6939    Py_ssize_t extrachars = 0;
6940    PyObject *errorHandler = NULL;
6941    PyObject *exc = NULL;
6942    Py_UNICODE *mapstring = NULL;
6943    Py_ssize_t maplen = 0;
6944
6945    /* Default to Latin-1 */
6946    if (mapping == NULL)
6947        return PyUnicode_DecodeLatin1(s, size, errors);
6948
6949    v = _PyUnicode_New(size);
6950    if (v == NULL)
6951        goto onError;
6952    if (size == 0)
6953        return (PyObject *)v;
6954    p = PyUnicode_AS_UNICODE(v);
6955    e = s + size;
6956    if (PyUnicode_CheckExact(mapping)) {
6957        mapstring = PyUnicode_AS_UNICODE(mapping);
6958        maplen = PyUnicode_GET_SIZE(mapping);
6959        while (s < e) {
6960            unsigned char ch = *s;
6961            Py_UNICODE x = 0xfffe; /* illegal value */
6962
6963            if (ch < maplen)
6964                x = mapstring[ch];
6965
6966            if (x == 0xfffe) {
6967                /* undefined mapping */
6968                outpos = p-PyUnicode_AS_UNICODE(v);
6969                startinpos = s-starts;
6970                endinpos = startinpos+1;
6971                if (unicode_decode_call_errorhandler(
6972                        errors, &errorHandler,
6973                        "charmap", "character maps to <undefined>",
6974                        &starts, &e, &startinpos, &endinpos, &exc, &s,
6975                        &v, &outpos, &p)) {
6976                    goto onError;
6977                }
6978                continue;
6979            }
6980            *p++ = x;
6981            ++s;
6982        }
6983    }
6984    else {
6985        while (s < e) {
6986            unsigned char ch = *s;
6987            PyObject *w, *x;
6988
6989            /* Get mapping (char ordinal -> integer, Unicode char or None) */
6990            w = PyLong_FromLong((long)ch);
6991            if (w == NULL)
6992                goto onError;
6993            x = PyObject_GetItem(mapping, w);
6994            Py_DECREF(w);
6995            if (x == NULL) {
6996                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6997                    /* No mapping found means: mapping is undefined. */
6998                    PyErr_Clear();
6999                    x = Py_None;
7000                    Py_INCREF(x);
7001                } else
7002                    goto onError;
7003            }
7004
7005            /* Apply mapping */
7006            if (PyLong_Check(x)) {
7007                long value = PyLong_AS_LONG(x);
7008                if (value < 0 || value > 65535) {
7009                    PyErr_SetString(PyExc_TypeError,
7010                                    "character mapping must be in range(65536)");
7011                    Py_DECREF(x);
7012                    goto onError;
7013                }
7014                *p++ = (Py_UNICODE)value;
7015            }
7016            else if (x == Py_None) {
7017                /* undefined mapping */
7018                outpos = p-PyUnicode_AS_UNICODE(v);
7019                startinpos = s-starts;
7020                endinpos = startinpos+1;
7021                if (unicode_decode_call_errorhandler(
7022                        errors, &errorHandler,
7023                        "charmap", "character maps to <undefined>",
7024                        &starts, &e, &startinpos, &endinpos, &exc, &s,
7025                        &v, &outpos, &p)) {
7026                    Py_DECREF(x);
7027                    goto onError;
7028                }
7029                Py_DECREF(x);
7030                continue;
7031            }
7032            else if (PyUnicode_Check(x)) {
7033                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
7034
7035                if (targetsize == 1)
7036                    /* 1-1 mapping */
7037                    *p++ = *PyUnicode_AS_UNICODE(x);
7038
7039                else if (targetsize > 1) {
7040                    /* 1-n mapping */
7041                    if (targetsize > extrachars) {
7042                        /* resize first */
7043                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
7044                        Py_ssize_t needed = (targetsize - extrachars) + \
7045                            (targetsize << 2);
7046                        extrachars += needed;
7047                        /* XXX overflow detection missing */
7048                        if (PyUnicode_Resize((PyObject**)&v,
7049                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
7050                            Py_DECREF(x);
7051                            goto onError;
7052                        }
7053                        p = PyUnicode_AS_UNICODE(v) + oldpos;
7054                    }
7055                    Py_UNICODE_COPY(p,
7056                                    PyUnicode_AS_UNICODE(x),
7057                                    targetsize);
7058                    p += targetsize;
7059                    extrachars -= targetsize;
7060                }
7061                /* 1-0 mapping: skip the character */
7062            }
7063            else {
7064                /* wrong return value */
7065                PyErr_SetString(PyExc_TypeError,
7066                                "character mapping must return integer, None or str");
7067                Py_DECREF(x);
7068                goto onError;
7069            }
7070            Py_DECREF(x);
7071            ++s;
7072        }
7073    }
7074    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
7075        if (PyUnicode_Resize((PyObject**)&v, p - PyUnicode_AS_UNICODE(v)) < 0)
7076            goto onError;
7077    Py_XDECREF(errorHandler);
7078    Py_XDECREF(exc);
7079#ifndef DONT_MAKE_RESULT_READY
7080    if (_PyUnicode_READY_REPLACE(&v)) {
7081        Py_DECREF(v);
7082        return NULL;
7083    }
7084#endif
7085    return (PyObject *)v;
7086
7087  onError:
7088    Py_XDECREF(errorHandler);
7089    Py_XDECREF(exc);
7090    Py_XDECREF(v);
7091    return NULL;
7092}
7093
7094/* Charmap encoding: the lookup table */
7095
7096struct encoding_map {
7097    PyObject_HEAD
7098    unsigned char level1[32];
7099    int count2, count3;
7100    unsigned char level23[1];
7101};
7102
7103static PyObject*
7104encoding_map_size(PyObject *obj, PyObject* args)
7105{
7106    struct encoding_map *map = (struct encoding_map*)obj;
7107    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7108                           128*map->count3);
7109}
7110
7111static PyMethodDef encoding_map_methods[] = {
7112    {"size", encoding_map_size, METH_NOARGS,
7113     PyDoc_STR("Return the size (in bytes) of this object") },
7114    { 0 }
7115};
7116
7117static void
7118encoding_map_dealloc(PyObject* o)
7119{
7120    PyObject_FREE(o);
7121}
7122
7123static PyTypeObject EncodingMapType = {
7124    PyVarObject_HEAD_INIT(NULL, 0)
7125    "EncodingMap",          /*tp_name*/
7126    sizeof(struct encoding_map),   /*tp_basicsize*/
7127    0,                      /*tp_itemsize*/
7128    /* methods */
7129    encoding_map_dealloc,   /*tp_dealloc*/
7130    0,                      /*tp_print*/
7131    0,                      /*tp_getattr*/
7132    0,                      /*tp_setattr*/
7133    0,                      /*tp_reserved*/
7134    0,                      /*tp_repr*/
7135    0,                      /*tp_as_number*/
7136    0,                      /*tp_as_sequence*/
7137    0,                      /*tp_as_mapping*/
7138    0,                      /*tp_hash*/
7139    0,                      /*tp_call*/
7140    0,                      /*tp_str*/
7141    0,                      /*tp_getattro*/
7142    0,                      /*tp_setattro*/
7143    0,                      /*tp_as_buffer*/
7144    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7145    0,                      /*tp_doc*/
7146    0,                      /*tp_traverse*/
7147    0,                      /*tp_clear*/
7148    0,                      /*tp_richcompare*/
7149    0,                      /*tp_weaklistoffset*/
7150    0,                      /*tp_iter*/
7151    0,                      /*tp_iternext*/
7152    encoding_map_methods,   /*tp_methods*/
7153    0,                      /*tp_members*/
7154    0,                      /*tp_getset*/
7155    0,                      /*tp_base*/
7156    0,                      /*tp_dict*/
7157    0,                      /*tp_descr_get*/
7158    0,                      /*tp_descr_set*/
7159    0,                      /*tp_dictoffset*/
7160    0,                      /*tp_init*/
7161    0,                      /*tp_alloc*/
7162    0,                      /*tp_new*/
7163    0,                      /*tp_free*/
7164    0,                      /*tp_is_gc*/
7165};
7166
7167PyObject*
7168PyUnicode_BuildEncodingMap(PyObject* string)
7169{
7170    PyObject *result;
7171    struct encoding_map *mresult;
7172    int i;
7173    int need_dict = 0;
7174    unsigned char level1[32];
7175    unsigned char level2[512];
7176    unsigned char *mlevel1, *mlevel2, *mlevel3;
7177    int count2 = 0, count3 = 0;
7178    int kind;
7179    void *data;
7180    Py_UCS4 ch;
7181
7182    if (!PyUnicode_Check(string) || PyUnicode_GET_LENGTH(string) != 256) {
7183        PyErr_BadArgument();
7184        return NULL;
7185    }
7186    kind = PyUnicode_KIND(string);
7187    data = PyUnicode_DATA(string);
7188    memset(level1, 0xFF, sizeof level1);
7189    memset(level2, 0xFF, sizeof level2);
7190
7191    /* If there isn't a one-to-one mapping of NULL to \0,
7192       or if there are non-BMP characters, we need to use
7193       a mapping dictionary. */
7194    if (PyUnicode_READ(kind, data, 0) != 0)
7195        need_dict = 1;
7196    for (i = 1; i < 256; i++) {
7197        int l1, l2;
7198        ch = PyUnicode_READ(kind, data, i);
7199        if (ch == 0 || ch > 0xFFFF) {
7200            need_dict = 1;
7201            break;
7202        }
7203        if (ch == 0xFFFE)
7204            /* unmapped character */
7205            continue;
7206        l1 = ch >> 11;
7207        l2 = ch >> 7;
7208        if (level1[l1] == 0xFF)
7209            level1[l1] = count2++;
7210        if (level2[l2] == 0xFF)
7211            level2[l2] = count3++;
7212    }
7213
7214    if (count2 >= 0xFF || count3 >= 0xFF)
7215        need_dict = 1;
7216
7217    if (need_dict) {
7218        PyObject *result = PyDict_New();
7219        PyObject *key, *value;
7220        if (!result)
7221            return NULL;
7222        for (i = 0; i < 256; i++) {
7223            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7224            value = PyLong_FromLong(i);
7225            if (!key || !value)
7226                goto failed1;
7227            if (PyDict_SetItem(result, key, value) == -1)
7228                goto failed1;
7229            Py_DECREF(key);
7230            Py_DECREF(value);
7231        }
7232        return result;
7233      failed1:
7234        Py_XDECREF(key);
7235        Py_XDECREF(value);
7236        Py_DECREF(result);
7237        return NULL;
7238    }
7239
7240    /* Create a three-level trie */
7241    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7242                             16*count2 + 128*count3 - 1);
7243    if (!result)
7244        return PyErr_NoMemory();
7245    PyObject_Init(result, &EncodingMapType);
7246    mresult = (struct encoding_map*)result;
7247    mresult->count2 = count2;
7248    mresult->count3 = count3;
7249    mlevel1 = mresult->level1;
7250    mlevel2 = mresult->level23;
7251    mlevel3 = mresult->level23 + 16*count2;
7252    memcpy(mlevel1, level1, 32);
7253    memset(mlevel2, 0xFF, 16*count2);
7254    memset(mlevel3, 0, 128*count3);
7255    count3 = 0;
7256    for (i = 1; i < 256; i++) {
7257        int o1, o2, o3, i2, i3;
7258        if (PyUnicode_READ(kind, data, i) == 0xFFFE)
7259            /* unmapped character */
7260            continue;
7261        o1 = PyUnicode_READ(kind, data, i)>>11;
7262        o2 = (PyUnicode_READ(kind, data, i)>>7) & 0xF;
7263        i2 = 16*mlevel1[o1] + o2;
7264        if (mlevel2[i2] == 0xFF)
7265            mlevel2[i2] = count3++;
7266        o3 = PyUnicode_READ(kind, data, i) & 0x7F;
7267        i3 = 128*mlevel2[i2] + o3;
7268        mlevel3[i3] = i;
7269    }
7270    return result;
7271}
7272
7273static int
7274encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
7275{
7276    struct encoding_map *map = (struct encoding_map*)mapping;
7277    int l1 = c>>11;
7278    int l2 = (c>>7) & 0xF;
7279    int l3 = c & 0x7F;
7280    int i;
7281
7282#ifdef Py_UNICODE_WIDE
7283    if (c > 0xFFFF) {
7284        return -1;
7285    }
7286#endif
7287    if (c == 0)
7288        return 0;
7289    /* level 1*/
7290    i = map->level1[l1];
7291    if (i == 0xFF) {
7292        return -1;
7293    }
7294    /* level 2*/
7295    i = map->level23[16*i+l2];
7296    if (i == 0xFF) {
7297        return -1;
7298    }
7299    /* level 3 */
7300    i = map->level23[16*map->count2 + 128*i + l3];
7301    if (i == 0) {
7302        return -1;
7303    }
7304    return i;
7305}
7306
7307/* Lookup the character ch in the mapping. If the character
7308   can't be found, Py_None is returned (or NULL, if another
7309   error occurred). */
7310static PyObject *
7311charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
7312{
7313    PyObject *w = PyLong_FromLong((long)c);
7314    PyObject *x;
7315
7316    if (w == NULL)
7317        return NULL;
7318    x = PyObject_GetItem(mapping, w);
7319    Py_DECREF(w);
7320    if (x == NULL) {
7321        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7322            /* No mapping found means: mapping is undefined. */
7323            PyErr_Clear();
7324            x = Py_None;
7325            Py_INCREF(x);
7326            return x;
7327        } else
7328            return NULL;
7329    }
7330    else if (x == Py_None)
7331        return x;
7332    else if (PyLong_Check(x)) {
7333        long value = PyLong_AS_LONG(x);
7334        if (value < 0 || value > 255) {
7335            PyErr_SetString(PyExc_TypeError,
7336                            "character mapping must be in range(256)");
7337            Py_DECREF(x);
7338            return NULL;
7339        }
7340        return x;
7341    }
7342    else if (PyBytes_Check(x))
7343        return x;
7344    else {
7345        /* wrong return value */
7346        PyErr_Format(PyExc_TypeError,
7347                     "character mapping must return integer, bytes or None, not %.400s",
7348                     x->ob_type->tp_name);
7349        Py_DECREF(x);
7350        return NULL;
7351    }
7352}
7353
7354static int
7355charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7356{
7357    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7358    /* exponentially overallocate to minimize reallocations */
7359    if (requiredsize < 2*outsize)
7360        requiredsize = 2*outsize;
7361    if (_PyBytes_Resize(outobj, requiredsize))
7362        return -1;
7363    return 0;
7364}
7365
/* Outcome of one charmapencode_output() call */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
7369/* lookup the character, put the result in the output string and adjust
7370   various state variables. Resize the output bytes object if not enough
7371   space is available. Return a new reference to the object that
7372   was put in the output buffer, or Py_None, if the mapping was undefined
7373   (in which case no character was written) or NULL, if a
7374   reallocation error occurred. The caller must decref the result */
7375static charmapencode_result
7376charmapencode_output(Py_UNICODE c, PyObject *mapping,
7377                     PyObject **outobj, Py_ssize_t *outpos)
7378{
7379    PyObject *rep;
7380    char *outstart;
7381    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7382
7383    if (Py_TYPE(mapping) == &EncodingMapType) {
7384        int res = encoding_map_lookup(c, mapping);
7385        Py_ssize_t requiredsize = *outpos+1;
7386        if (res == -1)
7387            return enc_FAILED;
7388        if (outsize<requiredsize)
7389            if (charmapencode_resize(outobj, outpos, requiredsize))
7390                return enc_EXCEPTION;
7391        outstart = PyBytes_AS_STRING(*outobj);
7392        outstart[(*outpos)++] = (char)res;
7393        return enc_SUCCESS;
7394    }
7395
7396    rep = charmapencode_lookup(c, mapping);
7397    if (rep==NULL)
7398        return enc_EXCEPTION;
7399    else if (rep==Py_None) {
7400        Py_DECREF(rep);
7401        return enc_FAILED;
7402    } else {
7403        if (PyLong_Check(rep)) {
7404            Py_ssize_t requiredsize = *outpos+1;
7405            if (outsize<requiredsize)
7406                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7407                    Py_DECREF(rep);
7408                    return enc_EXCEPTION;
7409                }
7410            outstart = PyBytes_AS_STRING(*outobj);
7411            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7412        }
7413        else {
7414            const char *repchars = PyBytes_AS_STRING(rep);
7415            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7416            Py_ssize_t requiredsize = *outpos+repsize;
7417            if (outsize<requiredsize)
7418                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7419                    Py_DECREF(rep);
7420                    return enc_EXCEPTION;
7421                }
7422            outstart = PyBytes_AS_STRING(*outobj);
7423            memcpy(outstart + *outpos, repchars, repsize);
7424            *outpos += repsize;
7425        }
7426    }
7427    Py_DECREF(rep);
7428    return enc_SUCCESS;
7429}
7430
7431/* handle an error in PyUnicode_EncodeCharmap
7432   Return 0 on success, -1 on error */
7433static int
7434charmap_encoding_error(
7435    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
7436    PyObject **exceptionObject,
7437    int *known_errorHandler, PyObject **errorHandler, const char *errors,
7438    PyObject **res, Py_ssize_t *respos)
7439{
7440    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7441    Py_ssize_t repsize;
7442    Py_ssize_t newpos;
7443    Py_UNICODE *uni2;
7444    /* startpos for collecting unencodable chars */
7445    Py_ssize_t collstartpos = *inpos;
7446    Py_ssize_t collendpos = *inpos+1;
7447    Py_ssize_t collpos;
7448    char *encoding = "charmap";
7449    char *reason = "character maps to <undefined>";
7450    charmapencode_result x;
7451
7452    /* find all unencodable characters */
7453    while (collendpos < size) {
7454        PyObject *rep;
7455        if (Py_TYPE(mapping) == &EncodingMapType) {
7456            int res = encoding_map_lookup(p[collendpos], mapping);
7457            if (res != -1)
7458                break;
7459            ++collendpos;
7460            continue;
7461        }
7462
7463        rep = charmapencode_lookup(p[collendpos], mapping);
7464        if (rep==NULL)
7465            return -1;
7466        else if (rep!=Py_None) {
7467            Py_DECREF(rep);
7468            break;
7469        }
7470        Py_DECREF(rep);
7471        ++collendpos;
7472    }
7473    /* cache callback name lookup
7474     * (if not done yet, i.e. it's the first error) */
7475    if (*known_errorHandler==-1) {
7476        if ((errors==NULL) || (!strcmp(errors, "strict")))
7477            *known_errorHandler = 1;
7478        else if (!strcmp(errors, "replace"))
7479            *known_errorHandler = 2;
7480        else if (!strcmp(errors, "ignore"))
7481            *known_errorHandler = 3;
7482        else if (!strcmp(errors, "xmlcharrefreplace"))
7483            *known_errorHandler = 4;
7484        else
7485            *known_errorHandler = 0;
7486    }
7487    switch (*known_errorHandler) {
7488    case 1: /* strict */
7489        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7490        return -1;
7491    case 2: /* replace */
7492        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
7493            x = charmapencode_output('?', mapping, res, respos);
7494            if (x==enc_EXCEPTION) {
7495                return -1;
7496            }
7497            else if (x==enc_FAILED) {
7498                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7499                return -1;
7500            }
7501        }
7502        /* fall through */
7503    case 3: /* ignore */
7504        *inpos = collendpos;
7505        break;
7506    case 4: /* xmlcharrefreplace */
7507        /* generate replacement (temporarily (mis)uses p) */
7508        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
7509            char buffer[2+29+1+1];
7510            char *cp;
7511            sprintf(buffer, "&#%d;", (int)p[collpos]);
7512            for (cp = buffer; *cp; ++cp) {
7513                x = charmapencode_output(*cp, mapping, res, respos);
7514                if (x==enc_EXCEPTION)
7515                    return -1;
7516                else if (x==enc_FAILED) {
7517                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7518                    return -1;
7519                }
7520            }
7521        }
7522        *inpos = collendpos;
7523        break;
7524    default:
7525        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
7526                                                      encoding, reason, p, size, exceptionObject,
7527                                                      collstartpos, collendpos, &newpos);
7528        if (repunicode == NULL)
7529            return -1;
7530        if (PyBytes_Check(repunicode)) {
7531            /* Directly copy bytes result to output. */
7532            Py_ssize_t outsize = PyBytes_Size(*res);
7533            Py_ssize_t requiredsize;
7534            repsize = PyBytes_Size(repunicode);
7535            requiredsize = *respos + repsize;
7536            if (requiredsize > outsize)
7537                /* Make room for all additional bytes. */
7538                if (charmapencode_resize(res, respos, requiredsize)) {
7539                    Py_DECREF(repunicode);
7540                    return -1;
7541                }
7542            memcpy(PyBytes_AsString(*res) + *respos,
7543                   PyBytes_AsString(repunicode),  repsize);
7544            *respos += repsize;
7545            *inpos = newpos;
7546            Py_DECREF(repunicode);
7547            break;
7548        }
7549        /* generate replacement  */
7550        repsize = PyUnicode_GET_SIZE(repunicode);
7551        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
7552            x = charmapencode_output(*uni2, mapping, res, respos);
7553            if (x==enc_EXCEPTION) {
7554                return -1;
7555            }
7556            else if (x==enc_FAILED) {
7557                Py_DECREF(repunicode);
7558                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
7559                return -1;
7560            }
7561        }
7562        *inpos = newpos;
7563        Py_DECREF(repunicode);
7564    }
7565    return 0;
7566}
7567
7568PyObject *
7569PyUnicode_EncodeCharmap(const Py_UNICODE *p,
7570                        Py_ssize_t size,
7571                        PyObject *mapping,
7572                        const char *errors)
7573{
7574    /* output object */
7575    PyObject *res = NULL;
7576    /* current input position */
7577    Py_ssize_t inpos = 0;
7578    /* current output position */
7579    Py_ssize_t respos = 0;
7580    PyObject *errorHandler = NULL;
7581    PyObject *exc = NULL;
7582    /* the following variable is used for caching string comparisons
7583     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7584     * 3=ignore, 4=xmlcharrefreplace */
7585    int known_errorHandler = -1;
7586
7587    /* Default to Latin-1 */
7588    if (mapping == NULL)
7589        return PyUnicode_EncodeLatin1(p, size, errors);
7590
7591    /* allocate enough for a simple encoding without
7592       replacements, if we need more, we'll resize */
7593    res = PyBytes_FromStringAndSize(NULL, size);
7594    if (res == NULL)
7595        goto onError;
7596    if (size == 0)
7597        return res;
7598
7599    while (inpos<size) {
7600        /* try to encode it */
7601        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
7602        if (x==enc_EXCEPTION) /* error */
7603            goto onError;
7604        if (x==enc_FAILED) { /* unencodable character */
7605            if (charmap_encoding_error(p, size, &inpos, mapping,
7606                                       &exc,
7607                                       &known_errorHandler, &errorHandler, errors,
7608                                       &res, &respos)) {
7609                goto onError;
7610            }
7611        }
7612        else
7613            /* done with this character => adjust input position */
7614            ++inpos;
7615    }
7616
7617    /* Resize if we allocated to much */
7618    if (respos<PyBytes_GET_SIZE(res))
7619        if (_PyBytes_Resize(&res, respos) < 0)
7620            goto onError;
7621
7622    Py_XDECREF(exc);
7623    Py_XDECREF(errorHandler);
7624    return res;
7625
7626  onError:
7627    Py_XDECREF(res);
7628    Py_XDECREF(exc);
7629    Py_XDECREF(errorHandler);
7630    return NULL;
7631}
7632
7633PyObject *
7634PyUnicode_AsCharmapString(PyObject *unicode,
7635                          PyObject *mapping)
7636{
7637    if (!PyUnicode_Check(unicode) || mapping == NULL) {
7638        PyErr_BadArgument();
7639        return NULL;
7640    }
7641    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
7642                                   PyUnicode_GET_SIZE(unicode),
7643                                   mapping,
7644                                   NULL);
7645}
7646
7647/* create or adjust a UnicodeTranslateError */
7648static void
7649make_translate_exception(PyObject **exceptionObject,
7650                         PyObject *unicode,
7651                         Py_ssize_t startpos, Py_ssize_t endpos,
7652                         const char *reason)
7653{
7654    if (*exceptionObject == NULL) {
7655        *exceptionObject = _PyUnicodeTranslateError_Create(
7656            unicode, startpos, endpos, reason);
7657    }
7658    else {
7659        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
7660            goto onError;
7661        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
7662            goto onError;
7663        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
7664            goto onError;
7665        return;
7666      onError:
7667        Py_DECREF(*exceptionObject);
7668        *exceptionObject = NULL;
7669    }
7670}
7671
7672/* raises a UnicodeTranslateError */
7673static void
7674raise_translate_exception(PyObject **exceptionObject,
7675                          PyObject *unicode,
7676                          Py_ssize_t startpos, Py_ssize_t endpos,
7677                          const char *reason)
7678{
7679    make_translate_exception(exceptionObject,
7680                             unicode, startpos, endpos, reason);
7681    if (*exceptionObject != NULL)
7682        PyCodec_StrictErrors(*exceptionObject);
7683}
7684
7685/* error handling callback helper:
7686   build arguments, call the callback and check the arguments,
7687   put the result into newpos and return the replacement string, which
7688   has to be freed by the caller */
7689static PyObject *
7690unicode_translate_call_errorhandler(const char *errors,
7691                                    PyObject **errorHandler,
7692                                    const char *reason,
7693                                    PyObject *unicode, PyObject **exceptionObject,
7694                                    Py_ssize_t startpos, Py_ssize_t endpos,
7695                                    Py_ssize_t *newpos)
7696{
7697    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
7698
7699    Py_ssize_t i_newpos;
7700    PyObject *restuple;
7701    PyObject *resunicode;
7702
7703    if (*errorHandler == NULL) {
7704        *errorHandler = PyCodec_LookupError(errors);
7705        if (*errorHandler == NULL)
7706            return NULL;
7707    }
7708
7709    make_translate_exception(exceptionObject,
7710                             unicode, startpos, endpos, reason);
7711    if (*exceptionObject == NULL)
7712        return NULL;
7713
7714    restuple = PyObject_CallFunctionObjArgs(
7715        *errorHandler, *exceptionObject, NULL);
7716    if (restuple == NULL)
7717        return NULL;
7718    if (!PyTuple_Check(restuple)) {
7719        PyErr_SetString(PyExc_TypeError, &argparse[4]);
7720        Py_DECREF(restuple);
7721        return NULL;
7722    }
7723    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
7724                          &resunicode, &i_newpos)) {
7725        Py_DECREF(restuple);
7726        return NULL;
7727    }
7728    if (i_newpos<0)
7729        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
7730    else
7731        *newpos = i_newpos;
7732    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
7733        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
7734        Py_DECREF(restuple);
7735        return NULL;
7736    }
7737    Py_INCREF(resunicode);
7738    Py_DECREF(restuple);
7739    return resunicode;
7740}
7741
7742/* Lookup the character ch in the mapping and put the result in result,
7743   which must be decrefed by the caller.
7744   Return 0 on success, -1 on error */
7745static int
7746charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
7747{
7748    PyObject *w = PyLong_FromLong((long)c);
7749    PyObject *x;
7750
7751    if (w == NULL)
7752        return -1;
7753    x = PyObject_GetItem(mapping, w);
7754    Py_DECREF(w);
7755    if (x == NULL) {
7756        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7757            /* No mapping found means: use 1:1 mapping. */
7758            PyErr_Clear();
7759            *result = NULL;
7760            return 0;
7761        } else
7762            return -1;
7763    }
7764    else if (x == Py_None) {
7765        *result = x;
7766        return 0;
7767    }
7768    else if (PyLong_Check(x)) {
7769        long value = PyLong_AS_LONG(x);
7770        long max = PyUnicode_GetMax();
7771        if (value < 0 || value > max) {
7772            PyErr_Format(PyExc_TypeError,
7773                         "character mapping must be in range(0x%x)", max+1);
7774            Py_DECREF(x);
7775            return -1;
7776        }
7777        *result = x;
7778        return 0;
7779    }
7780    else if (PyUnicode_Check(x)) {
7781        *result = x;
7782        return 0;
7783    }
7784    else {
7785        /* wrong return value */
7786        PyErr_SetString(PyExc_TypeError,
7787                        "character mapping must return integer, None or str");
7788        Py_DECREF(x);
7789        return -1;
7790    }
7791}
7792/* ensure that *outobj is at least requiredsize characters long,
7793   if not reallocate and adjust various state variables.
7794   Return 0 on success, -1 on error */
7795static int
7796charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
7797                               Py_ssize_t requiredsize)
7798{
7799    Py_ssize_t oldsize = *psize;
7800    if (requiredsize > oldsize) {
7801        /* exponentially overallocate to minimize reallocations */
7802        if (requiredsize < 2 * oldsize)
7803            requiredsize = 2 * oldsize;
7804        *outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
7805        if (*outobj == 0)
7806            return -1;
7807        *psize = requiredsize;
7808    }
7809    return 0;
7810}
7811/* lookup the character, put the result in the output string and adjust
7812   various state variables. Return a new reference to the object that
7813   was put in the output buffer in *result, or Py_None, if the mapping was
7814   undefined (in which case no character was written).
7815   The called must decref result.
7816   Return 0 on success, -1 on error. */
7817static int
7818charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
7819                        PyObject *mapping, Py_UCS4 **output,
7820                        Py_ssize_t *osize, Py_ssize_t *opos,
7821                        PyObject **res)
7822{
7823    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
7824    if (charmaptranslate_lookup(curinp, mapping, res))
7825        return -1;
7826    if (*res==NULL) {
7827        /* not found => default to 1:1 mapping */
7828        (*output)[(*opos)++] = curinp;
7829    }
7830    else if (*res==Py_None)
7831        ;
7832    else if (PyLong_Check(*res)) {
7833        /* no overflow check, because we know that the space is enough */
7834        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
7835    }
7836    else if (PyUnicode_Check(*res)) {
7837        Py_ssize_t repsize;
7838        if (PyUnicode_READY(*res) == -1)
7839            return -1;
7840        repsize = PyUnicode_GET_LENGTH(*res);
7841        if (repsize==1) {
7842            /* no overflow check, because we know that the space is enough */
7843            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
7844        }
7845        else if (repsize!=0) {
7846            /* more than one character */
7847            Py_ssize_t requiredsize = *opos +
7848                (PyUnicode_GET_LENGTH(input) - ipos) +
7849                repsize - 1;
7850            Py_ssize_t i;
7851            if (charmaptranslate_makespace(output, osize, requiredsize))
7852                return -1;
7853            for(i = 0; i < repsize; i++)
7854                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
7855        }
7856    }
7857    else
7858        return -1;
7859    return 0;
7860}
7861
7862PyObject *
7863_PyUnicode_TranslateCharmap(PyObject *input,
7864                            PyObject *mapping,
7865                            const char *errors)
7866{
7867    /* input object */
7868    char *idata;
7869    Py_ssize_t size, i;
7870    int kind;
7871    /* output buffer */
7872    Py_UCS4 *output = NULL;
7873    Py_ssize_t osize;
7874    PyObject *res;
7875    /* current output position */
7876    Py_ssize_t opos;
7877    char *reason = "character maps to <undefined>";
7878    PyObject *errorHandler = NULL;
7879    PyObject *exc = NULL;
7880    /* the following variable is used for caching string comparisons
7881     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
7882     * 3=ignore, 4=xmlcharrefreplace */
7883    int known_errorHandler = -1;
7884
7885    if (mapping == NULL) {
7886        PyErr_BadArgument();
7887        return NULL;
7888    }
7889
7890    if (PyUnicode_READY(input) == -1)
7891        return NULL;
7892    idata = (char*)PyUnicode_DATA(input);
7893    kind = PyUnicode_KIND(input);
7894    size = PyUnicode_GET_LENGTH(input);
7895    i = 0;
7896
7897    if (size == 0) {
7898        Py_INCREF(input);
7899        return input;
7900    }
7901
7902    /* allocate enough for a simple 1:1 translation without
7903       replacements, if we need more, we'll resize */
7904    osize = size;
7905    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
7906    opos = 0;
7907    if (output == NULL) {
7908        PyErr_NoMemory();
7909        goto onError;
7910    }
7911
7912    while (i<size) {
7913        /* try to encode it */
7914        PyObject *x = NULL;
7915        if (charmaptranslate_output(input, i, mapping,
7916                                    &output, &osize, &opos, &x)) {
7917            Py_XDECREF(x);
7918            goto onError;
7919        }
7920        Py_XDECREF(x);
7921        if (x!=Py_None) /* it worked => adjust input pointer */
7922            ++i;
7923        else { /* untranslatable character */
7924            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
7925            Py_ssize_t repsize;
7926            Py_ssize_t newpos;
7927            Py_ssize_t uni2;
7928            /* startpos for collecting untranslatable chars */
7929            Py_ssize_t collstart = i;
7930            Py_ssize_t collend = i+1;
7931            Py_ssize_t coll;
7932
7933            /* find all untranslatable characters */
7934            while (collend < size) {
7935                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
7936                    goto onError;
7937                Py_XDECREF(x);
7938                if (x!=Py_None)
7939                    break;
7940                ++collend;
7941            }
7942            /* cache callback name lookup
7943             * (if not done yet, i.e. it's the first error) */
7944            if (known_errorHandler==-1) {
7945                if ((errors==NULL) || (!strcmp(errors, "strict")))
7946                    known_errorHandler = 1;
7947                else if (!strcmp(errors, "replace"))
7948                    known_errorHandler = 2;
7949                else if (!strcmp(errors, "ignore"))
7950                    known_errorHandler = 3;
7951                else if (!strcmp(errors, "xmlcharrefreplace"))
7952                    known_errorHandler = 4;
7953                else
7954                    known_errorHandler = 0;
7955            }
7956            switch (known_errorHandler) {
7957            case 1: /* strict */
7958                raise_translate_exception(&exc, input, collstart,
7959                                          collend, reason);
7960                goto onError;
7961            case 2: /* replace */
7962                /* No need to check for space, this is a 1:1 replacement */
7963                for (coll = collstart; coll<collend; coll++)
7964                    output[opos++] = '?';
7965                /* fall through */
7966            case 3: /* ignore */
7967                i = collend;
7968                break;
7969            case 4: /* xmlcharrefreplace */
7970                /* generate replacement (temporarily (mis)uses i) */
7971                for (i = collstart; i < collend; ++i) {
7972                    char buffer[2+29+1+1];
7973                    char *cp;
7974                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
7975                    if (charmaptranslate_makespace(&output, &osize,
7976                                                   opos+strlen(buffer)+(size-collend)))
7977                        goto onError;
7978                    for (cp = buffer; *cp; ++cp)
7979                        output[opos++] = *cp;
7980                }
7981                i = collend;
7982                break;
7983            default:
7984                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
7985                                                                 reason, input, &exc,
7986                                                                 collstart, collend, &newpos);
7987                if (repunicode == NULL || _PyUnicode_READY_REPLACE(&repunicode))
7988                    goto onError;
7989                /* generate replacement  */
7990                repsize = PyUnicode_GET_LENGTH(repunicode);
7991                if (charmaptranslate_makespace(&output, &osize,
7992                                               opos+repsize+(size-collend))) {
7993                    Py_DECREF(repunicode);
7994                    goto onError;
7995                }
7996                for (uni2 = 0; repsize-->0; ++uni2)
7997                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
7998                i = newpos;
7999                Py_DECREF(repunicode);
8000            }
8001        }
8002    }
8003    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8004    if (!res)
8005        goto onError;
8006    PyMem_Free(output);
8007    Py_XDECREF(exc);
8008    Py_XDECREF(errorHandler);
8009    return res;
8010
8011  onError:
8012    PyMem_Free(output);
8013    Py_XDECREF(exc);
8014    Py_XDECREF(errorHandler);
8015    return NULL;
8016}
8017
8018/* Deprecated. Use PyUnicode_Translate instead. */
8019PyObject *
8020PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8021                           Py_ssize_t size,
8022                           PyObject *mapping,
8023                           const char *errors)
8024{
8025    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8026    if (!unicode)
8027        return NULL;
8028    return _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8029}
8030
8031PyObject *
8032PyUnicode_Translate(PyObject *str,
8033                    PyObject *mapping,
8034                    const char *errors)
8035{
8036    PyObject *result;
8037
8038    str = PyUnicode_FromObject(str);
8039    if (str == NULL)
8040        goto onError;
8041    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8042    Py_DECREF(str);
8043    return result;
8044
8045  onError:
8046    Py_XDECREF(str);
8047    return NULL;
8048}
8049
8050static Py_UCS4
8051fix_decimal_and_space_to_ascii(PyUnicodeObject *self)
8052{
8053    /* No need to call PyUnicode_READY(self) because this function is only
8054       called as a callback from fixup() which does it already. */
8055    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8056    const int kind = PyUnicode_KIND(self);
8057    void *data = PyUnicode_DATA(self);
8058    Py_UCS4 maxchar = 0, ch, fixed;
8059    Py_ssize_t i;
8060
8061    for (i = 0; i < len; ++i) {
8062        ch = PyUnicode_READ(kind, data, i);
8063        fixed = 0;
8064        if (ch > 127) {
8065            if (Py_UNICODE_ISSPACE(ch))
8066                fixed = ' ';
8067            else {
8068                const int decimal = Py_UNICODE_TODECIMAL(ch);
8069                if (decimal >= 0)
8070                    fixed = '0' + decimal;
8071            }
8072            if (fixed != 0) {
8073                if (fixed > maxchar)
8074                    maxchar = fixed;
8075                PyUnicode_WRITE(kind, data, i, fixed);
8076            }
8077            else if (ch > maxchar)
8078                maxchar = ch;
8079        }
8080        else if (ch > maxchar)
8081            maxchar = ch;
8082    }
8083
8084    return maxchar;
8085}
8086
8087PyObject *
8088_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8089{
8090    if (!PyUnicode_Check(unicode)) {
8091        PyErr_BadInternalCall();
8092        return NULL;
8093    }
8094    if (PyUnicode_READY(unicode) == -1)
8095        return NULL;
8096    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8097        /* If the string is already ASCII, just return the same string */
8098        Py_INCREF(unicode);
8099        return unicode;
8100    }
8101    return fixup((PyUnicodeObject *)unicode, fix_decimal_and_space_to_ascii);
8102}
8103
8104PyObject *
8105PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8106                                  Py_ssize_t length)
8107{
8108    PyObject *result;
8109    Py_UNICODE *p; /* write pointer into result */
8110    Py_ssize_t i;
8111    /* Copy to a new string */
8112    result = (PyObject *)_PyUnicode_New(length);
8113    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
8114    if (result == NULL)
8115        return result;
8116    p = PyUnicode_AS_UNICODE(result);
8117    /* Iterate over code points */
8118    for (i = 0; i < length; i++) {
8119        Py_UNICODE ch =s[i];
8120        if (ch > 127) {
8121            int decimal = Py_UNICODE_TODECIMAL(ch);
8122            if (decimal >= 0)
8123                p[i] = '0' + decimal;
8124        }
8125    }
8126#ifndef DONT_MAKE_RESULT_READY
8127    if (_PyUnicode_READY_REPLACE(&result)) {
8128        Py_DECREF(result);
8129        return NULL;
8130    }
8131#endif
8132    return result;
8133}
8134/* --- Decimal Encoder ---------------------------------------------------- */
8135
8136int
8137PyUnicode_EncodeDecimal(Py_UNICODE *s,
8138                        Py_ssize_t length,
8139                        char *output,
8140                        const char *errors)
8141{
8142    Py_UNICODE *p, *end;
8143    PyObject *errorHandler = NULL;
8144    PyObject *exc = NULL;
8145    const char *encoding = "decimal";
8146    const char *reason = "invalid decimal Unicode string";
8147    /* the following variable is used for caching string comparisons
8148     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
8149    int known_errorHandler = -1;
8150
8151    if (output == NULL) {
8152        PyErr_BadArgument();
8153        return -1;
8154    }
8155
8156    p = s;
8157    end = s + length;
8158    while (p < end) {
8159        register Py_UNICODE ch = *p;
8160        int decimal;
8161        PyObject *repunicode;
8162        Py_ssize_t repsize;
8163        Py_ssize_t newpos;
8164        Py_UNICODE *uni2;
8165        Py_UNICODE *collstart;
8166        Py_UNICODE *collend;
8167
8168        if (Py_UNICODE_ISSPACE(ch)) {
8169            *output++ = ' ';
8170            ++p;
8171            continue;
8172        }
8173        decimal = Py_UNICODE_TODECIMAL(ch);
8174        if (decimal >= 0) {
8175            *output++ = '0' + decimal;
8176            ++p;
8177            continue;
8178        }
8179        if (0 < ch && ch < 256) {
8180            *output++ = (char)ch;
8181            ++p;
8182            continue;
8183        }
8184        /* All other characters are considered unencodable */
8185        collstart = p;
8186        collend = p+1;
8187        while (collend < end) {
8188            if ((0 < *collend && *collend < 256) ||
8189                !Py_UNICODE_ISSPACE(*collend) ||
8190                Py_UNICODE_TODECIMAL(*collend))
8191                break;
8192        }
8193        /* cache callback name lookup
8194         * (if not done yet, i.e. it's the first error) */
8195        if (known_errorHandler==-1) {
8196            if ((errors==NULL) || (!strcmp(errors, "strict")))
8197                known_errorHandler = 1;
8198            else if (!strcmp(errors, "replace"))
8199                known_errorHandler = 2;
8200            else if (!strcmp(errors, "ignore"))
8201                known_errorHandler = 3;
8202            else if (!strcmp(errors, "xmlcharrefreplace"))
8203                known_errorHandler = 4;
8204            else
8205                known_errorHandler = 0;
8206        }
8207        switch (known_errorHandler) {
8208        case 1: /* strict */
8209            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
8210            goto onError;
8211        case 2: /* replace */
8212            for (p = collstart; p < collend; ++p)
8213                *output++ = '?';
8214            /* fall through */
8215        case 3: /* ignore */
8216            p = collend;
8217            break;
8218        case 4: /* xmlcharrefreplace */
8219            /* generate replacement (temporarily (mis)uses p) */
8220            for (p = collstart; p < collend; ++p)
8221                output += sprintf(output, "&#%d;", (int)*p);
8222            p = collend;
8223            break;
8224        default:
8225            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
8226                                                          encoding, reason, s, length, &exc,
8227                                                          collstart-s, collend-s, &newpos);
8228            if (repunicode == NULL)
8229                goto onError;
8230            if (!PyUnicode_Check(repunicode)) {
8231                /* Byte results not supported, since they have no decimal property. */
8232                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
8233                Py_DECREF(repunicode);
8234                goto onError;
8235            }
8236            /* generate replacement  */
8237            repsize = PyUnicode_GET_SIZE(repunicode);
8238            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
8239                Py_UNICODE ch = *uni2;
8240                if (Py_UNICODE_ISSPACE(ch))
8241                    *output++ = ' ';
8242                else {
8243                    decimal = Py_UNICODE_TODECIMAL(ch);
8244                    if (decimal >= 0)
8245                        *output++ = '0' + decimal;
8246                    else if (0 < ch && ch < 256)
8247                        *output++ = (char)ch;
8248                    else {
8249                        Py_DECREF(repunicode);
8250                        raise_encode_exception(&exc, encoding,
8251                                               s, length, collstart-s, collend-s, reason);
8252                        goto onError;
8253                    }
8254                }
8255            }
8256            p = s + newpos;
8257            Py_DECREF(repunicode);
8258        }
8259    }
8260    /* 0-terminate the output string */
8261    *output++ = '\0';
8262    Py_XDECREF(exc);
8263    Py_XDECREF(errorHandler);
8264    return 0;
8265
8266  onError:
8267    Py_XDECREF(exc);
8268    Py_XDECREF(errorHandler);
8269    return -1;
8270}
8271
8272/* --- Helpers ------------------------------------------------------------ */
8273
8274#include "stringlib/ucs1lib.h"
8275#include "stringlib/fastsearch.h"
8276#include "stringlib/partition.h"
8277#include "stringlib/split.h"
8278#include "stringlib/count.h"
8279#include "stringlib/find.h"
8280#include "stringlib/localeutil.h"
8281#include "stringlib/undef.h"
8282
8283#include "stringlib/ucs2lib.h"
8284#include "stringlib/fastsearch.h"
8285#include "stringlib/partition.h"
8286#include "stringlib/split.h"
8287#include "stringlib/count.h"
8288#include "stringlib/find.h"
8289#include "stringlib/localeutil.h"
8290#include "stringlib/undef.h"
8291
8292#include "stringlib/ucs4lib.h"
8293#include "stringlib/fastsearch.h"
8294#include "stringlib/partition.h"
8295#include "stringlib/split.h"
8296#include "stringlib/count.h"
8297#include "stringlib/find.h"
8298#include "stringlib/localeutil.h"
8299#include "stringlib/undef.h"
8300
8301static Py_ssize_t
8302any_find_slice(Py_ssize_t Py_LOCAL_CALLBACK(ucs1)(const Py_UCS1*, Py_ssize_t,
8303                                  const Py_UCS1*, Py_ssize_t,
8304                                  Py_ssize_t, Py_ssize_t),
8305               Py_ssize_t Py_LOCAL_CALLBACK(ucs2)(const Py_UCS2*, Py_ssize_t,
8306                                  const Py_UCS2*, Py_ssize_t,
8307                                  Py_ssize_t, Py_ssize_t),
8308               Py_ssize_t Py_LOCAL_CALLBACK(ucs4)(const Py_UCS4*, Py_ssize_t,
8309                                  const Py_UCS4*, Py_ssize_t,
8310                                  Py_ssize_t, Py_ssize_t),
8311               PyObject* s1, PyObject* s2,
8312               Py_ssize_t start,
8313               Py_ssize_t end)
8314{
8315    int kind1, kind2, kind;
8316    void *buf1, *buf2;
8317    Py_ssize_t len1, len2, result;
8318
8319    kind1 = PyUnicode_KIND(s1);
8320    kind2 = PyUnicode_KIND(s2);
8321    kind = kind1 > kind2 ? kind1 : kind2;
8322    buf1 = PyUnicode_DATA(s1);
8323    buf2 = PyUnicode_DATA(s2);
8324    if (kind1 != kind)
8325        buf1 = _PyUnicode_AsKind(s1, kind);
8326    if (!buf1)
8327        return -2;
8328    if (kind2 != kind)
8329        buf2 = _PyUnicode_AsKind(s2, kind);
8330    if (!buf2) {
8331        if (kind1 != kind) PyMem_Free(buf1);
8332        return -2;
8333    }
8334    len1 = PyUnicode_GET_LENGTH(s1);
8335    len2 = PyUnicode_GET_LENGTH(s2);
8336
8337    switch(kind) {
8338    case PyUnicode_1BYTE_KIND:
8339        result = ucs1(buf1, len1, buf2, len2, start, end);
8340        break;
8341    case PyUnicode_2BYTE_KIND:
8342        result = ucs2(buf1, len1, buf2, len2, start, end);
8343        break;
8344    case PyUnicode_4BYTE_KIND:
8345        result = ucs4(buf1, len1, buf2, len2, start, end);
8346        break;
8347    default:
8348        assert(0); result = -2;
8349    }
8350
8351    if (kind1 != kind)
8352        PyMem_Free(buf1);
8353    if (kind2 != kind)
8354        PyMem_Free(buf2);
8355
8356    return result;
8357}
8358
8359Py_ssize_t
8360_PyUnicode_InsertThousandsGrouping(int kind, void *data,
8361                                   Py_ssize_t n_buffer,
8362                                   void *digits, Py_ssize_t n_digits,
8363                                   Py_ssize_t min_width,
8364                                   const char *grouping,
8365                                   const char *thousands_sep)
8366{
8367    switch(kind) {
8368    case PyUnicode_1BYTE_KIND:
8369        return _PyUnicode_ucs1_InsertThousandsGrouping(
8370            (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8371            min_width, grouping, thousands_sep);
8372    case PyUnicode_2BYTE_KIND:
8373        return _PyUnicode_ucs2_InsertThousandsGrouping(
8374            (Py_UCS2*)data, n_buffer, (Py_UCS2*)digits, n_digits,
8375            min_width, grouping, thousands_sep);
8376    case PyUnicode_4BYTE_KIND:
8377        return _PyUnicode_ucs4_InsertThousandsGrouping(
8378            (Py_UCS4*)data, n_buffer, (Py_UCS4*)digits, n_digits,
8379            min_width, grouping, thousands_sep);
8380    }
8381    assert(0);
8382    return -1;
8383}
8384
8385
8386#include "stringlib/unicodedefs.h"
8387#include "stringlib/fastsearch.h"
8388
8389#include "stringlib/count.h"
8390#include "stringlib/find.h"
8391
/* Helper macro to fix up start/end slice values against a length.

   end:   set to len when it is larger than len; a negative value has len
          added to it and is then set to 0 if it is still negative.
   start: a negative value has len added to it and is then set to 0 if it
          is still negative.  start is NOT capped at len, so after the
          macro runs it may still be greater than end or len.

   start and end are assigned to, so both must be modifiable lvalues.  The
   expansion consists of two bare if statements (there is no do/while
   wrapper) and every argument is substituted textually, len several
   times. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
8406
8407Py_ssize_t
8408PyUnicode_Count(PyObject *str,
8409                PyObject *substr,
8410                Py_ssize_t start,
8411                Py_ssize_t end)
8412{
8413    Py_ssize_t result;
8414    PyUnicodeObject* str_obj;
8415    PyUnicodeObject* sub_obj;
8416    int kind1, kind2, kind;
8417    void *buf1 = NULL, *buf2 = NULL;
8418    Py_ssize_t len1, len2;
8419
8420    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
8421    if (!str_obj || PyUnicode_READY(str_obj) == -1)
8422        return -1;
8423    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
8424    if (!sub_obj || PyUnicode_READY(sub_obj) == -1) {
8425        Py_DECREF(str_obj);
8426        return -1;
8427    }
8428
8429    kind1 = PyUnicode_KIND(str_obj);
8430    kind2 = PyUnicode_KIND(sub_obj);
8431    kind = kind1 > kind2 ? kind1 : kind2;
8432    buf1 = PyUnicode_DATA(str_obj);
8433    if (kind1 != kind)
8434        buf1 = _PyUnicode_AsKind((PyObject*)str_obj, kind);
8435    if (!buf1)
8436        goto onError;
8437    buf2 = PyUnicode_DATA(sub_obj);
8438    if (kind2 != kind)
8439        buf2 = _PyUnicode_AsKind((PyObject*)sub_obj, kind);
8440    if (!buf2)
8441        goto onError;
8442    len1 = PyUnicode_GET_LENGTH(str_obj);
8443    len2 = PyUnicode_GET_LENGTH(sub_obj);
8444
8445    ADJUST_INDICES(start, end, len1);
8446    switch(kind) {
8447    case PyUnicode_1BYTE_KIND:
8448        result = ucs1lib_count(
8449            ((Py_UCS1*)buf1) + start, end - start,
8450            buf2, len2, PY_SSIZE_T_MAX
8451            );
8452        break;
8453    case PyUnicode_2BYTE_KIND:
8454        result = ucs2lib_count(
8455            ((Py_UCS2*)buf1) + start, end - start,
8456            buf2, len2, PY_SSIZE_T_MAX
8457            );
8458        break;
8459    case PyUnicode_4BYTE_KIND:
8460        result = ucs4lib_count(
8461            ((Py_UCS4*)buf1) + start, end - start,
8462            buf2, len2, PY_SSIZE_T_MAX
8463            );
8464        break;
8465    default:
8466        assert(0); result = 0;
8467    }
8468
8469    Py_DECREF(sub_obj);
8470    Py_DECREF(str_obj);
8471
8472    if (kind1 != kind)
8473        PyMem_Free(buf1);
8474    if (kind2 != kind)
8475        PyMem_Free(buf2);
8476
8477    return result;
8478  onError:
8479    Py_DECREF(sub_obj);
8480    Py_DECREF(str_obj);
8481    if (kind1 != kind && buf1)
8482        PyMem_Free(buf1);
8483    if (kind2 != kind && buf2)
8484        PyMem_Free(buf2);
8485    return -1;
8486}
8487
8488Py_ssize_t
8489PyUnicode_Find(PyObject *str,
8490               PyObject *sub,
8491               Py_ssize_t start,
8492               Py_ssize_t end,
8493               int direction)
8494{
8495    Py_ssize_t result;
8496
8497    str = PyUnicode_FromObject(str);
8498    if (!str || PyUnicode_READY(str) == -1)
8499        return -2;
8500    sub = PyUnicode_FromObject(sub);
8501    if (!sub || PyUnicode_READY(sub) == -1) {
8502        Py_DECREF(str);
8503        return -2;
8504    }
8505
8506    if (direction > 0)
8507        result = any_find_slice(
8508            ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
8509            str, sub, start, end
8510            );
8511    else
8512        result = any_find_slice(
8513            ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
8514            str, sub, start, end
8515            );
8516
8517    Py_DECREF(str);
8518    Py_DECREF(sub);
8519
8520    return result;
8521}
8522
8523Py_ssize_t
8524PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
8525                   Py_ssize_t start, Py_ssize_t end,
8526                   int direction)
8527{
8528    char *result;
8529    int kind;
8530    if (PyUnicode_READY(str) == -1)
8531        return -2;
8532    if (start < 0 || end < 0) {
8533        PyErr_SetString(PyExc_IndexError, "string index out of range");
8534        return -2;
8535    }
8536    if (end > PyUnicode_GET_LENGTH(str))
8537        end = PyUnicode_GET_LENGTH(str);
8538    kind = PyUnicode_KIND(str);
8539    result = findchar(PyUnicode_1BYTE_DATA(str)
8540                      + PyUnicode_KIND_SIZE(kind, start),
8541                      kind,
8542                      end-start, ch, direction);
8543    if (!result)
8544        return -1;
8545    return (result-(char*)PyUnicode_DATA(str)) >> (kind-1);
8546}
8547
8548static int
8549tailmatch(PyUnicodeObject *self,
8550          PyUnicodeObject *substring,
8551          Py_ssize_t start,
8552          Py_ssize_t end,
8553          int direction)
8554{
8555    int kind_self;
8556    int kind_sub;
8557    void *data_self;
8558    void *data_sub;
8559    Py_ssize_t offset;
8560    Py_ssize_t i;
8561    Py_ssize_t end_sub;
8562
8563    if (PyUnicode_READY(self) == -1 ||
8564        PyUnicode_READY(substring) == -1)
8565        return 0;
8566
8567    if (PyUnicode_GET_LENGTH(substring) == 0)
8568        return 1;
8569
8570    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
8571    end -= PyUnicode_GET_LENGTH(substring);
8572    if (end < start)
8573        return 0;
8574
8575    kind_self = PyUnicode_KIND(self);
8576    data_self = PyUnicode_DATA(self);
8577    kind_sub = PyUnicode_KIND(substring);
8578    data_sub = PyUnicode_DATA(substring);
8579    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
8580
8581    if (direction > 0)
8582        offset = end;
8583    else
8584        offset = start;
8585
8586    if (PyUnicode_READ(kind_self, data_self, offset) ==
8587        PyUnicode_READ(kind_sub, data_sub, 0) &&
8588        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
8589        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
8590        /* If both are of the same kind, memcmp is sufficient */
8591        if (kind_self == kind_sub) {
8592            return ! memcmp((char *)data_self +
8593                                (offset * PyUnicode_CHARACTER_SIZE(substring)),
8594                            data_sub,
8595                            PyUnicode_GET_LENGTH(substring) *
8596                                PyUnicode_CHARACTER_SIZE(substring));
8597        }
8598        /* otherwise we have to compare each character by first accesing it */
8599        else {
8600            /* We do not need to compare 0 and len(substring)-1 because
8601               the if statement above ensured already that they are equal
8602               when we end up here. */
8603            // TODO: honor direction and do a forward or backwards search
8604            for (i = 1; i < end_sub; ++i) {
8605                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
8606                    PyUnicode_READ(kind_sub, data_sub, i))
8607                    return 0;
8608            }
8609            return 1;
8610        }
8611    }
8612
8613    return 0;
8614}
8615
8616Py_ssize_t
8617PyUnicode_Tailmatch(PyObject *str,
8618                    PyObject *substr,
8619                    Py_ssize_t start,
8620                    Py_ssize_t end,
8621                    int direction)
8622{
8623    Py_ssize_t result;
8624
8625    str = PyUnicode_FromObject(str);
8626    if (str == NULL)
8627        return -1;
8628    substr = PyUnicode_FromObject(substr);
8629    if (substr == NULL) {
8630        Py_DECREF(str);
8631        return -1;
8632    }
8633
8634    result = tailmatch((PyUnicodeObject *)str,
8635                       (PyUnicodeObject *)substr,
8636                       start, end, direction);
8637    Py_DECREF(str);
8638    Py_DECREF(substr);
8639    return result;
8640}
8641
8642/* Apply fixfct filter to the Unicode object self and return a
8643   reference to the modified object */
8644
8645static PyObject *
8646fixup(PyUnicodeObject *self,
8647      Py_UCS4 (*fixfct)(PyUnicodeObject *s))
8648{
8649    PyObject *u;
8650    Py_UCS4 maxchar_old, maxchar_new = 0;
8651
8652    if (PyUnicode_READY(self) == -1)
8653        return NULL;
8654    maxchar_old = PyUnicode_MAX_CHAR_VALUE(self);
8655    u = PyUnicode_New(PyUnicode_GET_LENGTH(self),
8656                      maxchar_old);
8657    if (u == NULL)
8658        return NULL;
8659
8660    Py_MEMCPY(PyUnicode_1BYTE_DATA(u), PyUnicode_1BYTE_DATA(self),
8661              PyUnicode_GET_LENGTH(u) * PyUnicode_CHARACTER_SIZE(u));
8662
8663    /* fix functions return the new maximum character in a string,
8664       if the kind of the resulting unicode object does not change,
8665       everything is fine.  Otherwise we need to change the string kind
8666       and re-run the fix function. */
8667    maxchar_new = fixfct((PyUnicodeObject*)u);
8668    if (maxchar_new == 0)
8669        /* do nothing, keep maxchar_new at 0 which means no changes. */;
8670    else if (maxchar_new <= 127)
8671        maxchar_new = 127;
8672    else if (maxchar_new <= 255)
8673        maxchar_new = 255;
8674    else if (maxchar_new <= 65535)
8675        maxchar_new = 65535;
8676    else
8677        maxchar_new = 1114111; /* 0x10ffff */
8678
8679    if (!maxchar_new && PyUnicode_CheckExact(self)) {
8680        /* fixfct should return TRUE if it modified the buffer. If
8681           FALSE, return a reference to the original buffer instead
8682           (to save space, not time) */
8683        Py_INCREF(self);
8684        Py_DECREF(u);
8685        return (PyObject*) self;
8686    }
8687    else if (maxchar_new == maxchar_old) {
8688        return u;
8689    }
8690    else {
8691        /* In case the maximum character changed, we need to
8692           convert the string to the new category. */
8693        PyObject *v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
8694        if (v == NULL) {
8695            Py_DECREF(u);
8696            return NULL;
8697        }
8698        if (maxchar_new > maxchar_old) {
8699            /* If the maxchar increased so that the kind changed, not all
8700               characters are representable anymore and we need to fix the
8701               string again. This only happens in very few cases. */
8702            if (PyUnicode_CopyCharacters(v, 0,
8703                                         (PyObject*)self, 0,
8704                                         PyUnicode_GET_LENGTH(self)) < 0)
8705            {
8706                Py_DECREF(u);
8707                return NULL;
8708            }
8709            maxchar_old = fixfct((PyUnicodeObject*)v);
8710            assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
8711        }
8712        else {
8713            if (PyUnicode_CopyCharacters(v, 0,
8714                                         u, 0,
8715                                         PyUnicode_GET_LENGTH(self)) < 0)
8716            {
8717                Py_DECREF(u);
8718                return NULL;
8719            }
8720        }
8721
8722        Py_DECREF(u);
8723        return v;
8724    }
8725}
8726
8727static Py_UCS4
8728fixupper(PyUnicodeObject *self)
8729{
8730    /* No need to call PyUnicode_READY(self) because this function is only
8731       called as a callback from fixup() which does it already. */
8732    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8733    const int kind = PyUnicode_KIND(self);
8734    void *data = PyUnicode_DATA(self);
8735    int touched = 0;
8736    Py_UCS4 maxchar = 0;
8737    Py_ssize_t i;
8738
8739    for (i = 0; i < len; ++i) {
8740        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8741        const Py_UCS4 up = Py_UNICODE_TOUPPER(ch);
8742        if (up != ch) {
8743            if (up > maxchar)
8744                maxchar = up;
8745            PyUnicode_WRITE(kind, data, i, up);
8746            touched = 1;
8747        }
8748        else if (ch > maxchar)
8749            maxchar = ch;
8750    }
8751
8752    if (touched)
8753        return maxchar;
8754    else
8755        return 0;
8756}
8757
8758static Py_UCS4
8759fixlower(PyUnicodeObject *self)
8760{
8761    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8762    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8763    const int kind = PyUnicode_KIND(self);
8764    void *data = PyUnicode_DATA(self);
8765    int touched = 0;
8766    Py_UCS4 maxchar = 0;
8767    Py_ssize_t i;
8768
8769    for(i = 0; i < len; ++i) {
8770        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8771        const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8772        if (lo != ch) {
8773            if (lo > maxchar)
8774                maxchar = lo;
8775            PyUnicode_WRITE(kind, data, i, lo);
8776            touched = 1;
8777        }
8778        else if (ch > maxchar)
8779            maxchar = ch;
8780    }
8781
8782    if (touched)
8783        return maxchar;
8784    else
8785        return 0;
8786}
8787
8788static Py_UCS4
8789fixswapcase(PyUnicodeObject *self)
8790{
8791    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8792    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8793    const int kind = PyUnicode_KIND(self);
8794    void *data = PyUnicode_DATA(self);
8795    int touched = 0;
8796    Py_UCS4 maxchar = 0;
8797    Py_ssize_t i;
8798
8799    for(i = 0; i < len; ++i) {
8800        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8801        Py_UCS4 nu = 0;
8802
8803        if (Py_UNICODE_ISUPPER(ch))
8804            nu = Py_UNICODE_TOLOWER(ch);
8805        else if (Py_UNICODE_ISLOWER(ch))
8806            nu = Py_UNICODE_TOUPPER(ch);
8807
8808        if (nu != 0) {
8809            if (nu > maxchar)
8810                maxchar = nu;
8811            PyUnicode_WRITE(kind, data, i, nu);
8812            touched = 1;
8813        }
8814        else if (ch > maxchar)
8815            maxchar = ch;
8816    }
8817
8818    if (touched)
8819        return maxchar;
8820    else
8821        return 0;
8822}
8823
8824static Py_UCS4
8825fixcapitalize(PyUnicodeObject *self)
8826{
8827    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8828    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8829    const int kind = PyUnicode_KIND(self);
8830    void *data = PyUnicode_DATA(self);
8831    int touched = 0;
8832    Py_UCS4 maxchar = 0;
8833    Py_ssize_t i = 0;
8834    Py_UCS4 ch;
8835
8836    if (len == 0)
8837        return 0;
8838
8839    ch = PyUnicode_READ(kind, data, i);
8840    if (!Py_UNICODE_ISUPPER(ch)) {
8841        maxchar = Py_UNICODE_TOUPPER(ch);
8842        PyUnicode_WRITE(kind, data, i, maxchar);
8843        touched = 1;
8844    }
8845    ++i;
8846    for(; i < len; ++i) {
8847        ch = PyUnicode_READ(kind, data, i);
8848        if (!Py_UNICODE_ISLOWER(ch)) {
8849            const Py_UCS4 lo = Py_UNICODE_TOLOWER(ch);
8850            if (lo > maxchar)
8851                maxchar = lo;
8852            PyUnicode_WRITE(kind, data, i, lo);
8853            touched = 1;
8854        }
8855        else if (ch > maxchar)
8856            maxchar = ch;
8857    }
8858
8859    if (touched)
8860        return maxchar;
8861    else
8862        return 0;
8863}
8864
8865static Py_UCS4
8866fixtitle(PyUnicodeObject *self)
8867{
8868    /* No need to call PyUnicode_READY(self) because fixup() which does it. */
8869    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8870    const int kind = PyUnicode_KIND(self);
8871    void *data = PyUnicode_DATA(self);
8872    Py_UCS4 maxchar = 0;
8873    Py_ssize_t i = 0;
8874    int previous_is_cased;
8875
8876    /* Shortcut for single character strings */
8877    if (len == 1) {
8878        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8879        const Py_UCS4 ti = Py_UNICODE_TOTITLE(ch);
8880        if (ti != ch) {
8881            PyUnicode_WRITE(kind, data, i, ti);
8882            return ti;
8883        }
8884        else
8885            return 0;
8886    }
8887    previous_is_cased = 0;
8888    for(; i < len; ++i) {
8889        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
8890        Py_UCS4 nu;
8891
8892        if (previous_is_cased)
8893            nu = Py_UNICODE_TOLOWER(ch);
8894        else
8895            nu = Py_UNICODE_TOTITLE(ch);
8896
8897        if (nu > maxchar)
8898            maxchar = nu;
8899        PyUnicode_WRITE(kind, data, i, nu);
8900
8901        if (Py_UNICODE_ISLOWER(ch) ||
8902            Py_UNICODE_ISUPPER(ch) ||
8903            Py_UNICODE_ISTITLE(ch))
8904            previous_is_cased = 1;
8905        else
8906            previous_is_cased = 0;
8907    }
8908    return maxchar;
8909}
8910
8911PyObject *
8912PyUnicode_Join(PyObject *separator, PyObject *seq)
8913{
8914    PyObject *sep = NULL;
8915    Py_ssize_t seplen = 1;
8916    PyObject *res = NULL; /* the result */
8917    PyObject *fseq;          /* PySequence_Fast(seq) */
8918    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
8919    PyObject **items;
8920    PyObject *item;
8921    Py_ssize_t sz, i, res_offset;
8922    Py_UCS4 maxchar = 0;
8923    Py_UCS4 item_maxchar;
8924
8925    fseq = PySequence_Fast(seq, "");
8926    if (fseq == NULL) {
8927        return NULL;
8928    }
8929
8930    /* NOTE: the following code can't call back into Python code,
8931     * so we are sure that fseq won't be mutated.
8932     */
8933
8934    seqlen = PySequence_Fast_GET_SIZE(fseq);
8935    /* If empty sequence, return u"". */
8936    if (seqlen == 0) {
8937        res = PyUnicode_New(0, 0);
8938        goto Done;
8939    }
8940    items = PySequence_Fast_ITEMS(fseq);
8941    /* If singleton sequence with an exact Unicode, return that. */
8942    if (seqlen == 1) {
8943        item = items[0];
8944        if (PyUnicode_CheckExact(item)) {
8945            Py_INCREF(item);
8946            res = item;
8947            goto Done;
8948        }
8949    }
8950    else {
8951        /* Set up sep and seplen */
8952        if (separator == NULL) {
8953            /* fall back to a blank space separator */
8954            sep = PyUnicode_FromOrdinal(' ');
8955            if (!sep)
8956                goto onError;
8957        }
8958        else {
8959            if (!PyUnicode_Check(separator)) {
8960                PyErr_Format(PyExc_TypeError,
8961                             "separator: expected str instance,"
8962                             " %.80s found",
8963                             Py_TYPE(separator)->tp_name);
8964                goto onError;
8965            }
8966            if (PyUnicode_READY(separator))
8967                goto onError;
8968            sep = separator;
8969            seplen = PyUnicode_GET_LENGTH(separator);
8970            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
8971            /* inc refcount to keep this code path symetric with the
8972               above case of a blank separator */
8973            Py_INCREF(sep);
8974        }
8975    }
8976
8977    /* There are at least two things to join, or else we have a subclass
8978     * of str in the sequence.
8979     * Do a pre-pass to figure out the total amount of space we'll
8980     * need (sz), and see whether all argument are strings.
8981     */
8982    sz = 0;
8983    for (i = 0; i < seqlen; i++) {
8984        const Py_ssize_t old_sz = sz;
8985        item = items[i];
8986        if (!PyUnicode_Check(item)) {
8987            PyErr_Format(PyExc_TypeError,
8988                         "sequence item %zd: expected str instance,"
8989                         " %.80s found",
8990                         i, Py_TYPE(item)->tp_name);
8991            goto onError;
8992        }
8993        if (PyUnicode_READY(item) == -1)
8994            goto onError;
8995        sz += PyUnicode_GET_LENGTH(item);
8996        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
8997        if (item_maxchar > maxchar)
8998            maxchar = item_maxchar;
8999        if (i != 0)
9000            sz += seplen;
9001        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9002            PyErr_SetString(PyExc_OverflowError,
9003                            "join() result is too long for a Python string");
9004            goto onError;
9005        }
9006    }
9007
9008    res = PyUnicode_New(sz, maxchar);
9009    if (res == NULL)
9010        goto onError;
9011
9012    /* Catenate everything. */
9013    for (i = 0, res_offset = 0; i < seqlen; ++i) {
9014        Py_ssize_t itemlen, copied;
9015        item = items[i];
9016        /* Copy item, and maybe the separator. */
9017        if (i && seplen != 0) {
9018            copied = PyUnicode_CopyCharacters(res, res_offset,
9019                                              sep, 0, seplen);
9020            if (copied < 0)
9021                goto onError;
9022#ifdef Py_DEBUG
9023            res_offset += copied;
9024#else
9025            res_offset += seplen;
9026#endif
9027        }
9028        itemlen = PyUnicode_GET_LENGTH(item);
9029        if (itemlen != 0) {
9030            copied = PyUnicode_CopyCharacters(res, res_offset,
9031                                              item, 0, itemlen);
9032            if (copied < 0)
9033                goto onError;
9034#ifdef Py_DEBUG
9035            res_offset += copied;
9036#else
9037            res_offset += itemlen;
9038#endif
9039        }
9040    }
9041    assert(res_offset == PyUnicode_GET_LENGTH(res));
9042
9043  Done:
9044    Py_DECREF(fseq);
9045    Py_XDECREF(sep);
9046    return res;
9047
9048  onError:
9049    Py_DECREF(fseq);
9050    Py_XDECREF(sep);
9051    Py_XDECREF(res);
9052    return NULL;
9053}
9054
/* FILL(kind, data, value, start, length)
 *
 * Store the code point `value` into `length` consecutive characters of the
 * character buffer `data`, beginning at character index `start`.  `kind`
 * selects the element width: PyUnicode_1BYTE_KIND uses memset() on bytes,
 * PyUnicode_2BYTE_KIND writes Py_UCS2 elements, and every other kind is
 * written as Py_UCS4 elements.  PyUnicode_WCHAR_KIND is rejected by an
 * assertion.
 *
 * Every argument is parenthesized at each use so that compound expressions
 * (for example `cond ? a : b`) expand correctly.  `value` and `length` may
 * still be evaluated more than once, so they must not have side effects.
 */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        } \
    } while (0)
9077
9078static PyUnicodeObject *
9079pad(PyUnicodeObject *self,
9080    Py_ssize_t left,
9081    Py_ssize_t right,
9082    Py_UCS4 fill)
9083{
9084    PyObject *u;
9085    Py_UCS4 maxchar;
9086    int kind;
9087    void *data;
9088
9089    if (left < 0)
9090        left = 0;
9091    if (right < 0)
9092        right = 0;
9093
9094    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
9095        Py_INCREF(self);
9096        return self;
9097    }
9098
9099    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9100        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9101        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9102        return NULL;
9103    }
9104    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9105    if (fill > maxchar)
9106        maxchar = fill;
9107    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9108    if (!u)
9109        return NULL;
9110
9111    kind = PyUnicode_KIND(u);
9112    data = PyUnicode_DATA(u);
9113    if (left)
9114        FILL(kind, data, fill, 0, left);
9115    if (right)
9116        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9117    if (PyUnicode_CopyCharacters(u, left,
9118                                 (PyObject*)self, 0,
9119                                 _PyUnicode_LENGTH(self)) < 0)
9120    {
9121        Py_DECREF(u);
9122        return NULL;
9123    }
9124
9125    return (PyUnicodeObject*)u;
9126}
9127#undef FILL
9128
9129PyObject *
9130PyUnicode_Splitlines(PyObject *string, int keepends)
9131{
9132    PyObject *list;
9133
9134    string = PyUnicode_FromObject(string);
9135    if (string == NULL || PyUnicode_READY(string) == -1)
9136        return NULL;
9137
9138    switch(PyUnicode_KIND(string)) {
9139    case PyUnicode_1BYTE_KIND:
9140        list = ucs1lib_splitlines(
9141            (PyObject*) string, PyUnicode_1BYTE_DATA(string),
9142            PyUnicode_GET_LENGTH(string), keepends);
9143        break;
9144    case PyUnicode_2BYTE_KIND:
9145        list = ucs2lib_splitlines(
9146            (PyObject*) string, PyUnicode_2BYTE_DATA(string),
9147            PyUnicode_GET_LENGTH(string), keepends);
9148        break;
9149    case PyUnicode_4BYTE_KIND:
9150        list = ucs4lib_splitlines(
9151            (PyObject*) string, PyUnicode_4BYTE_DATA(string),
9152            PyUnicode_GET_LENGTH(string), keepends);
9153        break;
9154    default:
9155        assert(0);
9156        list = 0;
9157    }
9158    Py_DECREF(string);
9159    return list;
9160}
9161
9162static PyObject *
9163split(PyUnicodeObject *self,
9164      PyUnicodeObject *substring,
9165      Py_ssize_t maxcount)
9166{
9167    int kind1, kind2, kind;
9168    void *buf1, *buf2;
9169    Py_ssize_t len1, len2;
9170    PyObject* out;
9171
9172    if (maxcount < 0)
9173        maxcount = PY_SSIZE_T_MAX;
9174
9175    if (PyUnicode_READY(self) == -1)
9176        return NULL;
9177
9178    if (substring == NULL)
9179        switch(PyUnicode_KIND(self)) {
9180        case PyUnicode_1BYTE_KIND:
9181            return ucs1lib_split_whitespace(
9182                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9183                PyUnicode_GET_LENGTH(self), maxcount
9184                );
9185        case PyUnicode_2BYTE_KIND:
9186            return ucs2lib_split_whitespace(
9187                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9188                PyUnicode_GET_LENGTH(self), maxcount
9189                );
9190        case PyUnicode_4BYTE_KIND:
9191            return ucs4lib_split_whitespace(
9192                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9193                PyUnicode_GET_LENGTH(self), maxcount
9194                );
9195        default:
9196            assert(0);
9197            return NULL;
9198        }
9199
9200    if (PyUnicode_READY(substring) == -1)
9201        return NULL;
9202
9203    kind1 = PyUnicode_KIND(self);
9204    kind2 = PyUnicode_KIND(substring);
9205    kind = kind1 > kind2 ? kind1 : kind2;
9206    buf1 = PyUnicode_DATA(self);
9207    buf2 = PyUnicode_DATA(substring);
9208    if (kind1 != kind)
9209        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9210    if (!buf1)
9211        return NULL;
9212    if (kind2 != kind)
9213        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9214    if (!buf2) {
9215        if (kind1 != kind) PyMem_Free(buf1);
9216        return NULL;
9217    }
9218    len1 = PyUnicode_GET_LENGTH(self);
9219    len2 = PyUnicode_GET_LENGTH(substring);
9220
9221    switch(kind) {
9222    case PyUnicode_1BYTE_KIND:
9223        out = ucs1lib_split(
9224            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9225        break;
9226    case PyUnicode_2BYTE_KIND:
9227        out = ucs2lib_split(
9228            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9229        break;
9230    case PyUnicode_4BYTE_KIND:
9231        out = ucs4lib_split(
9232            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9233        break;
9234    default:
9235        out = NULL;
9236    }
9237    if (kind1 != kind)
9238        PyMem_Free(buf1);
9239    if (kind2 != kind)
9240        PyMem_Free(buf2);
9241    return out;
9242}
9243
9244static PyObject *
9245rsplit(PyUnicodeObject *self,
9246       PyUnicodeObject *substring,
9247       Py_ssize_t maxcount)
9248{
9249    int kind1, kind2, kind;
9250    void *buf1, *buf2;
9251    Py_ssize_t len1, len2;
9252    PyObject* out;
9253
9254    if (maxcount < 0)
9255        maxcount = PY_SSIZE_T_MAX;
9256
9257    if (PyUnicode_READY(self) == -1)
9258        return NULL;
9259
9260    if (substring == NULL)
9261        switch(PyUnicode_KIND(self)) {
9262        case PyUnicode_1BYTE_KIND:
9263            return ucs1lib_rsplit_whitespace(
9264                (PyObject*) self,  PyUnicode_1BYTE_DATA(self),
9265                PyUnicode_GET_LENGTH(self), maxcount
9266                );
9267        case PyUnicode_2BYTE_KIND:
9268            return ucs2lib_rsplit_whitespace(
9269                (PyObject*) self,  PyUnicode_2BYTE_DATA(self),
9270                PyUnicode_GET_LENGTH(self), maxcount
9271                );
9272        case PyUnicode_4BYTE_KIND:
9273            return ucs4lib_rsplit_whitespace(
9274                (PyObject*) self,  PyUnicode_4BYTE_DATA(self),
9275                PyUnicode_GET_LENGTH(self), maxcount
9276                );
9277        default:
9278            assert(0);
9279            return NULL;
9280        }
9281
9282    if (PyUnicode_READY(substring) == -1)
9283        return NULL;
9284
9285    kind1 = PyUnicode_KIND(self);
9286    kind2 = PyUnicode_KIND(substring);
9287    kind = kind1 > kind2 ? kind1 : kind2;
9288    buf1 = PyUnicode_DATA(self);
9289    buf2 = PyUnicode_DATA(substring);
9290    if (kind1 != kind)
9291        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
9292    if (!buf1)
9293        return NULL;
9294    if (kind2 != kind)
9295        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
9296    if (!buf2) {
9297        if (kind1 != kind) PyMem_Free(buf1);
9298        return NULL;
9299    }
9300    len1 = PyUnicode_GET_LENGTH(self);
9301    len2 = PyUnicode_GET_LENGTH(substring);
9302
9303    switch(kind) {
9304    case PyUnicode_1BYTE_KIND:
9305        out = ucs1lib_rsplit(
9306            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9307        break;
9308    case PyUnicode_2BYTE_KIND:
9309        out = ucs2lib_rsplit(
9310            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9311        break;
9312    case PyUnicode_4BYTE_KIND:
9313        out = ucs4lib_rsplit(
9314            (PyObject*) self,  buf1, len1, buf2, len2, maxcount);
9315        break;
9316    default:
9317        out = NULL;
9318    }
9319    if (kind1 != kind)
9320        PyMem_Free(buf1);
9321    if (kind2 != kind)
9322        PyMem_Free(buf2);
9323    return out;
9324}
9325
9326static Py_ssize_t
9327anylib_find(int kind, void *buf1, Py_ssize_t len1,
9328            void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9329{
9330    switch(kind) {
9331    case PyUnicode_1BYTE_KIND:
9332        return ucs1lib_find(buf1, len1, buf2, len2, offset);
9333    case PyUnicode_2BYTE_KIND:
9334        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9335    case PyUnicode_4BYTE_KIND:
9336        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9337    }
9338    assert(0);
9339    return -1;
9340}
9341
9342static Py_ssize_t
9343anylib_count(int kind, void* sbuf, Py_ssize_t slen,
9344             void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9345{
9346        switch(kind) {
9347        case PyUnicode_1BYTE_KIND:
9348            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
9349        case PyUnicode_2BYTE_KIND:
9350            return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
9351        case PyUnicode_4BYTE_KIND:
9352            return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
9353        }
9354        assert(0);
9355        return 0;
9356}
9357
9358static PyObject *
9359replace(PyObject *self, PyObject *str1,
9360        PyObject *str2, Py_ssize_t maxcount)
9361{
9362    PyObject *u;
9363    char *sbuf = PyUnicode_DATA(self);
9364    char *buf1 = PyUnicode_DATA(str1);
9365    char *buf2 = PyUnicode_DATA(str2);
9366    int srelease = 0, release1 = 0, release2 = 0;
9367    int skind = PyUnicode_KIND(self);
9368    int kind1 = PyUnicode_KIND(str1);
9369    int kind2 = PyUnicode_KIND(str2);
9370    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
9371    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
9372    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
9373
9374    if (maxcount < 0)
9375        maxcount = PY_SSIZE_T_MAX;
9376    else if (maxcount == 0 || slen == 0)
9377        goto nothing;
9378
9379    if (skind < kind1)
9380        /* substring too wide to be present */
9381        goto nothing;
9382
9383    if (len1 == len2) {
9384        Py_ssize_t i;
9385        /* same length */
9386        if (len1 == 0)
9387            goto nothing;
9388        if (len1 == 1) {
9389            /* replace characters */
9390            Py_UCS4 u1, u2, maxchar;
9391            int mayshrink, rkind;
9392            u1 = PyUnicode_READ_CHAR(str1, 0);
9393            if (!findchar(sbuf, PyUnicode_KIND(self),
9394                          slen, u1, 1))
9395                goto nothing;
9396            u2 = PyUnicode_READ_CHAR(str2, 0);
9397            maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9398            /* Replacing u1 with u2 may cause a maxchar reduction in the
9399               result string. */
9400            if (u2 > maxchar) {
9401                maxchar = u2;
9402                mayshrink = 0;
9403            }
9404            else
9405                mayshrink = maxchar > 127;
9406            u = PyUnicode_New(slen, maxchar);
9407            if (!u)
9408                goto error;
9409            if (PyUnicode_CopyCharacters(u, 0,
9410                                         (PyObject*)self, 0, slen) < 0)
9411            {
9412                Py_DECREF(u);
9413                return NULL;
9414            }
9415            rkind = PyUnicode_KIND(u);
9416            for (i = 0; i < PyUnicode_GET_LENGTH(u); i++)
9417                if (PyUnicode_READ(rkind, PyUnicode_DATA(u), i) == u1) {
9418                    if (--maxcount < 0)
9419                        break;
9420                    PyUnicode_WRITE(rkind, PyUnicode_DATA(u), i, u2);
9421                }
9422            if (mayshrink) {
9423                PyObject *tmp = u;
9424                u = PyUnicode_FromKindAndData(rkind, PyUnicode_DATA(tmp),
9425                                              PyUnicode_GET_LENGTH(tmp));
9426                Py_DECREF(tmp);
9427            }
9428        } else {
9429            int rkind = skind;
9430            char *res;
9431            if (kind1 < rkind) {
9432                /* widen substring */
9433                buf1 = _PyUnicode_AsKind(str1, rkind);
9434                if (!buf1) goto error;
9435                release1 = 1;
9436            }
9437            i = anylib_find(rkind, sbuf, slen, buf1, len1, 0);
9438            if (i < 0)
9439                goto nothing;
9440            if (rkind > kind2) {
9441                /* widen replacement */
9442                buf2 = _PyUnicode_AsKind(str2, rkind);
9443                if (!buf2) goto error;
9444                release2 = 1;
9445            }
9446            else if (rkind < kind2) {
9447                /* widen self and buf1 */
9448                rkind = kind2;
9449                if (release1) PyMem_Free(buf1);
9450                sbuf = _PyUnicode_AsKind(self, rkind);
9451                if (!sbuf) goto error;
9452                srelease = 1;
9453                buf1 = _PyUnicode_AsKind(str1, rkind);
9454                if (!buf1) goto error;
9455                release1 = 1;
9456            }
9457            res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, slen));
9458            if (!res) {
9459                PyErr_NoMemory();
9460                goto error;
9461            }
9462            memcpy(res, sbuf, PyUnicode_KIND_SIZE(rkind, slen));
9463            /* change everything in-place, starting with this one */
9464            memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9465                   buf2,
9466                   PyUnicode_KIND_SIZE(rkind, len2));
9467            i += len1;
9468
9469            while ( --maxcount > 0) {
9470                i = anylib_find(rkind, sbuf+PyUnicode_KIND_SIZE(rkind, i),
9471                                slen-i,
9472                                buf1, len1, i);
9473                if (i == -1)
9474                    break;
9475                memcpy(res + PyUnicode_KIND_SIZE(rkind, i),
9476                       buf2,
9477                       PyUnicode_KIND_SIZE(rkind, len2));
9478                i += len1;
9479            }
9480
9481            u = PyUnicode_FromKindAndData(rkind, res, slen);
9482            PyMem_Free(res);
9483            if (!u) goto error;
9484        }
9485    } else {
9486
9487        Py_ssize_t n, i, j, ires;
9488        Py_ssize_t product, new_size;
9489        int rkind = skind;
9490        char *res;
9491
9492        if (kind1 < rkind) {
9493            buf1 = _PyUnicode_AsKind(str1, rkind);
9494            if (!buf1) goto error;
9495            release1 = 1;
9496        }
9497        n = anylib_count(rkind, sbuf, slen, buf1, len1, maxcount);
9498        if (n == 0)
9499            goto nothing;
9500        if (kind2 < rkind) {
9501            buf2 = _PyUnicode_AsKind(str2, rkind);
9502            if (!buf2) goto error;
9503            release2 = 1;
9504        }
9505        else if (kind2 > rkind) {
9506            rkind = kind2;
9507            sbuf = _PyUnicode_AsKind(self, rkind);
9508            if (!sbuf) goto error;
9509            srelease = 1;
9510            if (release1) PyMem_Free(buf1);
9511            buf1 = _PyUnicode_AsKind(str1, rkind);
9512            if (!buf1) goto error;
9513            release1 = 1;
9514        }
9515        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
9516           PyUnicode_GET_LENGTH(str1))); */
9517        product = n * (len2-len1);
9518        if ((product / (len2-len1)) != n) {
9519                PyErr_SetString(PyExc_OverflowError,
9520                                "replace string is too long");
9521                goto error;
9522        }
9523        new_size = slen + product;
9524        if (new_size < 0 || new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
9525            PyErr_SetString(PyExc_OverflowError,
9526                            "replace string is too long");
9527            goto error;
9528        }
9529        res = PyMem_Malloc(PyUnicode_KIND_SIZE(rkind, new_size));
9530        if (!res)
9531            goto error;
9532        ires = i = 0;
9533        if (len1 > 0) {
9534            while (n-- > 0) {
9535                /* look for next match */
9536                j = anylib_find(rkind,
9537                                sbuf + PyUnicode_KIND_SIZE(rkind, i),
9538                                slen-i, buf1, len1, i);
9539                if (j == -1)
9540                    break;
9541                else if (j > i) {
9542                    /* copy unchanged part [i:j] */
9543                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9544                           sbuf + PyUnicode_KIND_SIZE(rkind, i),
9545                           PyUnicode_KIND_SIZE(rkind, j-i));
9546                    ires += j - i;
9547                }
9548                /* copy substitution string */
9549                if (len2 > 0) {
9550                    memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9551                           buf2,
9552                           PyUnicode_KIND_SIZE(rkind, len2));
9553                    ires += len2;
9554                }
9555                i = j + len1;
9556            }
9557            if (i < slen)
9558                /* copy tail [i:] */
9559                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9560                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9561                       PyUnicode_KIND_SIZE(rkind, slen-i));
9562        } else {
9563            /* interleave */
9564            while (n > 0) {
9565                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9566                       buf2,
9567                       PyUnicode_KIND_SIZE(rkind, len2));
9568                ires += len2;
9569                if (--n <= 0)
9570                    break;
9571                memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9572                       sbuf + PyUnicode_KIND_SIZE(rkind, i),
9573                       PyUnicode_KIND_SIZE(rkind, 1));
9574                ires++;
9575                i++;
9576            }
9577            memcpy(res + PyUnicode_KIND_SIZE(rkind, ires),
9578                   sbuf + PyUnicode_KIND_SIZE(rkind, i),
9579                   PyUnicode_KIND_SIZE(rkind, slen-i));
9580        }
9581        u = PyUnicode_FromKindAndData(rkind, res, new_size);
9582        PyMem_Free(res);
9583    }
9584    if (srelease)
9585        PyMem_FREE(sbuf);
9586    if (release1)
9587        PyMem_FREE(buf1);
9588    if (release2)
9589        PyMem_FREE(buf2);
9590    return u;
9591
9592  nothing:
9593    /* nothing to replace; return original string (when possible) */
9594    if (srelease)
9595        PyMem_FREE(sbuf);
9596    if (release1)
9597        PyMem_FREE(buf1);
9598    if (release2)
9599        PyMem_FREE(buf2);
9600    if (PyUnicode_CheckExact(self)) {
9601        Py_INCREF(self);
9602        return (PyObject *) self;
9603    }
9604    return PyUnicode_Copy(self);
9605  error:
9606    if (srelease && sbuf)
9607        PyMem_FREE(sbuf);
9608    if (release1 && buf1)
9609        PyMem_FREE(buf1);
9610    if (release2 && buf2)
9611        PyMem_FREE(buf2);
9612    return NULL;
9613}
9614
9615/* --- Unicode Object Methods --------------------------------------------- */
9616
9617PyDoc_STRVAR(title__doc__,
9618             "S.title() -> str\n\
9619\n\
9620Return a titlecased version of S, i.e. words start with title case\n\
9621characters, all remaining cased characters have lower case.");
9622
9623static PyObject*
9624unicode_title(PyUnicodeObject *self)
9625{
9626    return fixup(self, fixtitle);
9627}
9628
9629PyDoc_STRVAR(capitalize__doc__,
9630             "S.capitalize() -> str\n\
9631\n\
9632Return a capitalized version of S, i.e. make the first character\n\
9633have upper case and the rest lower case.");
9634
9635static PyObject*
9636unicode_capitalize(PyUnicodeObject *self)
9637{
9638    return fixup(self, fixcapitalize);
9639}
9640
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled (compiled out) implementation of a capwords() method:
   split self on whitespace, run each word through fixup() with the
   fixcapitalize callback, then join the words again with
   PyUnicode_Join(NULL, ...).  Returns NULL if any step fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Break the string into its words. */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Swap every word for its capitalized form. */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);
        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Glue the words back together. */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
9678
9679/* Argument converter.  Coerces to a single unicode character */
9680
9681static int
9682convert_uc(PyObject *obj, void *addr)
9683{
9684    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
9685    PyObject *uniobj;
9686
9687    uniobj = PyUnicode_FromObject(obj);
9688    if (uniobj == NULL) {
9689        PyErr_SetString(PyExc_TypeError,
9690                        "The fill character cannot be converted to Unicode");
9691        return 0;
9692    }
9693    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
9694        PyErr_SetString(PyExc_TypeError,
9695                        "The fill character must be exactly one character long");
9696        Py_DECREF(uniobj);
9697        return 0;
9698    }
9699    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
9700    Py_DECREF(uniobj);
9701    return 1;
9702}
9703
9704PyDoc_STRVAR(center__doc__,
9705             "S.center(width[, fillchar]) -> str\n\
9706\n\
9707Return S centered in a string of length width. Padding is\n\
9708done using the specified fill character (default is a space)");
9709
9710static PyObject *
9711unicode_center(PyUnicodeObject *self, PyObject *args)
9712{
9713    Py_ssize_t marg, left;
9714    Py_ssize_t width;
9715    Py_UCS4 fillchar = ' ';
9716
9717    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
9718        return NULL;
9719
9720    if (PyUnicode_READY(self) == -1)
9721        return NULL;
9722
9723    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
9724        Py_INCREF(self);
9725        return (PyObject*) self;
9726    }
9727
9728    marg = width - _PyUnicode_LENGTH(self);
9729    left = marg / 2 + (marg & width & 1);
9730
9731    return (PyObject*) pad(self, left, marg - left, fillchar);
9732}
9733
9734#if 0
9735
9736/* This code should go into some future Unicode collation support
9737   module. The basic comparison should compare ordinals on a naive
9738   basis (this is what Java does and thus Jython too). */
9739
9740/* speedy UTF-16 code point order comparison */
9741/* gleaned from: */
9742/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
9743
9744static short utf16Fixup[32] =
9745{
9746    0, 0, 0, 0, 0, 0, 0, 0,
9747    0, 0, 0, 0, 0, 0, 0, 0,
9748    0, 0, 0, 0, 0, 0, 0, 0,
9749    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
9750};
9751
9752static int
9753unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9754{
9755    Py_ssize_t len1, len2;
9756
9757    Py_UNICODE *s1 = str1->str;
9758    Py_UNICODE *s2 = str2->str;
9759
9760    len1 = str1->_base._base.length;
9761    len2 = str2->_base._base.length;
9762
9763    while (len1 > 0 && len2 > 0) {
9764        Py_UNICODE c1, c2;
9765
9766        c1 = *s1++;
9767        c2 = *s2++;
9768
9769        if (c1 > (1<<11) * 26)
9770            c1 += utf16Fixup[c1>>11];
9771        if (c2 > (1<<11) * 26)
9772            c2 += utf16Fixup[c2>>11];
9773        /* now c1 and c2 are in UTF-32-compatible order */
9774
9775        if (c1 != c2)
9776            return (c1 < c2) ? -1 : 1;
9777
9778        len1--; len2--;
9779    }
9780
9781    return (len1 < len2) ? -1 : (len1 != len2);
9782}
9783
9784#else
9785
9786/* This function assumes that str1 and str2 are readied by the caller. */
9787
9788static int
9789unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
9790{
9791    int kind1, kind2;
9792    void *data1, *data2;
9793    Py_ssize_t len1, len2, i;
9794
9795    kind1 = PyUnicode_KIND(str1);
9796    kind2 = PyUnicode_KIND(str2);
9797    data1 = PyUnicode_DATA(str1);
9798    data2 = PyUnicode_DATA(str2);
9799    len1 = PyUnicode_GET_LENGTH(str1);
9800    len2 = PyUnicode_GET_LENGTH(str2);
9801
9802    for (i = 0; i < len1 && i < len2; ++i) {
9803        Py_UCS4 c1, c2;
9804        c1 = PyUnicode_READ(kind1, data1, i);
9805        c2 = PyUnicode_READ(kind2, data2, i);
9806
9807        if (c1 != c2)
9808            return (c1 < c2) ? -1 : 1;
9809    }
9810
9811    return (len1 < len2) ? -1 : (len1 != len2);
9812}
9813
9814#endif
9815
9816int
9817PyUnicode_Compare(PyObject *left, PyObject *right)
9818{
9819    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9820        if (PyUnicode_READY(left) == -1 ||
9821            PyUnicode_READY(right) == -1)
9822            return -1;
9823        return unicode_compare((PyUnicodeObject *)left,
9824                               (PyUnicodeObject *)right);
9825    }
9826    PyErr_Format(PyExc_TypeError,
9827                 "Can't compare %.100s and %.100s",
9828                 left->ob_type->tp_name,
9829                 right->ob_type->tp_name);
9830    return -1;
9831}
9832
9833int
9834PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
9835{
9836    Py_ssize_t i;
9837    int kind;
9838    void *data;
9839    Py_UCS4 chr;
9840
9841    assert(_PyUnicode_CHECK(uni));
9842    if (PyUnicode_READY(uni) == -1)
9843        return -1;
9844    kind = PyUnicode_KIND(uni);
9845    data = PyUnicode_DATA(uni);
9846    /* Compare Unicode string and source character set string */
9847    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
9848        if (chr != str[i])
9849            return (chr < (unsigned char)(str[i])) ? -1 : 1;
9850    /* This check keeps Python strings that end in '\0' from comparing equal
9851     to C strings identical up to that point. */
9852    if (PyUnicode_GET_LENGTH(uni) != i || chr)
9853        return 1; /* uni is longer */
9854    if (str[i])
9855        return -1; /* str is longer */
9856    return 0;
9857}
9858
9859
9860#define TEST_COND(cond)                         \
9861    ((cond) ? Py_True : Py_False)
9862
9863PyObject *
9864PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
9865{
9866    int result;
9867
9868    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
9869        PyObject *v;
9870        if (PyUnicode_READY(left) == -1 ||
9871            PyUnicode_READY(right) == -1)
9872            return NULL;
9873        if (PyUnicode_GET_LENGTH(left) != PyUnicode_GET_LENGTH(right) ||
9874            PyUnicode_KIND(left) != PyUnicode_KIND(right)) {
9875            if (op == Py_EQ) {
9876                Py_INCREF(Py_False);
9877                return Py_False;
9878            }
9879            if (op == Py_NE) {
9880                Py_INCREF(Py_True);
9881                return Py_True;
9882            }
9883        }
9884        if (left == right)
9885            result = 0;
9886        else
9887            result = unicode_compare((PyUnicodeObject *)left,
9888                                     (PyUnicodeObject *)right);
9889
9890        /* Convert the return value to a Boolean */
9891        switch (op) {
9892        case Py_EQ:
9893            v = TEST_COND(result == 0);
9894            break;
9895        case Py_NE:
9896            v = TEST_COND(result != 0);
9897            break;
9898        case Py_LE:
9899            v = TEST_COND(result <= 0);
9900            break;
9901        case Py_GE:
9902            v = TEST_COND(result >= 0);
9903            break;
9904        case Py_LT:
9905            v = TEST_COND(result == -1);
9906            break;
9907        case Py_GT:
9908            v = TEST_COND(result == 1);
9909            break;
9910        default:
9911            PyErr_BadArgument();
9912            return NULL;
9913        }
9914        Py_INCREF(v);
9915        return v;
9916    }
9917
9918    Py_RETURN_NOTIMPLEMENTED;
9919}
9920
9921int
9922PyUnicode_Contains(PyObject *container, PyObject *element)
9923{
9924    PyObject *str, *sub;
9925    int kind1, kind2, kind;
9926    void *buf1, *buf2;
9927    Py_ssize_t len1, len2;
9928    int result;
9929
9930    /* Coerce the two arguments */
9931    sub = PyUnicode_FromObject(element);
9932    if (!sub) {
9933        PyErr_Format(PyExc_TypeError,
9934                     "'in <string>' requires string as left operand, not %s",
9935                     element->ob_type->tp_name);
9936        return -1;
9937    }
9938    if (PyUnicode_READY(sub) == -1)
9939        return -1;
9940
9941    str = PyUnicode_FromObject(container);
9942    if (!str || PyUnicode_READY(str) == -1) {
9943        Py_DECREF(sub);
9944        return -1;
9945    }
9946
9947    kind1 = PyUnicode_KIND(str);
9948    kind2 = PyUnicode_KIND(sub);
9949    kind = kind1 > kind2 ? kind1 : kind2;
9950    buf1 = PyUnicode_DATA(str);
9951    buf2 = PyUnicode_DATA(sub);
9952    if (kind1 != kind)
9953        buf1 = _PyUnicode_AsKind((PyObject*)str, kind);
9954    if (!buf1) {
9955        Py_DECREF(sub);
9956        return -1;
9957    }
9958    if (kind2 != kind)
9959        buf2 = _PyUnicode_AsKind((PyObject*)sub, kind);
9960    if (!buf2) {
9961        Py_DECREF(sub);
9962        if (kind1 != kind) PyMem_Free(buf1);
9963        return -1;
9964    }
9965    len1 = PyUnicode_GET_LENGTH(str);
9966    len2 = PyUnicode_GET_LENGTH(sub);
9967
9968    switch(kind) {
9969    case PyUnicode_1BYTE_KIND:
9970        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
9971        break;
9972    case PyUnicode_2BYTE_KIND:
9973        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
9974        break;
9975    case PyUnicode_4BYTE_KIND:
9976        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
9977        break;
9978    default:
9979        result = -1;
9980        assert(0);
9981    }
9982
9983    Py_DECREF(str);
9984    Py_DECREF(sub);
9985
9986    if (kind1 != kind)
9987        PyMem_Free(buf1);
9988    if (kind2 != kind)
9989        PyMem_Free(buf2);
9990
9991    return result;
9992}
9993
9994/* Concat to string or Unicode object giving a new Unicode object. */
9995
9996PyObject *
9997PyUnicode_Concat(PyObject *left, PyObject *right)
9998{
9999    PyObject *u = NULL, *v = NULL, *w;
10000    Py_UCS4 maxchar;
10001
10002    /* Coerce the two arguments */
10003    u = PyUnicode_FromObject(left);
10004    if (u == NULL)
10005        goto onError;
10006    v = PyUnicode_FromObject(right);
10007    if (v == NULL)
10008        goto onError;
10009
10010    /* Shortcuts */
10011    if (v == unicode_empty) {
10012        Py_DECREF(v);
10013        return u;
10014    }
10015    if (u == unicode_empty) {
10016        Py_DECREF(u);
10017        return v;
10018    }
10019
10020    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10021    maxchar = Py_MAX(maxchar, PyUnicode_MAX_CHAR_VALUE(v));
10022
10023    /* Concat the two Unicode strings */
10024    w = PyUnicode_New(
10025        PyUnicode_GET_LENGTH(u) + PyUnicode_GET_LENGTH(v),
10026        maxchar);
10027    if (w == NULL)
10028        goto onError;
10029    if (PyUnicode_CopyCharacters(w, 0, u, 0, PyUnicode_GET_LENGTH(u)) < 0)
10030        goto onError;
10031    if (PyUnicode_CopyCharacters(w, PyUnicode_GET_LENGTH(u),
10032                                 v, 0,
10033                                 PyUnicode_GET_LENGTH(v)) < 0)
10034        goto onError;
10035    Py_DECREF(u);
10036    Py_DECREF(v);
10037    return w;
10038
10039  onError:
10040    Py_XDECREF(u);
10041    Py_XDECREF(v);
10042    return NULL;
10043}
10044
10045static void
10046unicode_append_inplace(PyObject **p_left, PyObject *right)
10047{
10048    Py_ssize_t left_len, right_len, new_len;
10049#ifdef Py_DEBUG
10050    Py_ssize_t copied;
10051#endif
10052
10053    assert(PyUnicode_IS_READY(*p_left));
10054    assert(PyUnicode_IS_READY(right));
10055
10056    left_len = PyUnicode_GET_LENGTH(*p_left);
10057    right_len = PyUnicode_GET_LENGTH(right);
10058    if (left_len > PY_SSIZE_T_MAX - right_len) {
10059        PyErr_SetString(PyExc_OverflowError,
10060                        "strings are too large to concat");
10061        goto error;
10062    }
10063    new_len = left_len + right_len;
10064
10065    /* Now we own the last reference to 'left', so we can resize it
10066     * in-place.
10067     */
10068    if (unicode_resize(p_left, new_len) != 0) {
10069        /* XXX if _PyUnicode_Resize() fails, 'left' has been
10070         * deallocated so it cannot be put back into
10071         * 'variable'.  The MemoryError is raised when there
10072         * is no value in 'variable', which might (very
10073         * remotely) be a cause of incompatibilities.
10074         */
10075        goto error;
10076    }
10077    /* copy 'right' into the newly allocated area of 'left' */
10078#ifdef Py_DEBUG
10079    copied = PyUnicode_CopyCharacters(*p_left, left_len,
10080                                      right, 0,
10081                                      right_len);
10082    assert(0 <= copied);
10083#else
10084    PyUnicode_CopyCharacters(*p_left, left_len, right, 0, right_len);
10085#endif
10086    return;
10087
10088error:
10089    Py_DECREF(*p_left);
10090    *p_left = NULL;
10091}
10092
10093void
10094PyUnicode_Append(PyObject **p_left, PyObject *right)
10095{
10096    PyObject *left, *res;
10097
10098    if (p_left == NULL) {
10099        if (!PyErr_Occurred())
10100            PyErr_BadInternalCall();
10101        return;
10102    }
10103    left = *p_left;
10104    if (right == NULL || !PyUnicode_Check(left)) {
10105        if (!PyErr_Occurred())
10106            PyErr_BadInternalCall();
10107        goto error;
10108    }
10109
10110    if (PyUnicode_READY(left))
10111        goto error;
10112    if (PyUnicode_READY(right))
10113        goto error;
10114
10115    if (PyUnicode_CheckExact(left) && left != unicode_empty
10116        && PyUnicode_CheckExact(right) && right != unicode_empty
10117        && unicode_resizable(left)
10118        && (_PyUnicode_KIND(right) <= _PyUnicode_KIND(left)
10119            || _PyUnicode_WSTR(left) != NULL))
10120    {
10121        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10122           to change the structure size, but characters are stored just after
10123           the structure, and so it requires to move all charactres which is
10124           not so different than duplicating the string. */
10125        if (!(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10126        {
10127            unicode_append_inplace(p_left, right);
10128            return;
10129        }
10130    }
10131
10132    res = PyUnicode_Concat(left, right);
10133    if (res == NULL)
10134        goto error;
10135    Py_DECREF(left);
10136    *p_left = res;
10137    return;
10138
10139error:
10140    Py_DECREF(*p_left);
10141    *p_left = NULL;
10142}
10143
10144void
10145PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10146{
10147    PyUnicode_Append(pleft, right);
10148    Py_XDECREF(right);
10149}
10150
10151PyDoc_STRVAR(count__doc__,
10152             "S.count(sub[, start[, end]]) -> int\n\
10153\n\
10154Return the number of non-overlapping occurrences of substring sub in\n\
10155string S[start:end].  Optional arguments start and end are\n\
10156interpreted as in slice notation.");
10157
10158static PyObject *
10159unicode_count(PyUnicodeObject *self, PyObject *args)
10160{
10161    PyUnicodeObject *substring;
10162    Py_ssize_t start = 0;
10163    Py_ssize_t end = PY_SSIZE_T_MAX;
10164    PyObject *result;
10165    int kind1, kind2, kind;
10166    void *buf1, *buf2;
10167    Py_ssize_t len1, len2, iresult;
10168
10169    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10170                                            &start, &end))
10171        return NULL;
10172
10173    kind1 = PyUnicode_KIND(self);
10174    kind2 = PyUnicode_KIND(substring);
10175    kind = kind1 > kind2 ? kind1 : kind2;
10176    buf1 = PyUnicode_DATA(self);
10177    buf2 = PyUnicode_DATA(substring);
10178    if (kind1 != kind)
10179        buf1 = _PyUnicode_AsKind((PyObject*)self, kind);
10180    if (!buf1) {
10181        Py_DECREF(substring);
10182        return NULL;
10183    }
10184    if (kind2 != kind)
10185        buf2 = _PyUnicode_AsKind((PyObject*)substring, kind);
10186    if (!buf2) {
10187        Py_DECREF(substring);
10188        if (kind1 != kind) PyMem_Free(buf1);
10189        return NULL;
10190    }
10191    len1 = PyUnicode_GET_LENGTH(self);
10192    len2 = PyUnicode_GET_LENGTH(substring);
10193
10194    ADJUST_INDICES(start, end, len1);
10195    switch(kind) {
10196    case PyUnicode_1BYTE_KIND:
10197        iresult = ucs1lib_count(
10198            ((Py_UCS1*)buf1) + start, end - start,
10199            buf2, len2, PY_SSIZE_T_MAX
10200            );
10201        break;
10202    case PyUnicode_2BYTE_KIND:
10203        iresult = ucs2lib_count(
10204            ((Py_UCS2*)buf1) + start, end - start,
10205            buf2, len2, PY_SSIZE_T_MAX
10206            );
10207        break;
10208    case PyUnicode_4BYTE_KIND:
10209        iresult = ucs4lib_count(
10210            ((Py_UCS4*)buf1) + start, end - start,
10211            buf2, len2, PY_SSIZE_T_MAX
10212            );
10213        break;
10214    default:
10215        assert(0); iresult = 0;
10216    }
10217
10218    result = PyLong_FromSsize_t(iresult);
10219
10220    if (kind1 != kind)
10221        PyMem_Free(buf1);
10222    if (kind2 != kind)
10223        PyMem_Free(buf2);
10224
10225    Py_DECREF(substring);
10226
10227    return result;
10228}
10229
10230PyDoc_STRVAR(encode__doc__,
10231             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10232\n\
10233Encode S using the codec registered for encoding. Default encoding\n\
10234is 'utf-8'. errors may be given to set a different error\n\
10235handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10236a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10237'xmlcharrefreplace' as well as any other name registered with\n\
10238codecs.register_error that can handle UnicodeEncodeErrors.");
10239
10240static PyObject *
10241unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
10242{
10243    static char *kwlist[] = {"encoding", "errors", 0};
10244    char *encoding = NULL;
10245    char *errors = NULL;
10246
10247    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10248                                     kwlist, &encoding, &errors))
10249        return NULL;
10250    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
10251}
10252
10253PyDoc_STRVAR(expandtabs__doc__,
10254             "S.expandtabs([tabsize]) -> str\n\
10255\n\
10256Return a copy of S where all tab characters are expanded using spaces.\n\
10257If tabsize is not given, a tab size of 8 characters is assumed.");
10258
10259static PyObject*
10260unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
10261{
10262    Py_ssize_t i, j, line_pos, src_len, incr;
10263    Py_UCS4 ch;
10264    PyObject *u;
10265    void *src_data, *dest_data;
10266    int tabsize = 8;
10267    int kind;
10268    int found;
10269
10270    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10271        return NULL;
10272
10273    if (PyUnicode_READY(self) == -1)
10274        return NULL;
10275
10276    /* First pass: determine size of output string */
10277    src_len = PyUnicode_GET_LENGTH(self);
10278    i = j = line_pos = 0;
10279    kind = PyUnicode_KIND(self);
10280    src_data = PyUnicode_DATA(self);
10281    found = 0;
10282    for (; i < src_len; i++) {
10283        ch = PyUnicode_READ(kind, src_data, i);
10284        if (ch == '\t') {
10285            found = 1;
10286            if (tabsize > 0) {
10287                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10288                if (j > PY_SSIZE_T_MAX - incr)
10289                    goto overflow;
10290                line_pos += incr;
10291                j += incr;
10292            }
10293        }
10294        else {
10295            if (j > PY_SSIZE_T_MAX - 1)
10296                goto overflow;
10297            line_pos++;
10298            j++;
10299            if (ch == '\n' || ch == '\r')
10300                line_pos = 0;
10301        }
10302    }
10303    if (!found && PyUnicode_CheckExact(self)) {
10304        Py_INCREF((PyObject *) self);
10305        return (PyObject *) self;
10306    }
10307
10308    /* Second pass: create output string and fill it */
10309    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
10310    if (!u)
10311        return NULL;
10312    dest_data = PyUnicode_DATA(u);
10313
10314    i = j = line_pos = 0;
10315
10316    for (; i < src_len; i++) {
10317        ch = PyUnicode_READ(kind, src_data, i);
10318        if (ch == '\t') {
10319            if (tabsize > 0) {
10320                incr = tabsize - (line_pos % tabsize);
10321                line_pos += incr;
10322                while (incr--) {
10323                    PyUnicode_WRITE(kind, dest_data, j, ' ');
10324                    j++;
10325                }
10326            }
10327        }
10328        else {
10329            line_pos++;
10330            PyUnicode_WRITE(kind, dest_data, j, ch);
10331            j++;
10332            if (ch == '\n' || ch == '\r')
10333                line_pos = 0;
10334        }
10335    }
10336    assert (j == PyUnicode_GET_LENGTH(u));
10337#ifndef DONT_MAKE_RESULT_READY
10338    if (_PyUnicode_READY_REPLACE(&u)) {
10339        Py_DECREF(u);
10340        return NULL;
10341    }
10342#endif
10343    return (PyObject*) u;
10344
10345  overflow:
10346    PyErr_SetString(PyExc_OverflowError, "new string is too long");
10347    return NULL;
10348}
10349
10350PyDoc_STRVAR(find__doc__,
10351             "S.find(sub[, start[, end]]) -> int\n\
10352\n\
10353Return the lowest index in S where substring sub is found,\n\
10354such that sub is contained within S[start:end].  Optional\n\
10355arguments start and end are interpreted as in slice notation.\n\
10356\n\
10357Return -1 on failure.");
10358
10359static PyObject *
10360unicode_find(PyObject *self, PyObject *args)
10361{
10362    PyUnicodeObject *substring;
10363    Py_ssize_t start;
10364    Py_ssize_t end;
10365    Py_ssize_t result;
10366
10367    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
10368                                            &start, &end))
10369        return NULL;
10370
10371    if (PyUnicode_READY(self) == -1)
10372        return NULL;
10373    if (PyUnicode_READY(substring) == -1)
10374        return NULL;
10375
10376    result = any_find_slice(
10377        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10378        self, (PyObject*)substring, start, end
10379        );
10380
10381    Py_DECREF(substring);
10382
10383    if (result == -2)
10384        return NULL;
10385
10386    return PyLong_FromSsize_t(result);
10387}
10388
10389static PyObject *
10390unicode_getitem(PyObject *self, Py_ssize_t index)
10391{
10392    Py_UCS4 ch = PyUnicode_ReadChar(self, index);
10393    if (ch == (Py_UCS4)-1)
10394        return NULL;
10395    return PyUnicode_FromOrdinal(ch);
10396}
10397
10398/* Believe it or not, this produces the same value for ASCII strings
10399   as bytes_hash(). */
10400static Py_hash_t
10401unicode_hash(PyUnicodeObject *self)
10402{
10403    Py_ssize_t len;
10404    Py_uhash_t x;
10405
10406    if (_PyUnicode_HASH(self) != -1)
10407        return _PyUnicode_HASH(self);
10408    if (PyUnicode_READY(self) == -1)
10409        return -1;
10410    len = PyUnicode_GET_LENGTH(self);
10411
10412    /* The hash function as a macro, gets expanded three times below. */
10413#define HASH(P) \
10414    x = (Py_uhash_t)*P << 7; \
10415    while (--len >= 0) \
10416        x = (1000003*x) ^ (Py_uhash_t)*P++;
10417
10418    switch (PyUnicode_KIND(self)) {
10419    case PyUnicode_1BYTE_KIND: {
10420        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
10421        HASH(c);
10422        break;
10423    }
10424    case PyUnicode_2BYTE_KIND: {
10425        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
10426        HASH(s);
10427        break;
10428    }
10429    default: {
10430        Py_UCS4 *l;
10431        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
10432               "Impossible switch case in unicode_hash");
10433        l = PyUnicode_4BYTE_DATA(self);
10434        HASH(l);
10435        break;
10436    }
10437    }
10438    x ^= (Py_uhash_t)PyUnicode_GET_LENGTH(self);
10439
10440    if (x == -1)
10441        x = -2;
10442    _PyUnicode_HASH(self) = x;
10443    return x;
10444}
10445#undef HASH
10446
10447PyDoc_STRVAR(index__doc__,
10448             "S.index(sub[, start[, end]]) -> int\n\
10449\n\
10450Like S.find() but raise ValueError when the substring is not found.");
10451
10452static PyObject *
10453unicode_index(PyObject *self, PyObject *args)
10454{
10455    Py_ssize_t result;
10456    PyUnicodeObject *substring;
10457    Py_ssize_t start;
10458    Py_ssize_t end;
10459
10460    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
10461                                            &start, &end))
10462        return NULL;
10463
10464    if (PyUnicode_READY(self) == -1)
10465        return NULL;
10466    if (PyUnicode_READY(substring) == -1)
10467        return NULL;
10468
10469    result = any_find_slice(
10470        ucs1lib_find_slice, ucs2lib_find_slice, ucs4lib_find_slice,
10471        self, (PyObject*)substring, start, end
10472        );
10473
10474    Py_DECREF(substring);
10475
10476    if (result == -2)
10477        return NULL;
10478
10479    if (result < 0) {
10480        PyErr_SetString(PyExc_ValueError, "substring not found");
10481        return NULL;
10482    }
10483
10484    return PyLong_FromSsize_t(result);
10485}
10486
10487PyDoc_STRVAR(islower__doc__,
10488             "S.islower() -> bool\n\
10489\n\
10490Return True if all cased characters in S are lowercase and there is\n\
10491at least one cased character in S, False otherwise.");
10492
10493static PyObject*
10494unicode_islower(PyUnicodeObject *self)
10495{
10496    Py_ssize_t i, length;
10497    int kind;
10498    void *data;
10499    int cased;
10500
10501    if (PyUnicode_READY(self) == -1)
10502        return NULL;
10503    length = PyUnicode_GET_LENGTH(self);
10504    kind = PyUnicode_KIND(self);
10505    data = PyUnicode_DATA(self);
10506
10507    /* Shortcut for single character strings */
10508    if (length == 1)
10509        return PyBool_FromLong(
10510            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
10511
10512    /* Special case for empty strings */
10513    if (length == 0)
10514        return PyBool_FromLong(0);
10515
10516    cased = 0;
10517    for (i = 0; i < length; i++) {
10518        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10519
10520        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
10521            return PyBool_FromLong(0);
10522        else if (!cased && Py_UNICODE_ISLOWER(ch))
10523            cased = 1;
10524    }
10525    return PyBool_FromLong(cased);
10526}
10527
10528PyDoc_STRVAR(isupper__doc__,
10529             "S.isupper() -> bool\n\
10530\n\
10531Return True if all cased characters in S are uppercase and there is\n\
10532at least one cased character in S, False otherwise.");
10533
10534static PyObject*
10535unicode_isupper(PyUnicodeObject *self)
10536{
10537    Py_ssize_t i, length;
10538    int kind;
10539    void *data;
10540    int cased;
10541
10542    if (PyUnicode_READY(self) == -1)
10543        return NULL;
10544    length = PyUnicode_GET_LENGTH(self);
10545    kind = PyUnicode_KIND(self);
10546    data = PyUnicode_DATA(self);
10547
10548    /* Shortcut for single character strings */
10549    if (length == 1)
10550        return PyBool_FromLong(
10551            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
10552
10553    /* Special case for empty strings */
10554    if (length == 0)
10555        return PyBool_FromLong(0);
10556
10557    cased = 0;
10558    for (i = 0; i < length; i++) {
10559        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10560
10561        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
10562            return PyBool_FromLong(0);
10563        else if (!cased && Py_UNICODE_ISUPPER(ch))
10564            cased = 1;
10565    }
10566    return PyBool_FromLong(cased);
10567}
10568
10569PyDoc_STRVAR(istitle__doc__,
10570             "S.istitle() -> bool\n\
10571\n\
10572Return True if S is a titlecased string and there is at least one\n\
10573character in S, i.e. upper- and titlecase characters may only\n\
10574follow uncased characters and lowercase characters only cased ones.\n\
10575Return False otherwise.");
10576
10577static PyObject*
10578unicode_istitle(PyUnicodeObject *self)
10579{
10580    Py_ssize_t i, length;
10581    int kind;
10582    void *data;
10583    int cased, previous_is_cased;
10584
10585    if (PyUnicode_READY(self) == -1)
10586        return NULL;
10587    length = PyUnicode_GET_LENGTH(self);
10588    kind = PyUnicode_KIND(self);
10589    data = PyUnicode_DATA(self);
10590
10591    /* Shortcut for single character strings */
10592    if (length == 1) {
10593        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10594        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
10595                               (Py_UNICODE_ISUPPER(ch) != 0));
10596    }
10597
10598    /* Special case for empty strings */
10599    if (length == 0)
10600        return PyBool_FromLong(0);
10601
10602    cased = 0;
10603    previous_is_cased = 0;
10604    for (i = 0; i < length; i++) {
10605        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10606
10607        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
10608            if (previous_is_cased)
10609                return PyBool_FromLong(0);
10610            previous_is_cased = 1;
10611            cased = 1;
10612        }
10613        else if (Py_UNICODE_ISLOWER(ch)) {
10614            if (!previous_is_cased)
10615                return PyBool_FromLong(0);
10616            previous_is_cased = 1;
10617            cased = 1;
10618        }
10619        else
10620            previous_is_cased = 0;
10621    }
10622    return PyBool_FromLong(cased);
10623}
10624
10625PyDoc_STRVAR(isspace__doc__,
10626             "S.isspace() -> bool\n\
10627\n\
10628Return True if all characters in S are whitespace\n\
10629and there is at least one character in S, False otherwise.");
10630
10631static PyObject*
10632unicode_isspace(PyUnicodeObject *self)
10633{
10634    Py_ssize_t i, length;
10635    int kind;
10636    void *data;
10637
10638    if (PyUnicode_READY(self) == -1)
10639        return NULL;
10640    length = PyUnicode_GET_LENGTH(self);
10641    kind = PyUnicode_KIND(self);
10642    data = PyUnicode_DATA(self);
10643
10644    /* Shortcut for single character strings */
10645    if (length == 1)
10646        return PyBool_FromLong(
10647            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
10648
10649    /* Special case for empty strings */
10650    if (length == 0)
10651        return PyBool_FromLong(0);
10652
10653    for (i = 0; i < length; i++) {
10654        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10655        if (!Py_UNICODE_ISSPACE(ch))
10656            return PyBool_FromLong(0);
10657    }
10658    return PyBool_FromLong(1);
10659}
10660
10661PyDoc_STRVAR(isalpha__doc__,
10662             "S.isalpha() -> bool\n\
10663\n\
10664Return True if all characters in S are alphabetic\n\
10665and there is at least one character in S, False otherwise.");
10666
10667static PyObject*
10668unicode_isalpha(PyUnicodeObject *self)
10669{
10670    Py_ssize_t i, length;
10671    int kind;
10672    void *data;
10673
10674    if (PyUnicode_READY(self) == -1)
10675        return NULL;
10676    length = PyUnicode_GET_LENGTH(self);
10677    kind = PyUnicode_KIND(self);
10678    data = PyUnicode_DATA(self);
10679
10680    /* Shortcut for single character strings */
10681    if (length == 1)
10682        return PyBool_FromLong(
10683            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
10684
10685    /* Special case for empty strings */
10686    if (length == 0)
10687        return PyBool_FromLong(0);
10688
10689    for (i = 0; i < length; i++) {
10690        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
10691            return PyBool_FromLong(0);
10692    }
10693    return PyBool_FromLong(1);
10694}
10695
10696PyDoc_STRVAR(isalnum__doc__,
10697             "S.isalnum() -> bool\n\
10698\n\
10699Return True if all characters in S are alphanumeric\n\
10700and there is at least one character in S, False otherwise.");
10701
10702static PyObject*
10703unicode_isalnum(PyUnicodeObject *self)
10704{
10705    int kind;
10706    void *data;
10707    Py_ssize_t len, i;
10708
10709    if (PyUnicode_READY(self) == -1)
10710        return NULL;
10711
10712    kind = PyUnicode_KIND(self);
10713    data = PyUnicode_DATA(self);
10714    len = PyUnicode_GET_LENGTH(self);
10715
10716    /* Shortcut for single character strings */
10717    if (len == 1) {
10718        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10719        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
10720    }
10721
10722    /* Special case for empty strings */
10723    if (len == 0)
10724        return PyBool_FromLong(0);
10725
10726    for (i = 0; i < len; i++) {
10727        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
10728        if (!Py_UNICODE_ISALNUM(ch))
10729            return PyBool_FromLong(0);
10730    }
10731    return PyBool_FromLong(1);
10732}
10733
10734PyDoc_STRVAR(isdecimal__doc__,
10735             "S.isdecimal() -> bool\n\
10736\n\
10737Return True if there are only decimal characters in S,\n\
10738False otherwise.");
10739
10740static PyObject*
10741unicode_isdecimal(PyUnicodeObject *self)
10742{
10743    Py_ssize_t i, length;
10744    int kind;
10745    void *data;
10746
10747    if (PyUnicode_READY(self) == -1)
10748        return NULL;
10749    length = PyUnicode_GET_LENGTH(self);
10750    kind = PyUnicode_KIND(self);
10751    data = PyUnicode_DATA(self);
10752
10753    /* Shortcut for single character strings */
10754    if (length == 1)
10755        return PyBool_FromLong(
10756            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
10757
10758    /* Special case for empty strings */
10759    if (length == 0)
10760        return PyBool_FromLong(0);
10761
10762    for (i = 0; i < length; i++) {
10763        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
10764            return PyBool_FromLong(0);
10765    }
10766    return PyBool_FromLong(1);
10767}
10768
10769PyDoc_STRVAR(isdigit__doc__,
10770             "S.isdigit() -> bool\n\
10771\n\
10772Return True if all characters in S are digits\n\
10773and there is at least one character in S, False otherwise.");
10774
10775static PyObject*
10776unicode_isdigit(PyUnicodeObject *self)
10777{
10778    Py_ssize_t i, length;
10779    int kind;
10780    void *data;
10781
10782    if (PyUnicode_READY(self) == -1)
10783        return NULL;
10784    length = PyUnicode_GET_LENGTH(self);
10785    kind = PyUnicode_KIND(self);
10786    data = PyUnicode_DATA(self);
10787
10788    /* Shortcut for single character strings */
10789    if (length == 1) {
10790        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
10791        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
10792    }
10793
10794    /* Special case for empty strings */
10795    if (length == 0)
10796        return PyBool_FromLong(0);
10797
10798    for (i = 0; i < length; i++) {
10799        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
10800            return PyBool_FromLong(0);
10801    }
10802    return PyBool_FromLong(1);
10803}
10804
10805PyDoc_STRVAR(isnumeric__doc__,
10806             "S.isnumeric() -> bool\n\
10807\n\
10808Return True if there are only numeric characters in S,\n\
10809False otherwise.");
10810
10811static PyObject*
10812unicode_isnumeric(PyUnicodeObject *self)
10813{
10814    Py_ssize_t i, length;
10815    int kind;
10816    void *data;
10817
10818    if (PyUnicode_READY(self) == -1)
10819        return NULL;
10820    length = PyUnicode_GET_LENGTH(self);
10821    kind = PyUnicode_KIND(self);
10822    data = PyUnicode_DATA(self);
10823
10824    /* Shortcut for single character strings */
10825    if (length == 1)
10826        return PyBool_FromLong(
10827            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
10828
10829    /* Special case for empty strings */
10830    if (length == 0)
10831        return PyBool_FromLong(0);
10832
10833    for (i = 0; i < length; i++) {
10834        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
10835            return PyBool_FromLong(0);
10836    }
10837    return PyBool_FromLong(1);
10838}
10839
10840int
10841PyUnicode_IsIdentifier(PyObject *self)
10842{
10843    int kind;
10844    void *data;
10845    Py_ssize_t i;
10846    Py_UCS4 first;
10847
10848    if (PyUnicode_READY(self) == -1) {
10849        Py_FatalError("identifier not ready");
10850        return 0;
10851    }
10852
10853    /* Special case for empty strings */
10854    if (PyUnicode_GET_LENGTH(self) == 0)
10855        return 0;
10856    kind = PyUnicode_KIND(self);
10857    data = PyUnicode_DATA(self);
10858
10859    /* PEP 3131 says that the first character must be in
10860       XID_Start and subsequent characters in XID_Continue,
10861       and for the ASCII range, the 2.x rules apply (i.e
10862       start with letters and underscore, continue with
10863       letters, digits, underscore). However, given the current
10864       definition of XID_Start and XID_Continue, it is sufficient
10865       to check just for these, except that _ must be allowed
10866       as starting an identifier.  */
10867    first = PyUnicode_READ(kind, data, 0);
10868    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
10869        return 0;
10870
10871    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
10872        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
10873            return 0;
10874    return 1;
10875}
10876
10877PyDoc_STRVAR(isidentifier__doc__,
10878             "S.isidentifier() -> bool\n\
10879\n\
10880Return True if S is a valid identifier according\n\
10881to the language definition.");
10882
10883static PyObject*
10884unicode_isidentifier(PyObject *self)
10885{
10886    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
10887}
10888
10889PyDoc_STRVAR(isprintable__doc__,
10890             "S.isprintable() -> bool\n\
10891\n\
10892Return True if all characters in S are considered\n\
10893printable in repr() or S is empty, False otherwise.");
10894
10895static PyObject*
10896unicode_isprintable(PyObject *self)
10897{
10898    Py_ssize_t i, length;
10899    int kind;
10900    void *data;
10901
10902    if (PyUnicode_READY(self) == -1)
10903        return NULL;
10904    length = PyUnicode_GET_LENGTH(self);
10905    kind = PyUnicode_KIND(self);
10906    data = PyUnicode_DATA(self);
10907
10908    /* Shortcut for single character strings */
10909    if (length == 1)
10910        return PyBool_FromLong(
10911            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
10912
10913    for (i = 0; i < length; i++) {
10914        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
10915            Py_RETURN_FALSE;
10916        }
10917    }
10918    Py_RETURN_TRUE;
10919}
10920
10921PyDoc_STRVAR(join__doc__,
10922             "S.join(iterable) -> str\n\
10923\n\
10924Return a string which is the concatenation of the strings in the\n\
10925iterable.  The separator between elements is S.");
10926
10927static PyObject*
10928unicode_join(PyObject *self, PyObject *data)
10929{
10930    return PyUnicode_Join(self, data);
10931}
10932
10933static Py_ssize_t
10934unicode_length(PyUnicodeObject *self)
10935{
10936    if (PyUnicode_READY(self) == -1)
10937        return -1;
10938    return PyUnicode_GET_LENGTH(self);
10939}
10940
10941PyDoc_STRVAR(ljust__doc__,
10942             "S.ljust(width[, fillchar]) -> str\n\
10943\n\
10944Return S left-justified in a Unicode string of length width. Padding is\n\
10945done using the specified fill character (default is a space).");
10946
10947static PyObject *
10948unicode_ljust(PyUnicodeObject *self, PyObject *args)
10949{
10950    Py_ssize_t width;
10951    Py_UCS4 fillchar = ' ';
10952
10953    if (PyUnicode_READY(self) == -1)
10954        return NULL;
10955
10956    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
10957        return NULL;
10958
10959    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
10960        Py_INCREF(self);
10961        return (PyObject*) self;
10962    }
10963
10964    return (PyObject*) pad(self, 0, width - _PyUnicode_LENGTH(self), fillchar);
10965}
10966
10967PyDoc_STRVAR(lower__doc__,
10968             "S.lower() -> str\n\
10969\n\
10970Return a copy of the string S converted to lowercase.");
10971
10972static PyObject*
10973unicode_lower(PyUnicodeObject *self)
10974{
10975    return fixup(self, fixlower);
10976}
10977
/* Strip-type selectors: which end(s) of the string the strip helpers trim.
   The values also serve as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple() format strings, one per strip type. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: skips the three-character "|O:" prefix of
   the format string, leaving "lstrip", "rstrip" or "strip". */
#define STRIPNAME(i) (stripformat[i]+3)
10986
10987/* externally visible for str.strip(unicode) */
10988PyObject *
10989_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
10990{
10991    void *data;
10992    int kind;
10993    Py_ssize_t i, j, len;
10994    BLOOM_MASK sepmask;
10995
10996    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
10997        return NULL;
10998
10999    kind = PyUnicode_KIND(self);
11000    data = PyUnicode_DATA(self);
11001    len = PyUnicode_GET_LENGTH(self);
11002    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11003                              PyUnicode_DATA(sepobj),
11004                              PyUnicode_GET_LENGTH(sepobj));
11005
11006    i = 0;
11007    if (striptype != RIGHTSTRIP) {
11008        while (i < len &&
11009               BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, i), sepobj)) {
11010            i++;
11011        }
11012    }
11013
11014    j = len;
11015    if (striptype != LEFTSTRIP) {
11016        do {
11017            j--;
11018        } while (j >= i &&
11019                 BLOOM_MEMBER(sepmask, PyUnicode_READ(kind, data, j), sepobj));
11020        j++;
11021    }
11022
11023    return PyUnicode_Substring((PyObject*)self, i, j);
11024}
11025
11026PyObject*
11027PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11028{
11029    unsigned char *data;
11030    int kind;
11031    Py_ssize_t length;
11032
11033    if (PyUnicode_READY(self) == -1)
11034        return NULL;
11035
11036    end = Py_MIN(end, PyUnicode_GET_LENGTH(self));
11037
11038    if (start == 0 && end == PyUnicode_GET_LENGTH(self))
11039    {
11040        if (PyUnicode_CheckExact(self)) {
11041            Py_INCREF(self);
11042            return self;
11043        }
11044        else
11045            return PyUnicode_Copy(self);
11046    }
11047
11048    length = end - start;
11049    if (length == 1)
11050        return unicode_getitem(self, start);
11051
11052    if (start < 0 || end < 0) {
11053        PyErr_SetString(PyExc_IndexError, "string index out of range");
11054        return NULL;
11055    }
11056
11057    if (PyUnicode_IS_ASCII(self)) {
11058        kind = PyUnicode_KIND(self);
11059        data = PyUnicode_1BYTE_DATA(self);
11060        return unicode_fromascii(data + start, length);
11061    }
11062    else {
11063        kind = PyUnicode_KIND(self);
11064        data = PyUnicode_1BYTE_DATA(self);
11065        return PyUnicode_FromKindAndData(kind,
11066                                         data + PyUnicode_KIND_SIZE(kind, start),
11067                                         length);
11068    }
11069}
11070
11071static PyObject *
11072do_strip(PyUnicodeObject *self, int striptype)
11073{
11074    int kind;
11075    void *data;
11076    Py_ssize_t len, i, j;
11077
11078    if (PyUnicode_READY(self) == -1)
11079        return NULL;
11080
11081    kind = PyUnicode_KIND(self);
11082    data = PyUnicode_DATA(self);
11083    len = PyUnicode_GET_LENGTH(self);
11084
11085    i = 0;
11086    if (striptype != RIGHTSTRIP) {
11087        while (i < len && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, i))) {
11088            i++;
11089        }
11090    }
11091
11092    j = len;
11093    if (striptype != LEFTSTRIP) {
11094        do {
11095            j--;
11096        } while (j >= i && Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, j)));
11097        j++;
11098    }
11099
11100    return PyUnicode_Substring((PyObject*)self, i, j);
11101}
11102
11103
11104static PyObject *
11105do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
11106{
11107    PyObject *sep = NULL;
11108
11109    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11110        return NULL;
11111
11112    if (sep != NULL && sep != Py_None) {
11113        if (PyUnicode_Check(sep))
11114            return _PyUnicode_XStrip(self, striptype, sep);
11115        else {
11116            PyErr_Format(PyExc_TypeError,
11117                         "%s arg must be None or str",
11118                         STRIPNAME(striptype));
11119            return NULL;
11120        }
11121    }
11122
11123    return do_strip(self, striptype);
11124}
11125
11126
11127PyDoc_STRVAR(strip__doc__,
11128             "S.strip([chars]) -> str\n\
11129\n\
11130Return a copy of the string S with leading and trailing\n\
11131whitespace removed.\n\
11132If chars is given and not None, remove characters in chars instead.");
11133
11134static PyObject *
11135unicode_strip(PyUnicodeObject *self, PyObject *args)
11136{
11137    if (PyTuple_GET_SIZE(args) == 0)
11138        return do_strip(self, BOTHSTRIP); /* Common case */
11139    else
11140        return do_argstrip(self, BOTHSTRIP, args);
11141}
11142
11143
11144PyDoc_STRVAR(lstrip__doc__,
11145             "S.lstrip([chars]) -> str\n\
11146\n\
11147Return a copy of the string S with leading whitespace removed.\n\
11148If chars is given and not None, remove characters in chars instead.");
11149
11150static PyObject *
11151unicode_lstrip(PyUnicodeObject *self, PyObject *args)
11152{
11153    if (PyTuple_GET_SIZE(args) == 0)
11154        return do_strip(self, LEFTSTRIP); /* Common case */
11155    else
11156        return do_argstrip(self, LEFTSTRIP, args);
11157}
11158
11159
11160PyDoc_STRVAR(rstrip__doc__,
11161             "S.rstrip([chars]) -> str\n\
11162\n\
11163Return a copy of the string S with trailing whitespace removed.\n\
11164If chars is given and not None, remove characters in chars instead.");
11165
11166static PyObject *
11167unicode_rstrip(PyUnicodeObject *self, PyObject *args)
11168{
11169    if (PyTuple_GET_SIZE(args) == 0)
11170        return do_strip(self, RIGHTSTRIP); /* Common case */
11171    else
11172        return do_argstrip(self, RIGHTSTRIP, args);
11173}
11174
11175
11176static PyObject*
11177unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
11178{
11179    PyUnicodeObject *u;
11180    Py_ssize_t nchars, n;
11181
11182    if (len < 1) {
11183        Py_INCREF(unicode_empty);
11184        return unicode_empty;
11185    }
11186
11187    if (len == 1 && PyUnicode_CheckExact(str)) {
11188        /* no repeat, return original string */
11189        Py_INCREF(str);
11190        return (PyObject*) str;
11191    }
11192
11193    if (PyUnicode_READY(str) == -1)
11194        return NULL;
11195
11196    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11197        PyErr_SetString(PyExc_OverflowError,
11198                        "repeated string is too long");
11199        return NULL;
11200    }
11201    nchars = len * PyUnicode_GET_LENGTH(str);
11202
11203    u = (PyUnicodeObject *)PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11204    if (!u)
11205        return NULL;
11206    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11207
11208    if (PyUnicode_GET_LENGTH(str) == 1) {
11209        const int kind = PyUnicode_KIND(str);
11210        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11211        void *to = PyUnicode_DATA(u);
11212        if (kind == PyUnicode_1BYTE_KIND)
11213            memset(to, (unsigned char)fill_char, len);
11214        else {
11215            for (n = 0; n < len; ++n)
11216                PyUnicode_WRITE(kind, to, n, fill_char);
11217        }
11218    }
11219    else {
11220        /* number of characters copied this far */
11221        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11222        const Py_ssize_t char_size = PyUnicode_CHARACTER_SIZE(str);
11223        char *to = (char *) PyUnicode_DATA(u);
11224        Py_MEMCPY(to, PyUnicode_DATA(str),
11225                  PyUnicode_GET_LENGTH(str) * char_size);
11226        while (done < nchars) {
11227            n = (done <= nchars-done) ? done : nchars-done;
11228            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11229            done += n;
11230        }
11231    }
11232
11233    return (PyObject*) u;
11234}
11235
11236PyObject *
11237PyUnicode_Replace(PyObject *obj,
11238                  PyObject *subobj,
11239                  PyObject *replobj,
11240                  Py_ssize_t maxcount)
11241{
11242    PyObject *self;
11243    PyObject *str1;
11244    PyObject *str2;
11245    PyObject *result;
11246
11247    self = PyUnicode_FromObject(obj);
11248    if (self == NULL || PyUnicode_READY(self) == -1)
11249        return NULL;
11250    str1 = PyUnicode_FromObject(subobj);
11251    if (str1 == NULL || PyUnicode_READY(str1) == -1) {
11252        Py_DECREF(self);
11253        return NULL;
11254    }
11255    str2 = PyUnicode_FromObject(replobj);
11256    if (str2 == NULL || PyUnicode_READY(str2)) {
11257        Py_DECREF(self);
11258        Py_DECREF(str1);
11259        return NULL;
11260    }
11261    result = replace(self, str1, str2, maxcount);
11262    Py_DECREF(self);
11263    Py_DECREF(str1);
11264    Py_DECREF(str2);
11265    return result;
11266}
11267
11268PyDoc_STRVAR(replace__doc__,
11269             "S.replace(old, new[, count]) -> str\n\
11270\n\
11271Return a copy of S with all occurrences of substring\n\
11272old replaced by new.  If the optional argument count is\n\
11273given, only the first count occurrences are replaced.");
11274
11275static PyObject*
11276unicode_replace(PyObject *self, PyObject *args)
11277{
11278    PyObject *str1;
11279    PyObject *str2;
11280    Py_ssize_t maxcount = -1;
11281    PyObject *result;
11282
11283    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
11284        return NULL;
11285    if (!PyUnicode_READY(self) == -1)
11286        return NULL;
11287    str1 = PyUnicode_FromObject(str1);
11288    if (str1 == NULL || PyUnicode_READY(str1) == -1)
11289        return NULL;
11290    str2 = PyUnicode_FromObject(str2);
11291    if (str2 == NULL || PyUnicode_READY(str2) == -1) {
11292        Py_DECREF(str1);
11293        return NULL;
11294    }
11295
11296    result = replace(self, str1, str2, maxcount);
11297
11298    Py_DECREF(str1);
11299    Py_DECREF(str2);
11300    return result;
11301}
11302
/* Build the repr() of a string: the text wrapped in quote characters, with
   quotes, backslashes and non-printable characters written as escape
   sequences.

   Two passes are made over the input.  The first computes the exact output
   length (osize), counts single and double quotes, and tracks the largest
   non-ASCII character that will be copied verbatim (max).  The second
   writes into a string allocated with PyUnicode_New(osize, max).  The two
   passes must classify every character identically, otherwise the write
   pass would not fill exactly osize characters.

   Returns the new string, or NULL if PyUnicode_READY() or PyUnicode_New()
   fails. */
static PyObject *
unicode_repr(PyObject *unicode)
{
    PyObject *repr;
    Py_ssize_t isize;
    Py_ssize_t osize, squote, dquote, i, o;
    Py_UCS4 max, quote;
    int ikind, okind;
    void *idata, *odata;

    if (PyUnicode_READY(unicode) == -1)
        return NULL;

    isize = PyUnicode_GET_LENGTH(unicode);
    idata = PyUnicode_DATA(unicode);

    /* Compute length of output, quote characters, and
       maximum character */
    osize = 2; /* quotes */
    max = 127;
    squote = dquote = 0;
    ikind = PyUnicode_KIND(unicode);
    for (i = 0; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
        switch (ch) {
        /* Quotes are counted as one character here; the extra backslash for
           escaped single quotes is added after the loop, once the
           delimiter is known. */
        case '\'': squote++; osize++; break;
        case '"':  dquote++; osize++; break;
        case '\\': case '\t': case '\r': case '\n':
            osize += 2; break;
        default:
            /* Fast-path ASCII */
            if (ch < ' ' || ch == 0x7f)
                osize += 4; /* \xHH */
            else if (ch < 0x7f)
                osize++;
            else if (Py_UNICODE_ISPRINTABLE(ch)) {
                osize++;
                max = ch > max ? ch : max;
            }
            else if (ch < 0x100)
                osize += 4; /* \xHH */
            else if (ch < 0x10000)
                osize += 6; /* \uHHHH */
            else
                osize += 10; /* \UHHHHHHHH */
        }
    }

    /* Choose the delimiter: single quotes by default; double quotes when
       the text contains single quotes but no double quotes, so nothing
       needs escaping. */
    quote = '\'';
    if (squote) {
        if (dquote)
            /* Both squote and dquote present. Use squote,
               and escape them */
            osize += squote;
        else
            quote = '"';
    }

    repr = PyUnicode_New(osize, max);
    if (repr == NULL)
        return NULL;
    okind = PyUnicode_KIND(repr);
    odata = PyUnicode_DATA(repr);

    /* Write both delimiters up front; the loop below fills positions
       1 .. osize-2. */
    PyUnicode_WRITE(okind, odata, 0, quote);
    PyUnicode_WRITE(okind, odata, osize-1, quote);

    for (i = 0, o = 1; i < isize; i++) {
        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);

        /* Escape quotes and backslashes */
        if ((ch == quote) || (ch == '\\')) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, ch);
            continue;
        }

        /* Map special whitespace to '\t', '\n', '\r' */
        if (ch == '\t') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 't');
        }
        else if (ch == '\n') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'n');
        }
        else if (ch == '\r') {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'r');
        }

        /* Map non-printable US ASCII to '\xhh' */
        else if (ch < ' ' || ch == 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, '\\');
            PyUnicode_WRITE(okind, odata, o++, 'x');
            PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
            PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
        }

        /* Copy ASCII characters as-is */
        else if (ch < 0x7F) {
            PyUnicode_WRITE(okind, odata, o++, ch);
        }

        /* Non-ASCII characters */
        else {
            /* Map Unicode whitespace and control characters
               (categories Z* and C* except ASCII space)
            */
            if (!Py_UNICODE_ISPRINTABLE(ch)) {
                /* Map 8-bit characters to '\xhh' */
                if (ch <= 0xff) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'x');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0x000F]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0x000F]);
                }
                /* Map 21-bit characters to '\U00xxxxxx' */
                else if (ch >= 0x10000) {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'U');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 28) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 24) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 20) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 16) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
                /* Map 16-bit characters to '\uxxxx' */
                else {
                    PyUnicode_WRITE(okind, odata, o++, '\\');
                    PyUnicode_WRITE(okind, odata, o++, 'u');
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 12) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 8) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[(ch >> 4) & 0xF]);
                    PyUnicode_WRITE(okind, odata, o++, hexdigits[ch & 0xF]);
                }
            }
            /* Copy characters as-is */
            else {
                PyUnicode_WRITE(okind, odata, o++, ch);
            }
        }
    }
    /* Closing quote already added at the beginning */
    return repr;
}
11452
11453PyDoc_STRVAR(rfind__doc__,
11454             "S.rfind(sub[, start[, end]]) -> int\n\
11455\n\
11456Return the highest index in S where substring sub is found,\n\
11457such that sub is contained within S[start:end].  Optional\n\
11458arguments start and end are interpreted as in slice notation.\n\
11459\n\
11460Return -1 on failure.");
11461
11462static PyObject *
11463unicode_rfind(PyObject *self, PyObject *args)
11464{
11465    PyUnicodeObject *substring;
11466    Py_ssize_t start;
11467    Py_ssize_t end;
11468    Py_ssize_t result;
11469
11470    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
11471                                            &start, &end))
11472        return NULL;
11473
11474    if (PyUnicode_READY(self) == -1)
11475        return NULL;
11476    if (PyUnicode_READY(substring) == -1)
11477        return NULL;
11478
11479    result = any_find_slice(
11480        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11481        self, (PyObject*)substring, start, end
11482        );
11483
11484    Py_DECREF(substring);
11485
11486    if (result == -2)
11487        return NULL;
11488
11489    return PyLong_FromSsize_t(result);
11490}
11491
11492PyDoc_STRVAR(rindex__doc__,
11493             "S.rindex(sub[, start[, end]]) -> int\n\
11494\n\
11495Like S.rfind() but raise ValueError when the substring is not found.");
11496
11497static PyObject *
11498unicode_rindex(PyObject *self, PyObject *args)
11499{
11500    PyUnicodeObject *substring;
11501    Py_ssize_t start;
11502    Py_ssize_t end;
11503    Py_ssize_t result;
11504
11505    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
11506                                            &start, &end))
11507        return NULL;
11508
11509    if (PyUnicode_READY(self) == -1)
11510        return NULL;
11511    if (PyUnicode_READY(substring) == -1)
11512        return NULL;
11513
11514    result = any_find_slice(
11515        ucs1lib_rfind_slice, ucs2lib_rfind_slice, ucs4lib_rfind_slice,
11516        self, (PyObject*)substring, start, end
11517        );
11518
11519    Py_DECREF(substring);
11520
11521    if (result == -2)
11522        return NULL;
11523
11524    if (result < 0) {
11525        PyErr_SetString(PyExc_ValueError, "substring not found");
11526        return NULL;
11527    }
11528
11529    return PyLong_FromSsize_t(result);
11530}
11531
11532PyDoc_STRVAR(rjust__doc__,
11533             "S.rjust(width[, fillchar]) -> str\n\
11534\n\
11535Return S right-justified in a string of length width. Padding is\n\
11536done using the specified fill character (default is a space).");
11537
11538static PyObject *
11539unicode_rjust(PyUnicodeObject *self, PyObject *args)
11540{
11541    Py_ssize_t width;
11542    Py_UCS4 fillchar = ' ';
11543
11544    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
11545        return NULL;
11546
11547    if (PyUnicode_READY(self) == -1)
11548        return NULL;
11549
11550    if (_PyUnicode_LENGTH(self) >= width && PyUnicode_CheckExact(self)) {
11551        Py_INCREF(self);
11552        return (PyObject*) self;
11553    }
11554
11555    return (PyObject*) pad(self, width - _PyUnicode_LENGTH(self), 0, fillchar);
11556}
11557
11558PyObject *
11559PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11560{
11561    PyObject *result;
11562
11563    s = PyUnicode_FromObject(s);
11564    if (s == NULL)
11565        return NULL;
11566    if (sep != NULL) {
11567        sep = PyUnicode_FromObject(sep);
11568        if (sep == NULL) {
11569            Py_DECREF(s);
11570            return NULL;
11571        }
11572    }
11573
11574    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11575
11576    Py_DECREF(s);
11577    Py_XDECREF(sep);
11578    return result;
11579}
11580
11581PyDoc_STRVAR(split__doc__,
11582             "S.split([sep[, maxsplit]]) -> list of strings\n\
11583\n\
11584Return a list of the words in S, using sep as the\n\
11585delimiter string.  If maxsplit is given, at most maxsplit\n\
11586splits are done. If sep is not specified or is None, any\n\
11587whitespace string is a separator and empty strings are\n\
11588removed from the result.");
11589
11590static PyObject*
11591unicode_split(PyUnicodeObject *self, PyObject *args)
11592{
11593    PyObject *substring = Py_None;
11594    Py_ssize_t maxcount = -1;
11595
11596    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
11597        return NULL;
11598
11599    if (substring == Py_None)
11600        return split(self, NULL, maxcount);
11601    else if (PyUnicode_Check(substring))
11602        return split(self, (PyUnicodeObject *)substring, maxcount);
11603    else
11604        return PyUnicode_Split((PyObject *)self, substring, maxcount);
11605}
11606
11607PyObject *
11608PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
11609{
11610    PyObject* str_obj;
11611    PyObject* sep_obj;
11612    PyObject* out;
11613    int kind1, kind2, kind;
11614    void *buf1 = NULL, *buf2 = NULL;
11615    Py_ssize_t len1, len2;
11616
11617    str_obj = PyUnicode_FromObject(str_in);
11618    if (!str_obj || PyUnicode_READY(str_obj) == -1)
11619        return NULL;
11620    sep_obj = PyUnicode_FromObject(sep_in);
11621    if (!sep_obj || PyUnicode_READY(sep_obj) == -1) {
11622        Py_DECREF(str_obj);
11623        return NULL;
11624    }
11625
11626    kind1 = PyUnicode_KIND(str_in);
11627    kind2 = PyUnicode_KIND(sep_obj);
11628    kind = kind1 > kind2 ? kind1 : kind2;
11629    buf1 = PyUnicode_DATA(str_in);
11630    if (kind1 != kind)
11631        buf1 = _PyUnicode_AsKind(str_in, kind);
11632    if (!buf1)
11633        goto onError;
11634    buf2 = PyUnicode_DATA(sep_obj);
11635    if (kind2 != kind)
11636        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11637    if (!buf2)
11638        goto onError;
11639    len1 = PyUnicode_GET_LENGTH(str_obj);
11640    len2 = PyUnicode_GET_LENGTH(sep_obj);
11641
11642    switch(PyUnicode_KIND(str_in)) {
11643    case PyUnicode_1BYTE_KIND:
11644        out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11645        break;
11646    case PyUnicode_2BYTE_KIND:
11647        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11648        break;
11649    case PyUnicode_4BYTE_KIND:
11650        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
11651        break;
11652    default:
11653        assert(0);
11654        out = 0;
11655    }
11656
11657    Py_DECREF(sep_obj);
11658    Py_DECREF(str_obj);
11659    if (kind1 != kind)
11660        PyMem_Free(buf1);
11661    if (kind2 != kind)
11662        PyMem_Free(buf2);
11663
11664    return out;
11665  onError:
11666    Py_DECREF(sep_obj);
11667    Py_DECREF(str_obj);
11668    if (kind1 != kind && buf1)
11669        PyMem_Free(buf1);
11670    if (kind2 != kind && buf2)
11671        PyMem_Free(buf2);
11672    return NULL;
11673}
11674
11675
11676PyObject *
11677PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
11678{
11679    PyObject* str_obj;
11680    PyObject* sep_obj;
11681    PyObject* out;
11682    int kind1, kind2, kind;
11683    void *buf1 = NULL, *buf2 = NULL;
11684    Py_ssize_t len1, len2;
11685
11686    str_obj = PyUnicode_FromObject(str_in);
11687    if (!str_obj)
11688        return NULL;
11689    sep_obj = PyUnicode_FromObject(sep_in);
11690    if (!sep_obj) {
11691        Py_DECREF(str_obj);
11692        return NULL;
11693    }
11694
11695    kind1 = PyUnicode_KIND(str_in);
11696    kind2 = PyUnicode_KIND(sep_obj);
11697    kind = Py_MAX(kind1, kind2);
11698    buf1 = PyUnicode_DATA(str_in);
11699    if (kind1 != kind)
11700        buf1 = _PyUnicode_AsKind(str_in, kind);
11701    if (!buf1)
11702        goto onError;
11703    buf2 = PyUnicode_DATA(sep_obj);
11704    if (kind2 != kind)
11705        buf2 = _PyUnicode_AsKind(sep_obj, kind);
11706    if (!buf2)
11707        goto onError;
11708    len1 = PyUnicode_GET_LENGTH(str_obj);
11709    len2 = PyUnicode_GET_LENGTH(sep_obj);
11710
11711    switch(PyUnicode_KIND(str_in)) {
11712    case PyUnicode_1BYTE_KIND:
11713        out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11714        break;
11715    case PyUnicode_2BYTE_KIND:
11716        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11717        break;
11718    case PyUnicode_4BYTE_KIND:
11719        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
11720        break;
11721    default:
11722        assert(0);
11723        out = 0;
11724    }
11725
11726    Py_DECREF(sep_obj);
11727    Py_DECREF(str_obj);
11728    if (kind1 != kind)
11729        PyMem_Free(buf1);
11730    if (kind2 != kind)
11731        PyMem_Free(buf2);
11732
11733    return out;
11734  onError:
11735    Py_DECREF(sep_obj);
11736    Py_DECREF(str_obj);
11737    if (kind1 != kind && buf1)
11738        PyMem_Free(buf1);
11739    if (kind2 != kind && buf2)
11740        PyMem_Free(buf2);
11741    return NULL;
11742}
11743
11744PyDoc_STRVAR(partition__doc__,
11745             "S.partition(sep) -> (head, sep, tail)\n\
11746\n\
11747Search for the separator sep in S, and return the part before it,\n\
11748the separator itself, and the part after it.  If the separator is not\n\
11749found, return S and two empty strings.");
11750
11751static PyObject*
11752unicode_partition(PyUnicodeObject *self, PyObject *separator)
11753{
11754    return PyUnicode_Partition((PyObject *)self, separator);
11755}
11756
11757PyDoc_STRVAR(rpartition__doc__,
11758             "S.rpartition(sep) -> (head, sep, tail)\n\
11759\n\
11760Search for the separator sep in S, starting at the end of S, and return\n\
11761the part before it, the separator itself, and the part after it.  If the\n\
11762separator is not found, return two empty strings and S.");
11763
11764static PyObject*
11765unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
11766{
11767    return PyUnicode_RPartition((PyObject *)self, separator);
11768}
11769
11770PyObject *
11771PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
11772{
11773    PyObject *result;
11774
11775    s = PyUnicode_FromObject(s);
11776    if (s == NULL)
11777        return NULL;
11778    if (sep != NULL) {
11779        sep = PyUnicode_FromObject(sep);
11780        if (sep == NULL) {
11781            Py_DECREF(s);
11782            return NULL;
11783        }
11784    }
11785
11786    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
11787
11788    Py_DECREF(s);
11789    Py_XDECREF(sep);
11790    return result;
11791}
11792
11793PyDoc_STRVAR(rsplit__doc__,
11794             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
11795\n\
11796Return a list of the words in S, using sep as the\n\
11797delimiter string, starting at the end of the string and\n\
11798working to the front.  If maxsplit is given, at most maxsplit\n\
11799splits are done. If sep is not specified, any whitespace string\n\
11800is a separator.");
11801
11802static PyObject*
11803unicode_rsplit(PyUnicodeObject *self, PyObject *args)
11804{
11805    PyObject *substring = Py_None;
11806    Py_ssize_t maxcount = -1;
11807
11808    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
11809        return NULL;
11810
11811    if (substring == Py_None)
11812        return rsplit(self, NULL, maxcount);
11813    else if (PyUnicode_Check(substring))
11814        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
11815    else
11816        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
11817}
11818
11819PyDoc_STRVAR(splitlines__doc__,
11820             "S.splitlines([keepends]) -> list of strings\n\
11821\n\
11822Return a list of the lines in S, breaking at line boundaries.\n\
11823Line breaks are not included in the resulting list unless keepends\n\
11824is given and true.");
11825
11826static PyObject*
11827unicode_splitlines(PyUnicodeObject *self, PyObject *args, PyObject *kwds)
11828{
11829    static char *kwlist[] = {"keepends", 0};
11830    int keepends = 0;
11831
11832    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
11833                                     kwlist, &keepends))
11834        return NULL;
11835
11836    return PyUnicode_Splitlines((PyObject *)self, keepends);
11837}
11838
11839static
11840PyObject *unicode_str(PyObject *self)
11841{
11842    if (PyUnicode_CheckExact(self)) {
11843        Py_INCREF(self);
11844        return self;
11845    } else
11846        /* Subtype -- return genuine unicode string with the same value. */
11847        return PyUnicode_Copy(self);
11848}
11849
11850PyDoc_STRVAR(swapcase__doc__,
11851             "S.swapcase() -> str\n\
11852\n\
11853Return a copy of S with uppercase characters converted to lowercase\n\
11854and vice versa.");
11855
11856static PyObject*
11857unicode_swapcase(PyUnicodeObject *self)
11858{
11859    return fixup(self, fixswapcase);
11860}
11861
11862PyDoc_STRVAR(maketrans__doc__,
11863             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
11864\n\
11865Return a translation table usable for str.translate().\n\
11866If there is only one argument, it must be a dictionary mapping Unicode\n\
11867ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
11868Character keys will be then converted to ordinals.\n\
11869If there are two arguments, they must be strings of equal length, and\n\
11870in the resulting dictionary, each character in x will be mapped to the\n\
11871character at the same position in y. If there is a third argument, it\n\
11872must be a string, whose characters will be mapped to None in the result.");
11873
11874static PyObject*
11875unicode_maketrans(PyUnicodeObject *null, PyObject *args)
11876{
11877    PyObject *x, *y = NULL, *z = NULL;
11878    PyObject *new = NULL, *key, *value;
11879    Py_ssize_t i = 0;
11880    int res;
11881
11882    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
11883        return NULL;
11884    new = PyDict_New();
11885    if (!new)
11886        return NULL;
11887    if (y != NULL) {
11888        int x_kind, y_kind, z_kind;
11889        void *x_data, *y_data, *z_data;
11890
11891        /* x must be a string too, of equal length */
11892        if (!PyUnicode_Check(x)) {
11893            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
11894                            "be a string if there is a second argument");
11895            goto err;
11896        }
11897        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
11898            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
11899                            "arguments must have equal length");
11900            goto err;
11901        }
11902        /* create entries for translating chars in x to those in y */
11903        x_kind = PyUnicode_KIND(x);
11904        y_kind = PyUnicode_KIND(y);
11905        x_data = PyUnicode_DATA(x);
11906        y_data = PyUnicode_DATA(y);
11907        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
11908            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
11909            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
11910            if (!key || !value)
11911                goto err;
11912            res = PyDict_SetItem(new, key, value);
11913            Py_DECREF(key);
11914            Py_DECREF(value);
11915            if (res < 0)
11916                goto err;
11917        }
11918        /* create entries for deleting chars in z */
11919        if (z != NULL) {
11920            z_kind = PyUnicode_KIND(z);
11921            z_data = PyUnicode_DATA(z);
11922            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
11923                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
11924                if (!key)
11925                    goto err;
11926                res = PyDict_SetItem(new, key, Py_None);
11927                Py_DECREF(key);
11928                if (res < 0)
11929                    goto err;
11930            }
11931        }
11932    } else {
11933        int kind;
11934        void *data;
11935
11936        /* x must be a dict */
11937        if (!PyDict_CheckExact(x)) {
11938            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
11939                            "to maketrans it must be a dict");
11940            goto err;
11941        }
11942        /* copy entries into the new dict, converting string keys to int keys */
11943        while (PyDict_Next(x, &i, &key, &value)) {
11944            if (PyUnicode_Check(key)) {
11945                /* convert string keys to integer keys */
11946                PyObject *newkey;
11947                if (PyUnicode_GET_SIZE(key) != 1) {
11948                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
11949                                    "table must be of length 1");
11950                    goto err;
11951                }
11952                kind = PyUnicode_KIND(key);
11953                data = PyUnicode_DATA(key);
11954                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
11955                if (!newkey)
11956                    goto err;
11957                res = PyDict_SetItem(new, newkey, value);
11958                Py_DECREF(newkey);
11959                if (res < 0)
11960                    goto err;
11961            } else if (PyLong_Check(key)) {
11962                /* just keep integer keys */
11963                if (PyDict_SetItem(new, key, value) < 0)
11964                    goto err;
11965            } else {
11966                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
11967                                "be strings or integers");
11968                goto err;
11969            }
11970        }
11971    }
11972    return new;
11973  err:
11974    Py_DECREF(new);
11975    return NULL;
11976}
11977
11978PyDoc_STRVAR(translate__doc__,
11979             "S.translate(table) -> str\n\
11980\n\
11981Return a copy of the string S, where all characters have been mapped\n\
11982through the given translation table, which must be a mapping of\n\
11983Unicode ordinals to Unicode ordinals, strings, or None.\n\
11984Unmapped characters are left untouched. Characters mapped to None\n\
11985are deleted.");
11986
11987static PyObject*
11988unicode_translate(PyObject *self, PyObject *table)
11989{
11990    return _PyUnicode_TranslateCharmap(self, table, "ignore");
11991}
11992
11993PyDoc_STRVAR(upper__doc__,
11994             "S.upper() -> str\n\
11995\n\
11996Return a copy of S converted to uppercase.");
11997
11998static PyObject*
11999unicode_upper(PyUnicodeObject *self)
12000{
12001    return fixup(self, fixupper);
12002}
12003
12004PyDoc_STRVAR(zfill__doc__,
12005             "S.zfill(width) -> str\n\
12006\n\
12007Pad a numeric string S with zeros on the left, to fill a field\n\
12008of the specified width. The string S is never truncated.");
12009
12010static PyObject *
12011unicode_zfill(PyUnicodeObject *self, PyObject *args)
12012{
12013    Py_ssize_t fill;
12014    PyUnicodeObject *u;
12015    Py_ssize_t width;
12016    int kind;
12017    void *data;
12018    Py_UCS4 chr;
12019
12020    if (PyUnicode_READY(self) == -1)
12021        return NULL;
12022
12023    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12024        return NULL;
12025
12026    if (PyUnicode_GET_LENGTH(self) >= width) {
12027        if (PyUnicode_CheckExact(self)) {
12028            Py_INCREF(self);
12029            return (PyObject*) self;
12030        }
12031        else
12032            return PyUnicode_Copy((PyObject*)self);
12033    }
12034
12035    fill = width - _PyUnicode_LENGTH(self);
12036
12037    u = pad(self, fill, 0, '0');
12038
12039    if (u == NULL)
12040        return NULL;
12041
12042    kind = PyUnicode_KIND(u);
12043    data = PyUnicode_DATA(u);
12044    chr = PyUnicode_READ(kind, data, fill);
12045
12046    if (chr == '+' || chr == '-') {
12047        /* move sign to beginning of string */
12048        PyUnicode_WRITE(kind, data, 0, chr);
12049        PyUnicode_WRITE(kind, data, fill, '0');
12050    }
12051
12052    return (PyObject*) u;
12053}
12054
#if 0
/* Compiled out.  Wrapper around
   PyUnicode_TransformDecimalAndSpaceToASCII(); its method table entry
   is likewise disabled and labelled as debugging-only. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *transformed;

    transformed = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return transformed;
}
#endif
12062
12063PyDoc_STRVAR(startswith__doc__,
12064             "S.startswith(prefix[, start[, end]]) -> bool\n\
12065\n\
12066Return True if S starts with the specified prefix, False otherwise.\n\
12067With optional start, test S beginning at that position.\n\
12068With optional end, stop comparing S at that position.\n\
12069prefix can also be a tuple of strings to try.");
12070
12071static PyObject *
12072unicode_startswith(PyUnicodeObject *self,
12073                   PyObject *args)
12074{
12075    PyObject *subobj;
12076    PyUnicodeObject *substring;
12077    Py_ssize_t start = 0;
12078    Py_ssize_t end = PY_SSIZE_T_MAX;
12079    int result;
12080
12081    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12082        return NULL;
12083    if (PyTuple_Check(subobj)) {
12084        Py_ssize_t i;
12085        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12086            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12087                PyTuple_GET_ITEM(subobj, i));
12088            if (substring == NULL)
12089                return NULL;
12090            result = tailmatch(self, substring, start, end, -1);
12091            Py_DECREF(substring);
12092            if (result) {
12093                Py_RETURN_TRUE;
12094            }
12095        }
12096        /* nothing matched */
12097        Py_RETURN_FALSE;
12098    }
12099    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12100    if (substring == NULL) {
12101        if (PyErr_ExceptionMatches(PyExc_TypeError))
12102            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12103                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12104        return NULL;
12105    }
12106    result = tailmatch(self, substring, start, end, -1);
12107    Py_DECREF(substring);
12108    return PyBool_FromLong(result);
12109}
12110
12111
12112PyDoc_STRVAR(endswith__doc__,
12113             "S.endswith(suffix[, start[, end]]) -> bool\n\
12114\n\
12115Return True if S ends with the specified suffix, False otherwise.\n\
12116With optional start, test S beginning at that position.\n\
12117With optional end, stop comparing S at that position.\n\
12118suffix can also be a tuple of strings to try.");
12119
12120static PyObject *
12121unicode_endswith(PyUnicodeObject *self,
12122                 PyObject *args)
12123{
12124    PyObject *subobj;
12125    PyUnicodeObject *substring;
12126    Py_ssize_t start = 0;
12127    Py_ssize_t end = PY_SSIZE_T_MAX;
12128    int result;
12129
12130    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12131        return NULL;
12132    if (PyTuple_Check(subobj)) {
12133        Py_ssize_t i;
12134        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12135            substring = (PyUnicodeObject *)PyUnicode_FromObject(
12136                PyTuple_GET_ITEM(subobj, i));
12137            if (substring == NULL)
12138                return NULL;
12139            result = tailmatch(self, substring, start, end, +1);
12140            Py_DECREF(substring);
12141            if (result) {
12142                Py_RETURN_TRUE;
12143            }
12144        }
12145        Py_RETURN_FALSE;
12146    }
12147    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
12148    if (substring == NULL) {
12149        if (PyErr_ExceptionMatches(PyExc_TypeError))
12150            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12151                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12152        return NULL;
12153    }
12154    result = tailmatch(self, substring, start, end, +1);
12155    Py_DECREF(substring);
12156    return PyBool_FromLong(result);
12157}
12158
12159#include "stringlib/unicode_format.h"
12160
12161PyDoc_STRVAR(format__doc__,
12162             "S.format(*args, **kwargs) -> str\n\
12163\n\
12164Return a formatted version of S, using substitutions from args and kwargs.\n\
12165The substitutions are identified by braces ('{' and '}').");
12166
12167PyDoc_STRVAR(format_map__doc__,
12168             "S.format_map(mapping) -> str\n\
12169\n\
12170Return a formatted version of S, using substitutions from mapping.\n\
12171The substitutions are identified by braces ('{' and '}').");
12172
12173static PyObject *
12174unicode__format__(PyObject* self, PyObject* args)
12175{
12176    PyObject *format_spec;
12177
12178    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
12179        return NULL;
12180
12181    return _PyUnicode_FormatAdvanced(self, format_spec, 0,
12182                                     PyUnicode_GET_LENGTH(format_spec));
12183}
12184
12185PyDoc_STRVAR(p_format__doc__,
12186             "S.__format__(format_spec) -> str\n\
12187\n\
12188Return a formatted version of S as described by format_spec.");
12189
12190static PyObject *
12191unicode__sizeof__(PyUnicodeObject *v)
12192{
12193    Py_ssize_t size;
12194
12195    /* If it's a compact object, account for base structure +
12196       character data. */
12197    if (PyUnicode_IS_COMPACT_ASCII(v))
12198        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
12199    else if (PyUnicode_IS_COMPACT(v))
12200        size = sizeof(PyCompactUnicodeObject) +
12201            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_CHARACTER_SIZE(v);
12202    else {
12203        /* If it is a two-block object, account for base object, and
12204           for character block if present. */
12205        size = sizeof(PyUnicodeObject);
12206        if (_PyUnicode_DATA_ANY(v))
12207            size += (PyUnicode_GET_LENGTH(v) + 1) *
12208                PyUnicode_CHARACTER_SIZE(v);
12209    }
12210    /* If the wstr pointer is present, account for it unless it is shared
12211       with the data pointer. Check if the data is not shared. */
12212    if (_PyUnicode_HAS_WSTR_MEMORY(v))
12213        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
12214    if (_PyUnicode_HAS_UTF8_MEMORY(v))
12215        size += PyUnicode_UTF8_LENGTH(v) + 1;
12216
12217    return PyLong_FromSsize_t(size);
12218}
12219
12220PyDoc_STRVAR(sizeof__doc__,
12221             "S.__sizeof__() -> size of S in memory, in bytes");
12222
12223static PyObject *
12224unicode_getnewargs(PyObject *v)
12225{
12226    PyObject *copy = PyUnicode_Copy(v);
12227    if (!copy)
12228        return NULL;
12229    return Py_BuildValue("(N)", copy);
12230}
12231
12232static PyMethodDef unicode_methods[] = {
12233
12234    /* Order is according to common usage: often used methods should
12235       appear first, since lookup is done sequentially. */
12236
12237    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
12238    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
12239    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
12240    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
12241    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
12242    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
12243    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
12244    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
12245    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
12246    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
12247    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
12248    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
12249    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
12250    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
12251    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
12252    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
12253    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
12254    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
12255    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
12256    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
12257    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
12258    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
12259    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
12260    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
12261    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
12262    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
12263    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
12264    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
12265    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
12266    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
12267    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
12268    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
12269    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
12270    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
12271    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
12272    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
12273    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
12274    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
12275    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
12276    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
12277    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
12278    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
12279    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
12280    {"maketrans", (PyCFunction) unicode_maketrans,
12281     METH_VARARGS | METH_STATIC, maketrans__doc__},
12282    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
12283#if 0
12284    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
12285#endif
12286
12287#if 0
12288    /* These methods are just used for debugging the implementation. */
12289    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
12290#endif
12291
12292    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
12293    {NULL, NULL}
12294};
12295
12296static PyObject *
12297unicode_mod(PyObject *v, PyObject *w)
12298{
12299    if (!PyUnicode_Check(v))
12300        Py_RETURN_NOTIMPLEMENTED;
12301    return PyUnicode_Format(v, w);
12302}
12303
12304static PyNumberMethods unicode_as_number = {
12305    0,              /*nb_add*/
12306    0,              /*nb_subtract*/
12307    0,              /*nb_multiply*/
12308    unicode_mod,            /*nb_remainder*/
12309};
12310
12311static PySequenceMethods unicode_as_sequence = {
12312    (lenfunc) unicode_length,       /* sq_length */
12313    PyUnicode_Concat,           /* sq_concat */
12314    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
12315    (ssizeargfunc) unicode_getitem,     /* sq_item */
12316    0,                  /* sq_slice */
12317    0,                  /* sq_ass_item */
12318    0,                  /* sq_ass_slice */
12319    PyUnicode_Contains,         /* sq_contains */
12320};
12321
12322static PyObject*
12323unicode_subscript(PyUnicodeObject* self, PyObject* item)
12324{
12325    if (PyUnicode_READY(self) == -1)
12326        return NULL;
12327
12328    if (PyIndex_Check(item)) {
12329        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
12330        if (i == -1 && PyErr_Occurred())
12331            return NULL;
12332        if (i < 0)
12333            i += PyUnicode_GET_LENGTH(self);
12334        return unicode_getitem((PyObject*)self, i);
12335    } else if (PySlice_Check(item)) {
12336        Py_ssize_t start, stop, step, slicelength, cur, i;
12337        PyObject *result;
12338        void *src_data, *dest_data;
12339        int src_kind, dest_kind;
12340        Py_UCS4 ch, max_char;
12341
12342        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
12343                                 &start, &stop, &step, &slicelength) < 0) {
12344            return NULL;
12345        }
12346
12347        if (slicelength <= 0) {
12348            return PyUnicode_New(0, 0);
12349        } else if (start == 0 && step == 1 &&
12350                   slicelength == PyUnicode_GET_LENGTH(self) &&
12351                   PyUnicode_CheckExact(self)) {
12352            Py_INCREF(self);
12353            return (PyObject *)self;
12354        } else if (step == 1) {
12355            return PyUnicode_Substring((PyObject*)self,
12356                                       start, start + slicelength);
12357        }
12358        /* General case */
12359        max_char = 127;
12360        src_kind = PyUnicode_KIND(self);
12361        src_data = PyUnicode_DATA(self);
12362        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12363            ch = PyUnicode_READ(src_kind, src_data, cur);
12364            if (ch > max_char)
12365                max_char = ch;
12366        }
12367        result = PyUnicode_New(slicelength, max_char);
12368        if (result == NULL)
12369            return NULL;
12370        dest_kind = PyUnicode_KIND(result);
12371        dest_data = PyUnicode_DATA(result);
12372
12373        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
12374            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
12375            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
12376        }
12377        return result;
12378    } else {
12379        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
12380        return NULL;
12381    }
12382}
12383
12384static PyMappingMethods unicode_as_mapping = {
12385    (lenfunc)unicode_length,        /* mp_length */
12386    (binaryfunc)unicode_subscript,  /* mp_subscript */
12387    (objobjargproc)0,           /* mp_ass_subscript */
12388};
12389
12390
12391/* Helpers for PyUnicode_Format() */
12392
12393static PyObject *
12394getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
12395{
12396    Py_ssize_t argidx = *p_argidx;
12397    if (argidx < arglen) {
12398        (*p_argidx)++;
12399        if (arglen < 0)
12400            return args;
12401        else
12402            return PyTuple_GetItem(args, argidx);
12403    }
12404    PyErr_SetString(PyExc_TypeError,
12405                    "not enough arguments for format string");
12406    return NULL;
12407}
12408
12409/* Returns a new reference to a PyUnicode object, or NULL on failure. */
12410
12411static PyObject *
12412formatfloat(PyObject *v, int flags, int prec, int type)
12413{
12414    char *p;
12415    PyObject *result;
12416    double x;
12417
12418    x = PyFloat_AsDouble(v);
12419    if (x == -1.0 && PyErr_Occurred())
12420        return NULL;
12421
12422    if (prec < 0)
12423        prec = 6;
12424
12425    p = PyOS_double_to_string(x, type, prec,
12426                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
12427    if (p == NULL)
12428        return NULL;
12429    result = PyUnicode_DecodeASCII(p, strlen(p), NULL);
12430    PyMem_Free(p);
12431    return result;
12432}
12433
12434static PyObject*
12435formatlong(PyObject *val, int flags, int prec, int type)
12436{
12437    char *buf;
12438    int len;
12439    PyObject *str; /* temporary string object. */
12440    PyObject *result;
12441
12442    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
12443    if (!str)
12444        return NULL;
12445    result = PyUnicode_DecodeASCII(buf, len, NULL);
12446    Py_DECREF(str);
12447    return result;
12448}
12449
12450static int
12451formatchar(Py_UCS4 *buf,
12452           size_t buflen,
12453           PyObject *v)
12454{
12455    /* presume that the buffer is at least 3 characters long */
12456    if (PyUnicode_Check(v)) {
12457        if (PyUnicode_GET_LENGTH(v) == 1) {
12458            buf[0] = PyUnicode_READ_CHAR(v, 0);
12459            buf[1] = '\0';
12460            return 1;
12461        }
12462        goto onError;
12463    }
12464    else {
12465        /* Integer input truncated to a character */
12466        long x;
12467        x = PyLong_AsLong(v);
12468        if (x == -1 && PyErr_Occurred())
12469            goto onError;
12470
12471        if (x < 0 || x > 0x10ffff) {
12472            PyErr_SetString(PyExc_OverflowError,
12473                            "%c arg not in range(0x110000)");
12474            return -1;
12475        }
12476
12477        buf[0] = (Py_UCS4) x;
12478        buf[1] = '\0';
12479        return 1;
12480    }
12481
12482  onError:
12483    PyErr_SetString(PyExc_TypeError,
12484                    "%c requires int or char");
12485    return -1;
12486}
12487
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...).
   FORMATBUFLEN is the number of elements in the scratch buffer that
   PyUnicode_Format() uses for single formatted characters.
*/
#define FORMATBUFLEN ((size_t)10)
12492
12493PyObject *
12494PyUnicode_Format(PyObject *format, PyObject *args)
12495{
12496    void *fmt;
12497    int fmtkind;
12498    PyObject *result;
12499    Py_UCS4 *res, *res0;
12500    Py_UCS4 max;
12501    int kind;
12502    Py_ssize_t fmtcnt, fmtpos, rescnt, reslen, arglen, argidx;
12503    int args_owned = 0;
12504    PyObject *dict = NULL;
12505    PyUnicodeObject *uformat;
12506
12507    if (format == NULL || args == NULL) {
12508        PyErr_BadInternalCall();
12509        return NULL;
12510    }
12511    uformat = (PyUnicodeObject*)PyUnicode_FromObject(format);
12512    if (uformat == NULL || PyUnicode_READY(uformat) == -1)
12513        return NULL;
12514    fmt = PyUnicode_DATA(uformat);
12515    fmtkind = PyUnicode_KIND(uformat);
12516    fmtcnt = PyUnicode_GET_LENGTH(uformat);
12517    fmtpos = 0;
12518
12519    reslen = rescnt = fmtcnt + 100;
12520    res = res0 = PyMem_Malloc(reslen * sizeof(Py_UCS4));
12521    if (res0 == NULL) {
12522        PyErr_NoMemory();
12523        goto onError;
12524    }
12525
12526    if (PyTuple_Check(args)) {
12527        arglen = PyTuple_Size(args);
12528        argidx = 0;
12529    }
12530    else {
12531        arglen = -1;
12532        argidx = -2;
12533    }
12534    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
12535        !PyUnicode_Check(args))
12536        dict = args;
12537
12538    while (--fmtcnt >= 0) {
12539        if (PyUnicode_READ(fmtkind, fmt, fmtpos) != '%') {
12540            if (--rescnt < 0) {
12541                rescnt = fmtcnt + 100;
12542                reslen += rescnt;
12543                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12544                if (res0 == NULL){
12545                    PyErr_NoMemory();
12546                    goto onError;
12547                }
12548                res = res0 + reslen - rescnt;
12549                --rescnt;
12550            }
12551            *res++ = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12552        }
12553        else {
12554            /* Got a format specifier */
12555            int flags = 0;
12556            Py_ssize_t width = -1;
12557            int prec = -1;
12558            Py_UCS4 c = '\0';
12559            Py_UCS4 fill;
12560            int isnumok;
12561            PyObject *v = NULL;
12562            PyObject *temp = NULL;
12563            void *pbuf;
12564            Py_ssize_t pindex;
12565            Py_UNICODE sign;
12566            Py_ssize_t len, len1;
12567            Py_UCS4 formatbuf[FORMATBUFLEN]; /* For formatchar() */
12568
12569            fmtpos++;
12570            if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(') {
12571                Py_ssize_t keystart;
12572                Py_ssize_t keylen;
12573                PyObject *key;
12574                int pcount = 1;
12575
12576                if (dict == NULL) {
12577                    PyErr_SetString(PyExc_TypeError,
12578                                    "format requires a mapping");
12579                    goto onError;
12580                }
12581                ++fmtpos;
12582                --fmtcnt;
12583                keystart = fmtpos;
12584                /* Skip over balanced parentheses */
12585                while (pcount > 0 && --fmtcnt >= 0) {
12586                    if (PyUnicode_READ(fmtkind, fmt, fmtpos) == ')')
12587                        --pcount;
12588                    else if (PyUnicode_READ(fmtkind, fmt, fmtpos) == '(')
12589                        ++pcount;
12590                    fmtpos++;
12591                }
12592                keylen = fmtpos - keystart - 1;
12593                if (fmtcnt < 0 || pcount > 0) {
12594                    PyErr_SetString(PyExc_ValueError,
12595                                    "incomplete format key");
12596                    goto onError;
12597                }
12598                key = PyUnicode_Substring((PyObject*)uformat,
12599                                          keystart, keystart + keylen);
12600                if (key == NULL)
12601                    goto onError;
12602                if (args_owned) {
12603                    Py_DECREF(args);
12604                    args_owned = 0;
12605                }
12606                args = PyObject_GetItem(dict, key);
12607                Py_DECREF(key);
12608                if (args == NULL) {
12609                    goto onError;
12610                }
12611                args_owned = 1;
12612                arglen = -1;
12613                argidx = -2;
12614            }
12615            while (--fmtcnt >= 0) {
12616                switch (c = PyUnicode_READ(fmtkind, fmt, fmtpos++)) {
12617                case '-': flags |= F_LJUST; continue;
12618                case '+': flags |= F_SIGN; continue;
12619                case ' ': flags |= F_BLANK; continue;
12620                case '#': flags |= F_ALT; continue;
12621                case '0': flags |= F_ZERO; continue;
12622                }
12623                break;
12624            }
12625            if (c == '*') {
12626                v = getnextarg(args, arglen, &argidx);
12627                if (v == NULL)
12628                    goto onError;
12629                if (!PyLong_Check(v)) {
12630                    PyErr_SetString(PyExc_TypeError,
12631                                    "* wants int");
12632                    goto onError;
12633                }
12634                width = PyLong_AsLong(v);
12635                if (width == -1 && PyErr_Occurred())
12636                    goto onError;
12637                if (width < 0) {
12638                    flags |= F_LJUST;
12639                    width = -width;
12640                }
12641                if (--fmtcnt >= 0)
12642                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12643            }
12644            else if (c >= '0' && c <= '9') {
12645                width = c - '0';
12646                while (--fmtcnt >= 0) {
12647                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12648                    if (c < '0' || c > '9')
12649                        break;
12650                    if ((width*10) / 10 != width) {
12651                        PyErr_SetString(PyExc_ValueError,
12652                                        "width too big");
12653                        goto onError;
12654                    }
12655                    width = width*10 + (c - '0');
12656                }
12657            }
12658            if (c == '.') {
12659                prec = 0;
12660                if (--fmtcnt >= 0)
12661                    c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12662                if (c == '*') {
12663                    v = getnextarg(args, arglen, &argidx);
12664                    if (v == NULL)
12665                        goto onError;
12666                    if (!PyLong_Check(v)) {
12667                        PyErr_SetString(PyExc_TypeError,
12668                                        "* wants int");
12669                        goto onError;
12670                    }
12671                    prec = PyLong_AsLong(v);
12672                    if (prec == -1 && PyErr_Occurred())
12673                        goto onError;
12674                    if (prec < 0)
12675                        prec = 0;
12676                    if (--fmtcnt >= 0)
12677                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12678                }
12679                else if (c >= '0' && c <= '9') {
12680                    prec = c - '0';
12681                    while (--fmtcnt >= 0) {
12682                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12683                        if (c < '0' || c > '9')
12684                            break;
12685                        if ((prec*10) / 10 != prec) {
12686                            PyErr_SetString(PyExc_ValueError,
12687                                            "prec too big");
12688                            goto onError;
12689                        }
12690                        prec = prec*10 + (c - '0');
12691                    }
12692                }
12693            } /* prec */
12694            if (fmtcnt >= 0) {
12695                if (c == 'h' || c == 'l' || c == 'L') {
12696                    if (--fmtcnt >= 0)
12697                        c = PyUnicode_READ(fmtkind, fmt, fmtpos++);
12698                }
12699            }
12700            if (fmtcnt < 0) {
12701                PyErr_SetString(PyExc_ValueError,
12702                                "incomplete format");
12703                goto onError;
12704            }
12705            if (c != '%') {
12706                v = getnextarg(args, arglen, &argidx);
12707                if (v == NULL)
12708                    goto onError;
12709            }
12710            sign = 0;
12711            fill = ' ';
12712            switch (c) {
12713
12714            case '%':
12715                pbuf = formatbuf;
12716                kind = PyUnicode_4BYTE_KIND;
12717                /* presume that buffer length is at least 1 */
12718                PyUnicode_WRITE(kind, pbuf, 0, '%');
12719                len = 1;
12720                break;
12721
12722            case 's':
12723            case 'r':
12724            case 'a':
12725                if (PyUnicode_CheckExact(v) && c == 's') {
12726                    temp = v;
12727                    Py_INCREF(temp);
12728                }
12729                else {
12730                    if (c == 's')
12731                        temp = PyObject_Str(v);
12732                    else if (c == 'r')
12733                        temp = PyObject_Repr(v);
12734                    else
12735                        temp = PyObject_ASCII(v);
12736                    if (temp == NULL)
12737                        goto onError;
12738                    if (PyUnicode_Check(temp))
12739                        /* nothing to do */;
12740                    else {
12741                        Py_DECREF(temp);
12742                        PyErr_SetString(PyExc_TypeError,
12743                                        "%s argument has non-string str()");
12744                        goto onError;
12745                    }
12746                }
12747                if (PyUnicode_READY(temp) == -1) {
12748                    Py_CLEAR(temp);
12749                    goto onError;
12750                }
12751                pbuf = PyUnicode_DATA(temp);
12752                kind = PyUnicode_KIND(temp);
12753                len = PyUnicode_GET_LENGTH(temp);
12754                if (prec >= 0 && len > prec)
12755                    len = prec;
12756                break;
12757
12758            case 'i':
12759            case 'd':
12760            case 'u':
12761            case 'o':
12762            case 'x':
12763            case 'X':
12764                isnumok = 0;
12765                if (PyNumber_Check(v)) {
12766                    PyObject *iobj=NULL;
12767
12768                    if (PyLong_Check(v)) {
12769                        iobj = v;
12770                        Py_INCREF(iobj);
12771                    }
12772                    else {
12773                        iobj = PyNumber_Long(v);
12774                    }
12775                    if (iobj!=NULL) {
12776                        if (PyLong_Check(iobj)) {
12777                            isnumok = 1;
12778                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
12779                            Py_DECREF(iobj);
12780                            if (!temp)
12781                                goto onError;
12782                            if (PyUnicode_READY(temp) == -1) {
12783                                Py_CLEAR(temp);
12784                                goto onError;
12785                            }
12786                            pbuf = PyUnicode_DATA(temp);
12787                            kind = PyUnicode_KIND(temp);
12788                            len = PyUnicode_GET_LENGTH(temp);
12789                            sign = 1;
12790                        }
12791                        else {
12792                            Py_DECREF(iobj);
12793                        }
12794                    }
12795                }
12796                if (!isnumok) {
12797                    PyErr_Format(PyExc_TypeError,
12798                                 "%%%c format: a number is required, "
12799                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
12800                    goto onError;
12801                }
12802                if (flags & F_ZERO)
12803                    fill = '0';
12804                break;
12805
12806            case 'e':
12807            case 'E':
12808            case 'f':
12809            case 'F':
12810            case 'g':
12811            case 'G':
12812                temp = formatfloat(v, flags, prec, c);
12813                if (!temp)
12814                    goto onError;
12815                if (PyUnicode_READY(temp) == -1) {
12816                    Py_CLEAR(temp);
12817                    goto onError;
12818                }
12819                pbuf = PyUnicode_DATA(temp);
12820                kind = PyUnicode_KIND(temp);
12821                len = PyUnicode_GET_LENGTH(temp);
12822                sign = 1;
12823                if (flags & F_ZERO)
12824                    fill = '0';
12825                break;
12826
12827            case 'c':
12828                pbuf = formatbuf;
12829                kind = PyUnicode_4BYTE_KIND;
12830                len = formatchar(pbuf, Py_ARRAY_LENGTH(formatbuf), v);
12831                if (len < 0)
12832                    goto onError;
12833                break;
12834
12835            default:
12836                PyErr_Format(PyExc_ValueError,
12837                             "unsupported format character '%c' (0x%x) "
12838                             "at index %zd",
12839                             (31<=c && c<=126) ? (char)c : '?',
12840                             (int)c,
12841                             fmtpos - 1);
12842                goto onError;
12843            }
12844            /* pbuf is initialized here. */
12845            pindex = 0;
12846            if (sign) {
12847                if (PyUnicode_READ(kind, pbuf, pindex) == '-' ||
12848                    PyUnicode_READ(kind, pbuf, pindex) == '+') {
12849                    sign = PyUnicode_READ(kind, pbuf, pindex++);
12850                    len--;
12851                }
12852                else if (flags & F_SIGN)
12853                    sign = '+';
12854                else if (flags & F_BLANK)
12855                    sign = ' ';
12856                else
12857                    sign = 0;
12858            }
12859            if (width < len)
12860                width = len;
12861            if (rescnt - (sign != 0) < width) {
12862                reslen -= rescnt;
12863                rescnt = width + fmtcnt + 100;
12864                reslen += rescnt;
12865                if (reslen < 0) {
12866                    Py_XDECREF(temp);
12867                    PyErr_NoMemory();
12868                    goto onError;
12869                }
12870                res0 = PyMem_Realloc(res0, reslen*sizeof(Py_UCS4));
12871                if (res0 == 0) {
12872                    PyErr_NoMemory();
12873                    Py_XDECREF(temp);
12874                    goto onError;
12875                }
12876                res = res0 + reslen - rescnt;
12877            }
12878            if (sign) {
12879                if (fill != ' ')
12880                    *res++ = sign;
12881                rescnt--;
12882                if (width > len)
12883                    width--;
12884            }
12885            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12886                assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12887                assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12888                if (fill != ' ') {
12889                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12890                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12891                }
12892                rescnt -= 2;
12893                width -= 2;
12894                if (width < 0)
12895                    width = 0;
12896                len -= 2;
12897            }
12898            if (width > len && !(flags & F_LJUST)) {
12899                do {
12900                    --rescnt;
12901                    *res++ = fill;
12902                } while (--width > len);
12903            }
12904            if (fill == ' ') {
12905                if (sign)
12906                    *res++ = sign;
12907                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
12908                    assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
12909                    assert(PyUnicode_READ(kind, pbuf, pindex+1) == c);
12910                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12911                    *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12912                }
12913            }
12914            /* Copy all characters, preserving len */
12915            len1 = len;
12916            while (len1--) {
12917                *res++ = PyUnicode_READ(kind, pbuf, pindex++);
12918                rescnt--;
12919            }
12920            while (--width >= len) {
12921                --rescnt;
12922                *res++ = ' ';
12923            }
12924            if (dict && (argidx < arglen) && c != '%') {
12925                PyErr_SetString(PyExc_TypeError,
12926                                "not all arguments converted during string formatting");
12927                Py_XDECREF(temp);
12928                goto onError;
12929            }
12930            Py_XDECREF(temp);
12931        } /* '%' */
12932    } /* until end */
12933    if (argidx < arglen && !dict) {
12934        PyErr_SetString(PyExc_TypeError,
12935                        "not all arguments converted during string formatting");
12936        goto onError;
12937    }
12938
12939
12940    for (max=0, res = res0; res < res0+reslen-rescnt; res++)
12941        if (*res > max)
12942            max = *res;
12943    result = PyUnicode_New(reslen - rescnt, max);
12944    if (!result)
12945        goto onError;
12946    kind = PyUnicode_KIND(result);
12947    for (res = res0; res < res0+reslen-rescnt; res++)
12948        PyUnicode_WRITE(kind, PyUnicode_DATA(result), res-res0, *res);
12949    PyMem_Free(res0);
12950    if (args_owned) {
12951        Py_DECREF(args);
12952    }
12953    Py_DECREF(uformat);
12954    return (PyObject *)result;
12955
12956  onError:
12957    PyMem_Free(res0);
12958    Py_DECREF(uformat);
12959    if (args_owned) {
12960        Py_DECREF(args);
12961    }
12962    return NULL;
12963}
12964
12965static PyObject *
12966unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
12967
12968static PyObject *
12969unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12970{
12971    PyObject *x = NULL;
12972    static char *kwlist[] = {"object", "encoding", "errors", 0};
12973    char *encoding = NULL;
12974    char *errors = NULL;
12975
12976    if (type != &PyUnicode_Type)
12977        return unicode_subtype_new(type, args, kwds);
12978    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
12979                                     kwlist, &x, &encoding, &errors))
12980        return NULL;
12981    if (x == NULL)
12982        return (PyObject *)PyUnicode_New(0, 0);
12983    if (encoding == NULL && errors == NULL)
12984        return PyObject_Str(x);
12985    else
12986        return PyUnicode_FromEncodedObject(x, encoding, errors);
12987}
12988
12989static PyObject *
12990unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
12991{
12992    PyUnicodeObject *unicode, *self;
12993    Py_ssize_t length, char_size;
12994    int share_wstr, share_utf8;
12995    unsigned int kind;
12996    void *data;
12997
12998    assert(PyType_IsSubtype(type, &PyUnicode_Type));
12999
13000    unicode = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
13001    if (unicode == NULL)
13002        return NULL;
13003    assert(_PyUnicode_CHECK(unicode));
13004    if (PyUnicode_READY(unicode))
13005        return NULL;
13006
13007    self = (PyUnicodeObject *) type->tp_alloc(type, 0);
13008    if (self == NULL) {
13009        Py_DECREF(unicode);
13010        return NULL;
13011    }
13012    kind = PyUnicode_KIND(unicode);
13013    length = PyUnicode_GET_LENGTH(unicode);
13014
13015    _PyUnicode_LENGTH(self) = length;
13016    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
13017    _PyUnicode_STATE(self).interned = 0;
13018    _PyUnicode_STATE(self).kind = kind;
13019    _PyUnicode_STATE(self).compact = 0;
13020    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
13021    _PyUnicode_STATE(self).ready = 1;
13022    _PyUnicode_WSTR(self) = NULL;
13023    _PyUnicode_UTF8_LENGTH(self) = 0;
13024    _PyUnicode_UTF8(self) = NULL;
13025    _PyUnicode_WSTR_LENGTH(self) = 0;
13026    _PyUnicode_DATA_ANY(self) = NULL;
13027
13028    share_utf8 = 0;
13029    share_wstr = 0;
13030    if (kind == PyUnicode_1BYTE_KIND) {
13031        char_size = 1;
13032        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
13033            share_utf8 = 1;
13034    }
13035    else if (kind == PyUnicode_2BYTE_KIND) {
13036        char_size = 2;
13037        if (sizeof(wchar_t) == 2)
13038            share_wstr = 1;
13039    }
13040    else {
13041        assert(kind == PyUnicode_4BYTE_KIND);
13042        char_size = 4;
13043        if (sizeof(wchar_t) == 4)
13044            share_wstr = 1;
13045    }
13046
13047    /* Ensure we won't overflow the length. */
13048    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
13049        PyErr_NoMemory();
13050        goto onError;
13051    }
13052    data = PyObject_MALLOC((length + 1) * char_size);
13053    if (data == NULL) {
13054        PyErr_NoMemory();
13055        goto onError;
13056    }
13057
13058    _PyUnicode_DATA_ANY(self) = data;
13059    if (share_utf8) {
13060        _PyUnicode_UTF8_LENGTH(self) = length;
13061        _PyUnicode_UTF8(self) = data;
13062    }
13063    if (share_wstr) {
13064        _PyUnicode_WSTR_LENGTH(self) = length;
13065        _PyUnicode_WSTR(self) = (wchar_t *)data;
13066    }
13067
13068    Py_MEMCPY(data, PyUnicode_DATA(unicode),
13069              PyUnicode_KIND_SIZE(kind, length + 1));
13070    Py_DECREF(unicode);
13071    return (PyObject *)self;
13072
13073onError:
13074    Py_DECREF(unicode);
13075    Py_DECREF(self);
13076    return NULL;
13077}
13078
/* Docstring installed in the tp_doc slot of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
13085
13086static PyObject *unicode_iter(PyObject *seq);
13087
/* Type object for str.  Slots are listed positionally; each line is labelled
   with the PyTypeObject field it fills. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
13131
13132/* Initialize the Unicode implementation */
13133
13134void _PyUnicode_Init(void)
13135{
13136    int i;
13137
13138    /* XXX - move this array to unicodectype.c ? */
13139    Py_UCS2 linebreak[] = {
13140        0x000A, /* LINE FEED */
13141        0x000D, /* CARRIAGE RETURN */
13142        0x001C, /* FILE SEPARATOR */
13143        0x001D, /* GROUP SEPARATOR */
13144        0x001E, /* RECORD SEPARATOR */
13145        0x0085, /* NEXT LINE */
13146        0x2028, /* LINE SEPARATOR */
13147        0x2029, /* PARAGRAPH SEPARATOR */
13148    };
13149
13150    /* Init the implementation */
13151    unicode_empty = PyUnicode_New(0, 0);
13152    if (!unicode_empty)
13153        Py_FatalError("Can't create empty string");
13154
13155    for (i = 0; i < 256; i++)
13156        unicode_latin1[i] = NULL;
13157    if (PyType_Ready(&PyUnicode_Type) < 0)
13158        Py_FatalError("Can't initialize 'unicode'");
13159
13160    /* initialize the linebreak bloom filter */
13161    bloom_linebreak = make_bloom_mask(
13162        PyUnicode_2BYTE_KIND, linebreak,
13163        Py_ARRAY_LENGTH(linebreak));
13164
13165    PyType_Ready(&EncodingMapType);
13166}
13167
13168/* Finalize the Unicode implementation */
13169
/* Free-list clearing entry point.  This implementation performs no work and
   always reports that zero entries were released. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
13175
13176void
13177_PyUnicode_Fini(void)
13178{
13179    int i;
13180
13181    Py_XDECREF(unicode_empty);
13182    unicode_empty = NULL;
13183
13184    for (i = 0; i < 256; i++) {
13185        if (unicode_latin1[i]) {
13186            Py_DECREF(unicode_latin1[i]);
13187            unicode_latin1[i] = NULL;
13188        }
13189    }
13190    (void)PyUnicode_ClearFreeList();
13191}
13192
13193void
13194PyUnicode_InternInPlace(PyObject **p)
13195{
13196    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
13197    PyObject *t;
13198#ifdef Py_DEBUG
13199    assert(s != NULL);
13200    assert(_PyUnicode_CHECK(s));
13201#else
13202    if (s == NULL || !PyUnicode_Check(s))
13203        return;
13204#endif
13205    /* If it's a subclass, we don't really know what putting
13206       it in the interned dict might do. */
13207    if (!PyUnicode_CheckExact(s))
13208        return;
13209    if (PyUnicode_CHECK_INTERNED(s))
13210        return;
13211    if (_PyUnicode_READY_REPLACE(p)) {
13212        assert(0 && "_PyUnicode_READY_REPLACE fail in PyUnicode_InternInPlace");
13213        return;
13214    }
13215    s = (PyUnicodeObject *)(*p);
13216    if (interned == NULL) {
13217        interned = PyDict_New();
13218        if (interned == NULL) {
13219            PyErr_Clear(); /* Don't leave an exception */
13220            return;
13221        }
13222    }
13223    /* It might be that the GetItem call fails even
13224       though the key is present in the dictionary,
13225       namely when this happens during a stack overflow. */
13226    Py_ALLOW_RECURSION
13227        t = PyDict_GetItem(interned, (PyObject *)s);
13228    Py_END_ALLOW_RECURSION
13229
13230        if (t) {
13231            Py_INCREF(t);
13232            Py_DECREF(*p);
13233            *p = t;
13234            return;
13235        }
13236
13237    PyThreadState_GET()->recursion_critical = 1;
13238    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
13239        PyErr_Clear();
13240        PyThreadState_GET()->recursion_critical = 0;
13241        return;
13242    }
13243    PyThreadState_GET()->recursion_critical = 0;
13244    /* The two references in interned are not counted by refcnt.
13245       The deallocator will take care of this */
13246    Py_REFCNT(s) -= 2;
13247    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
13248}
13249
13250void
13251PyUnicode_InternImmortal(PyObject **p)
13252{
13253    PyUnicodeObject *u = (PyUnicodeObject *)*p;
13254
13255    PyUnicode_InternInPlace(p);
13256    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
13257        _PyUnicode_STATE(u).interned = SSTATE_INTERNED_IMMORTAL;
13258        Py_INCREF(*p);
13259    }
13260}
13261
13262PyObject *
13263PyUnicode_InternFromString(const char *cp)
13264{
13265    PyObject *s = PyUnicode_FromString(cp);
13266    if (s == NULL)
13267        return NULL;
13268    PyUnicode_InternInPlace(&s);
13269    return s;
13270}
13271
13272void
13273_Py_ReleaseInternedUnicodeStrings(void)
13274{
13275    PyObject *keys;
13276    PyUnicodeObject *s;
13277    Py_ssize_t i, n;
13278    Py_ssize_t immortal_size = 0, mortal_size = 0;
13279
13280    if (interned == NULL || !PyDict_Check(interned))
13281        return;
13282    keys = PyDict_Keys(interned);
13283    if (keys == NULL || !PyList_Check(keys)) {
13284        PyErr_Clear();
13285        return;
13286    }
13287
13288    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
13289       detector, interned unicode strings are not forcibly deallocated;
13290       rather, we give them their stolen references back, and then clear
13291       and DECREF the interned dict. */
13292
13293    n = PyList_GET_SIZE(keys);
13294    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
13295            n);
13296    for (i = 0; i < n; i++) {
13297        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
13298        if (PyUnicode_READY(s) == -1) {
13299            assert(0 && "could not ready string");
13300            fprintf(stderr, "could not ready string\n");
13301        }
13302        switch (PyUnicode_CHECK_INTERNED(s)) {
13303        case SSTATE_NOT_INTERNED:
13304            /* XXX Shouldn't happen */
13305            break;
13306        case SSTATE_INTERNED_IMMORTAL:
13307            Py_REFCNT(s) += 1;
13308            immortal_size += PyUnicode_GET_LENGTH(s);
13309            break;
13310        case SSTATE_INTERNED_MORTAL:
13311            Py_REFCNT(s) += 2;
13312            mortal_size += PyUnicode_GET_LENGTH(s);
13313            break;
13314        default:
13315            Py_FatalError("Inconsistent interned string state.");
13316        }
13317        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
13318    }
13319    fprintf(stderr, "total size of all interned strings: "
13320            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
13321            "mortal/immortal\n", mortal_size, immortal_size);
13322    Py_DECREF(keys);
13323    PyDict_Clear(interned);
13324    Py_DECREF(interned);
13325    interned = NULL;
13326}
13327
13328
13329/********************* Unicode Iterator **************************/
13330
/* Instance layout of the str iterator. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* index of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
13336
/* Destructor for the str iterator: untrack it from the garbage collector,
   drop the reference to the iterated string (which may already be NULL) and
   free the iterator object. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
13344
/* GC traversal for the str iterator: the iterated string is the only object
   reference it holds.  Py_VISIT relies on the parameters being named `visit`
   and `arg`. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    Py_VISIT(it->it_seq);
    return 0;
}
13351
13352static PyObject *
13353unicodeiter_next(unicodeiterobject *it)
13354{
13355    PyUnicodeObject *seq;
13356    PyObject *item;
13357
13358    assert(it != NULL);
13359    seq = it->it_seq;
13360    if (seq == NULL)
13361        return NULL;
13362    assert(_PyUnicode_CHECK(seq));
13363
13364    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
13365        int kind = PyUnicode_KIND(seq);
13366        void *data = PyUnicode_DATA(seq);
13367        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
13368        item = PyUnicode_FromOrdinal(chr);
13369        if (item != NULL)
13370            ++it->it_index;
13371        return item;
13372    }
13373
13374    Py_DECREF(seq);
13375    it->it_seq = NULL;
13376    return NULL;
13377}
13378
13379static PyObject *
13380unicodeiter_len(unicodeiterobject *it)
13381{
13382    Py_ssize_t len = 0;
13383    if (it->it_seq)
13384        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
13385    return PyLong_FromSsize_t(len);
13386}
13387
/* Docstring attached to the str iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
13389
/* Method table of the str iterator; it only exposes __length_hint__. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
13395
/* Type object for the str iterator ("str_iterator").  It takes part in
   garbage collection (Py_TPFLAGS_HAVE_GC with a traverse function) and is
   its own iterator. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
13428
13429static PyObject *
13430unicode_iter(PyObject *seq)
13431{
13432    unicodeiterobject *it;
13433
13434    if (!PyUnicode_Check(seq)) {
13435        PyErr_BadInternalCall();
13436        return NULL;
13437    }
13438    if (PyUnicode_READY(seq) == -1)
13439        return NULL;
13440    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
13441    if (it == NULL)
13442        return NULL;
13443    it->it_index = 0;
13444    Py_INCREF(seq);
13445    it->it_seq = (PyUnicodeObject *)seq;
13446    _PyObject_GC_TRACK(it);
13447    return (PyObject *)it;
13448}
13449
/* Include uniops.h twice, with UNIOP()/UNIOP_t bound to a different
   character type each time: */
/* first with the Py_UNICODE_ name prefix and the Py_UNICODE type ... */
#define UNIOP(x) Py_UNICODE_##x
#define UNIOP_t Py_UNICODE
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
/* ... then with the Py_UCS4_ name prefix and the Py_UCS4 type. */
#define UNIOP(x) Py_UCS4_##x
#define UNIOP_t Py_UCS4
#include "uniops.h"
#undef UNIOP
#undef UNIOP_t
13460
13461Py_UNICODE*
13462PyUnicode_AsUnicodeCopy(PyObject *object)
13463{
13464    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
13465    Py_UNICODE *copy;
13466    Py_ssize_t size;
13467
13468    if (!PyUnicode_Check(unicode)) {
13469        PyErr_BadArgument();
13470        return NULL;
13471    }
13472    /* Ensure we won't overflow the size. */
13473    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
13474        PyErr_NoMemory();
13475        return NULL;
13476    }
13477    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
13478    size *= sizeof(Py_UNICODE);
13479    copy = PyMem_Malloc(size);
13480    if (copy == NULL) {
13481        PyErr_NoMemory();
13482        return NULL;
13483    }
13484    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
13485    return copy;
13486}
13487
13488/* A _string module, to export formatter_parser and formatter_field_name_split
13489   to the string.Formatter class implemented in Python. */
13490
/* Method table of the _string module.  Both entries are registered with
   METH_O. */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
13498
/* Module definition for _string; fields are given positionally. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,                               /* m_reload */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
13510
/* Initialisation function of the _string module: creates the module object
   from _string_module and returns the result of PyModule_Create(). */
PyMODINIT_FUNC
PyInit__string(void)
{
    return PyModule_Create(&_string_module);
}
13516
13517
13518#ifdef __cplusplus
13519}
13520#endif
13521