unicodeobject.c revision 0030cd52dacdd95d2017a0947d661feb737449af
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45#include "stringlib/eq.h"
46
47#ifdef MS_WINDOWS
48#include <windows.h>
49#endif
50
51/*[clinic input]
52class str "PyUnicodeObject *" "&PyUnicode_Type"
53[clinic start generated code]*/
54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
55
56/* --- Globals ------------------------------------------------------------
57
58NOTE: In the interpreter's initialization phase, some globals are currently
59      initialized dynamically as needed. In the process Unicode objects may
60      be created before the Unicode type is ready.
61
62*/
63
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Largest valid Unicode code point: U+10FFFF (1,114,111). */
#define MAX_UNICODE 0x10FFFF

/* Type check used by the private accessor macros in this file.  Debug
   builds run the (non-content) consistency check; other builds only run
   PyUnicode_Check(). */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
77
/* Private field accessors.
 *
 * The plain struct-member forms (_PyUnicode_UTF8, _PyUnicode_WSTR, ...)
 * perform no checking and expand to lvalues, so they can be assigned to.
 * The forms containing assert() are value-only. */

/* UTF-8 representation.  For a compact ASCII string the UTF-8 bytes are the
   bytes located immediately after the PyASCIIObject structure; otherwise
   the pointer stored in the utf8 member is used. */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((char*)((PyASCIIObject*)(op) + 1)) : \
         _PyUnicode_UTF8(op))

/* Length of the UTF-8 representation.  For a compact ASCII string this is
   the string's own length member. */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) ? \
         ((PyASCIIObject*)(op))->length : \
         _PyUnicode_UTF8_LENGTH(op))

/* wchar_t representation and its length. */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* Unchecked access to the length, state and hash members. */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked (assert-guarded) reads of the kind and the length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* Character data pointer stored in a PyUnicodeObject (data.any member). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
112
/* File-local replacement for PyUnicode_READY(): any earlier definition is
   discarded and the type check goes through _PyUnicode_CHECK().  Evaluates
   to 0 when the string is already ready, otherwise to the result of
   _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (PyUnicode_IS_READY(op) ? 0 : _PyUnicode_Ready(op)))
119
/* True if the utf8 pointer aliases the character data.  Must not be used
   on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer aliases the character data.
   Both pointers are read from the macro argument `op`, so the macro works
   regardless of what the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* Non-zero when the object owns a UTF-8 buffer of its own: the string is
   not compact ASCII, its utf8 pointer is set, and that pointer does not
   alias the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    ((!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when the object owns a wstr buffer of its own: the wstr pointer
   is set and either the string is not ready yet or the pointer does not
   alias the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    ((_PyUnicode_WSTR(op) && \
      (!PyUnicode_IS_READY(op) || \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Generic helper macro to convert characters of different types.

   from_type and to_type have to be valid type names.  begin and end are
   pointers to the source characters (read as "const from_type *"; the
   source is never written through, so const-qualified inputs are accepted
   without casting the qualifier away).  to is a pointer of type
   "to_type *" and points to the buffer where the result characters are
   written to.

   The main loop is unrolled four characters at a time; the trailing loop
   converts the remaining 0-3 characters.  All temporaries use a leading
   underscore so they stay apart from the caller's own names. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                        \
        to_type *_to = (to_type *)(to);                         \
        const from_type *_iter = (const from_type *)(begin);    \
        const from_type *_end = (const from_type *)(end);       \
        Py_ssize_t _n = (_end) - (_iter);                       \
        const from_type *_unrolled_end =                        \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);                 \
        while (_iter < (_unrolled_end)) {                       \
            _to[0] = (to_type) _iter[0];                        \
            _to[1] = (to_type) _iter[1];                        \
            _to[2] = (to_type) _iter[2];                        \
            _to[3] = (to_type) _iter[3];                        \
            _iter += 4; _to += 4;                               \
        }                                                       \
        while (_iter < (_end))                                  \
            *_to++ = (to_type) *_iter++;                        \
    } while (0)
165
166/* This dictionary holds all interned unicode strings.  Note that references
167   to strings in this dictionary are *not* counted in the string's ob_refcnt.
168   When the interned string reaches a refcnt of 0 the string deallocation
169   function will delete the reference from this dictionary.
170
171   Another way to look at this is that to say that the actual reference
172   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
173*/
174static PyObject *interned = NULL;
175
176/* The empty Unicode object is shared to improve performance. */
177static PyObject *unicode_empty = NULL;
178
/* Take a new reference to the shared empty string.  If the singleton does
   not exist yet it is created with PyUnicode_New(0, 0) first; should that
   fail, unicode_empty stays NULL and no reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                                      \
    do {                                                                \
        if (unicode_empty == NULL) {                                    \
            unicode_empty = PyUnicode_New(0, 0);                        \
            if (unicode_empty != NULL) {                                \
                Py_INCREF(unicode_empty);                               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1));  \
            }                                                           \
        }                                                               \
        else {                                                          \
            Py_INCREF(unicode_empty);                                   \
        }                                                               \
    } while (0)

/* Return (from the enclosing function) a new reference to the shared empty
   string; the returned value is NULL if the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                                      \
    do {                                                                \
        _Py_INCREF_UNICODE_EMPTY();                                     \
        return unicode_empty;                                           \
    } while (0)
197
198/* Forward declaration */
199Py_LOCAL_INLINE(int)
200_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
201
202/* List of static strings. */
203static _Py_Identifier *static_strings = NULL;
204
205/* Single character Unicode strings in the Latin-1 range are being
206   shared as well. */
207static PyObject *unicode_latin1[256] = {NULL};
208
/* Fast detection of the most frequent whitespace characters: entry i is 1
   when the 7-bit code i is treated as whitespace and 0 otherwise
   (128 entries, 16 per row). */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
239
240/* forward */
241static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
242static PyObject* get_latin1_char(unsigned char ch);
243static int unicode_modifiable(PyObject *unicode);
244
245
246static PyObject *
247_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
248static PyObject *
249_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
250static PyObject *
251_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
252
253static PyObject *
254unicode_encode_call_errorhandler(const char *errors,
255       PyObject **errorHandler,const char *encoding, const char *reason,
256       PyObject *unicode, PyObject **exceptionObject,
257       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
258
259static void
260raise_encode_exception(PyObject **exceptionObject,
261                       const char *encoding,
262                       PyObject *unicode,
263                       Py_ssize_t startpos, Py_ssize_t endpos,
264                       const char *reason);
265
/* Fast detection of line break characters in the 7-bit range: entry i is 1
   when code i is a line break and 0 otherwise (128 entries, 16 per row).
   The table is only ever read, hence const. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
293
294#include "clinic/unicodeobject.c.h"
295
/* Identifiers for the error handler names recognized by
   get_error_handler(). */
typedef enum {
    _Py_ERROR_UNKNOWN = 0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Map an error handler name to its _Py_error_handler value.

   errors: handler name, or NULL.
   Returns _Py_ERROR_STRICT for NULL, the matching constant for one of the
   five recognized names, and _Py_ERROR_OTHER for any other string. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict",            _Py_ERROR_STRICT},
        {"surrogateescape",   _Py_ERROR_SURROGATEESCAPE},
        {"ignore",            _Py_ERROR_IGNORE},
        {"replace",           _Py_ERROR_REPLACE},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t idx;

    /* No name given means the default, strict handler. */
    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (idx = 0;
         idx < sizeof(known_handlers) / sizeof(known_handlers[0]);
         idx++)
    {
        if (strcmp(errors, known_handlers[idx].name) == 0)
            return known_handlers[idx].handler;
    }
    return _Py_ERROR_OTHER;
}
323
324/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
325   This function is kept for backward compatibility with the old API. */
326Py_UNICODE
327PyUnicode_GetMax(void)
328{
329#ifdef Py_UNICODE_WIDE
330    return 0x10FFFF;
331#else
332    /* This is actually an illegal character, so it should
333       not be passed to unichr. */
334    return 0xFFFF;
335#endif
336}
337
#ifdef Py_DEBUG
/* Debug-build check of the internal invariants of a unicode object.

   op:            the object to check; must pass PyUnicode_Check().
   check_content: when non-zero (and the string is not of the wchar_t kind),
                  also scan the characters to verify that the narrowest
                  suitable kind is used and that the data is
                  zero-terminated.

   Every violated invariant trips an assert() of its own; when all of them
   hold the function returns 1, so the call itself can be wrapped in
   assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *base;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    base = (PyASCIIObject *)op;
    kind = base->state.kind;

    if (!(base->state.ascii == 1 && base->state.compact == 1)) {
        PyCompactUnicodeObject *cobj = (PyCompactUnicodeObject *)op;
        void *chars;
        /* the kind whose character size equals sizeof(wchar_t) */
        unsigned int wchar_sized_kind;

#if SIZEOF_WCHAR_T == 2
        wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (base->state.compact != 1) {
            /* non-compact object: the data pointer lives in the struct */
            PyUnicodeObject *full = (PyUnicodeObject *)op;

            chars = full->data.any;
            if (kind != PyUnicode_WCHAR_KIND) {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(base->state.compact == 0);
                assert(base->state.ready == 1);
                assert(chars != NULL);
                if (base->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation */
                    assert (cobj->utf8 == chars);
                    assert (cobj->utf8_length == base->length);
                }
                else
                    assert (cobj->utf8 != chars);
            }
            else {
                /* not-ready string: only the wstr buffer exists */
                assert(base->length == 0);
                assert(base->hash == -1);
                assert(base->state.compact == 0);
                assert(base->state.ascii == 0);
                assert(base->state.ready == 0);
                assert(base->state.interned == SSTATE_NOT_INTERNED);
                assert(base->wstr != NULL);
                assert(chars == NULL);
                assert(cobj->utf8 == NULL);
            }
        }
        else {
            /* compact non-ASCII object: data follows the compact struct */
            chars = cobj + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(base->state.ascii == 0);
            assert(base->state.ready == 1);
            assert (cobj->utf8 != chars);
        }

        if (kind != PyUnicode_WCHAR_KIND) {
            /* wstr aliases the data exactly when the kind is wchar_t-sized */
            if (kind == wchar_sized_kind) {
                assert(base->wstr == chars);
                assert(cobj->wstr_length == base->length);
            }
            else
                assert(base->wstr != chars);
        }

        /* an absent cache must have a zero cached length */
        if (cobj->utf8 == NULL)
            assert(cobj->utf8_length == 0);
        if (base->wstr == NULL)
            assert(cobj->wstr_length == 0);
    }
    else {
        /* compact ASCII object */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(base->state.ready == 1);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND) {
        Py_ssize_t pos;
        Py_UCS4 highest = 0;
        void *chars = PyUnicode_DATA(base);

        for (pos = 0; pos < base->length; pos++) {
            Py_UCS4 ch = PyUnicode_READ(kind, chars, pos);
            if (highest < ch)
                highest = ch;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (base->state.ascii != 0)
                assert(highest < 128);
            else {
                assert(highest >= 128);
                assert(highest <= 255);
            }
            break;
        case PyUnicode_2BYTE_KIND:
            assert(highest >= 0x100);
            assert(highest <= 0xFFFF);
            break;
        default:
            assert(highest >= 0x10000);
            assert(highest <= MAX_UNICODE);
            break;
        }
        /* the character after the last one must be the terminator */
        assert(PyUnicode_READ(kind, chars, base->length) == 0);
    }
    return 1;
}
#endif
453
454static PyObject*
455unicode_result_wchar(PyObject *unicode)
456{
457#ifndef Py_DEBUG
458    Py_ssize_t len;
459
460    len = _PyUnicode_WSTR_LENGTH(unicode);
461    if (len == 0) {
462        Py_DECREF(unicode);
463        _Py_RETURN_UNICODE_EMPTY();
464    }
465
466    if (len == 1) {
467        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
468        if ((Py_UCS4)ch < 256) {
469            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
470            Py_DECREF(unicode);
471            return latin1_char;
472        }
473    }
474
475    if (_PyUnicode_Ready(unicode) < 0) {
476        Py_DECREF(unicode);
477        return NULL;
478    }
479#else
480    assert(Py_REFCNT(unicode) == 1);
481
482    /* don't make the result ready in debug mode to ensure that the caller
483       makes the string ready before using it */
484    assert(_PyUnicode_CheckConsistency(unicode, 1));
485#endif
486    return unicode;
487}
488
489static PyObject*
490unicode_result_ready(PyObject *unicode)
491{
492    Py_ssize_t length;
493
494    length = PyUnicode_GET_LENGTH(unicode);
495    if (length == 0) {
496        if (unicode != unicode_empty) {
497            Py_DECREF(unicode);
498            _Py_RETURN_UNICODE_EMPTY();
499        }
500        return unicode_empty;
501    }
502
503    if (length == 1) {
504        void *data = PyUnicode_DATA(unicode);
505        int kind = PyUnicode_KIND(unicode);
506        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
507        if (ch < 256) {
508            PyObject *latin1_char = unicode_latin1[ch];
509            if (latin1_char != NULL) {
510                if (unicode != latin1_char) {
511                    Py_INCREF(latin1_char);
512                    Py_DECREF(unicode);
513                }
514                return latin1_char;
515            }
516            else {
517                assert(_PyUnicode_CheckConsistency(unicode, 1));
518                Py_INCREF(unicode);
519                unicode_latin1[ch] = unicode;
520                return unicode;
521            }
522        }
523    }
524
525    assert(_PyUnicode_CheckConsistency(unicode, 1));
526    return unicode;
527}
528
529static PyObject*
530unicode_result(PyObject *unicode)
531{
532    assert(_PyUnicode_CHECK(unicode));
533    if (PyUnicode_IS_READY(unicode))
534        return unicode_result_ready(unicode);
535    else
536        return unicode_result_wchar(unicode);
537}
538
539static PyObject*
540unicode_result_unchanged(PyObject *unicode)
541{
542    if (PyUnicode_CheckExact(unicode)) {
543        if (PyUnicode_READY(unicode) == -1)
544            return NULL;
545        Py_INCREF(unicode);
546        return unicode;
547    }
548    else
549        /* Subtype -- return genuine unicode string with the same value. */
550        return _PyUnicode_Copy(unicode);
551}
552
553/* --- Bloom Filters ----------------------------------------------------- */
554
555/* stuff to implement simple "bloom filters" for Unicode characters.
556   to keep things simple, we use a single bitmask, using the least 5
557   bits from each unicode characters as the bit index. */
558
559/* the linebreak mask is set up by Unicode_Init below */
560
561#if LONG_BIT >= 128
562#define BLOOM_WIDTH 128
563#elif LONG_BIT >= 64
564#define BLOOM_WIDTH 64
565#elif LONG_BIT >= 32
566#define BLOOM_WIDTH 32
567#else
568#error "LONG_BIT is smaller than 32"
569#endif
570
571#define BLOOM_MASK unsigned long
572
573static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
574
575#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
576
577#define BLOOM_LINEBREAK(ch)                                             \
578    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
579     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
580
581Py_LOCAL_INLINE(BLOOM_MASK)
582make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
583{
584#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
585    do {                                               \
586        TYPE *data = (TYPE *)PTR;                      \
587        TYPE *end = data + LEN;                        \
588        Py_UCS4 ch;                                    \
589        for (; data != end; data++) {                  \
590            ch = *data;                                \
591            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
592        }                                              \
593        break;                                         \
594    } while (0)
595
596    /* calculate simple bloom-style bitmask for a given unicode string */
597
598    BLOOM_MASK mask;
599
600    mask = 0;
601    switch (kind) {
602    case PyUnicode_1BYTE_KIND:
603        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
604        break;
605    case PyUnicode_2BYTE_KIND:
606        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
607        break;
608    case PyUnicode_4BYTE_KIND:
609        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
610        break;
611    default:
612        assert(0);
613    }
614    return mask;
615
616#undef BLOOM_UPDATE
617}
618
619/* Compilation of templated routines */
620
621#include "stringlib/asciilib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
627#include "stringlib/find_max_char.h"
628#include "stringlib/localeutil.h"
629#include "stringlib/undef.h"
630
631#include "stringlib/ucs1lib.h"
632#include "stringlib/fastsearch.h"
633#include "stringlib/partition.h"
634#include "stringlib/split.h"
635#include "stringlib/count.h"
636#include "stringlib/find.h"
637#include "stringlib/replace.h"
638#include "stringlib/find_max_char.h"
639#include "stringlib/localeutil.h"
640#include "stringlib/undef.h"
641
642#include "stringlib/ucs2lib.h"
643#include "stringlib/fastsearch.h"
644#include "stringlib/partition.h"
645#include "stringlib/split.h"
646#include "stringlib/count.h"
647#include "stringlib/find.h"
648#include "stringlib/replace.h"
649#include "stringlib/find_max_char.h"
650#include "stringlib/localeutil.h"
651#include "stringlib/undef.h"
652
653#include "stringlib/ucs4lib.h"
654#include "stringlib/fastsearch.h"
655#include "stringlib/partition.h"
656#include "stringlib/split.h"
657#include "stringlib/count.h"
658#include "stringlib/find.h"
659#include "stringlib/replace.h"
660#include "stringlib/find_max_char.h"
661#include "stringlib/localeutil.h"
662#include "stringlib/undef.h"
663
664#include "stringlib/unicodedefs.h"
665#include "stringlib/fastsearch.h"
666#include "stringlib/count.h"
667#include "stringlib/find.h"
668#include "stringlib/undef.h"
669
670/* --- Unicode Object ----------------------------------------------------- */
671
672static PyObject *
673fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
674
675Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
676                                     Py_ssize_t size, Py_UCS4 ch,
677                                     int direction)
678{
679    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
680
681    switch (kind) {
682    case PyUnicode_1BYTE_KIND:
683        {
684            Py_UCS1 ch1 = (Py_UCS1) ch;
685            if (ch1 == ch)
686                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
687            else
688                return -1;
689        }
690    case PyUnicode_2BYTE_KIND:
691        {
692            Py_UCS2 ch2 = (Py_UCS2) ch;
693            if (ch2 == ch)
694                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
695            else
696                return -1;
697        }
698    case PyUnicode_4BYTE_KIND:
699        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
700    default:
701        assert(0);
702        return -1;
703    }
704}
705
#ifdef Py_DEBUG
/* Fill the newly added part of a unicode string (the characters from
   old_length up to the current length) with 0xFF bytes, so that bugs show
   up earlier.  Nothing is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects the resulting invalid
   characters, at least for ASCII and UCS-4 strings: U+00FF is invalid in
   ASCII and U+FFFFFFFF is an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
724
725static PyObject*
726resize_compact(PyObject *unicode, Py_ssize_t length)
727{
728    Py_ssize_t char_size;
729    Py_ssize_t struct_size;
730    Py_ssize_t new_size;
731    int share_wstr;
732    PyObject *new_unicode;
733#ifdef Py_DEBUG
734    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
735#endif
736
737    assert(unicode_modifiable(unicode));
738    assert(PyUnicode_IS_READY(unicode));
739    assert(PyUnicode_IS_COMPACT(unicode));
740
741    char_size = PyUnicode_KIND(unicode);
742    if (PyUnicode_IS_ASCII(unicode))
743        struct_size = sizeof(PyASCIIObject);
744    else
745        struct_size = sizeof(PyCompactUnicodeObject);
746    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
747
748    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
749        PyErr_NoMemory();
750        return NULL;
751    }
752    new_size = (struct_size + (length + 1) * char_size);
753
754    _Py_DEC_REFTOTAL;
755    _Py_ForgetReference(unicode);
756
757    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
758    if (new_unicode == NULL) {
759        _Py_NewReference(unicode);
760        PyErr_NoMemory();
761        return NULL;
762    }
763    unicode = new_unicode;
764    _Py_NewReference(unicode);
765
766    _PyUnicode_LENGTH(unicode) = length;
767    if (share_wstr) {
768        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
769        if (!PyUnicode_IS_ASCII(unicode))
770            _PyUnicode_WSTR_LENGTH(unicode) = length;
771    }
772    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
773        PyObject_DEL(_PyUnicode_WSTR(unicode));
774        _PyUnicode_WSTR(unicode) = NULL;
775    }
776#ifdef Py_DEBUG
777    unicode_fill_invalid(unicode, old_length);
778#endif
779    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
780                    length, 0);
781    assert(_PyUnicode_CheckConsistency(unicode, 0));
782    return unicode;
783}
784
785static int
786resize_inplace(PyObject *unicode, Py_ssize_t length)
787{
788    wchar_t *wstr;
789    Py_ssize_t new_size;
790    assert(!PyUnicode_IS_COMPACT(unicode));
791    assert(Py_REFCNT(unicode) == 1);
792
793    if (PyUnicode_IS_READY(unicode)) {
794        Py_ssize_t char_size;
795        int share_wstr, share_utf8;
796        void *data;
797#ifdef Py_DEBUG
798        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
799#endif
800
801        data = _PyUnicode_DATA_ANY(unicode);
802        char_size = PyUnicode_KIND(unicode);
803        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
804        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
805
806        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
807            PyErr_NoMemory();
808            return -1;
809        }
810        new_size = (length + 1) * char_size;
811
812        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
813        {
814            PyObject_DEL(_PyUnicode_UTF8(unicode));
815            _PyUnicode_UTF8(unicode) = NULL;
816            _PyUnicode_UTF8_LENGTH(unicode) = 0;
817        }
818
819        data = (PyObject *)PyObject_REALLOC(data, new_size);
820        if (data == NULL) {
821            PyErr_NoMemory();
822            return -1;
823        }
824        _PyUnicode_DATA_ANY(unicode) = data;
825        if (share_wstr) {
826            _PyUnicode_WSTR(unicode) = data;
827            _PyUnicode_WSTR_LENGTH(unicode) = length;
828        }
829        if (share_utf8) {
830            _PyUnicode_UTF8(unicode) = data;
831            _PyUnicode_UTF8_LENGTH(unicode) = length;
832        }
833        _PyUnicode_LENGTH(unicode) = length;
834        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
835#ifdef Py_DEBUG
836        unicode_fill_invalid(unicode, old_length);
837#endif
838        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
839            assert(_PyUnicode_CheckConsistency(unicode, 0));
840            return 0;
841        }
842    }
843    assert(_PyUnicode_WSTR(unicode) != NULL);
844
845    /* check for integer overflow */
846    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
847        PyErr_NoMemory();
848        return -1;
849    }
850    new_size = sizeof(wchar_t) * (length + 1);
851    wstr =  _PyUnicode_WSTR(unicode);
852    wstr = PyObject_REALLOC(wstr, new_size);
853    if (!wstr) {
854        PyErr_NoMemory();
855        return -1;
856    }
857    _PyUnicode_WSTR(unicode) = wstr;
858    _PyUnicode_WSTR(unicode)[length] = 0;
859    _PyUnicode_WSTR_LENGTH(unicode) = length;
860    assert(_PyUnicode_CheckConsistency(unicode, 0));
861    return 0;
862}
863
864static PyObject*
865resize_copy(PyObject *unicode, Py_ssize_t length)
866{
867    Py_ssize_t copy_length;
868    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
869        PyObject *copy;
870
871        if (PyUnicode_READY(unicode) == -1)
872            return NULL;
873
874        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
875        if (copy == NULL)
876            return NULL;
877
878        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
879        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
880        return copy;
881    }
882    else {
883        PyObject *w;
884
885        w = (PyObject*)_PyUnicode_New(length);
886        if (w == NULL)
887            return NULL;
888        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
889        copy_length = Py_MIN(copy_length, length);
890        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
891                  copy_length * sizeof(wchar_t));
892        return w;
893    }
894}
895
/* We allocate one extra character (not just one byte) to make sure the
   string is U+0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could be further enhanced by ensuring that the
   free list never shrinks below a size of 1.

*/
904
905static PyUnicodeObject *
906_PyUnicode_New(Py_ssize_t length)
907{
908    PyUnicodeObject *unicode;
909    size_t new_size;
910
911    /* Optimization for empty strings */
912    if (length == 0 && unicode_empty != NULL) {
913        Py_INCREF(unicode_empty);
914        return (PyUnicodeObject*)unicode_empty;
915    }
916
917    /* Ensure we won't overflow the size. */
918    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
919        return (PyUnicodeObject *)PyErr_NoMemory();
920    }
921    if (length < 0) {
922        PyErr_SetString(PyExc_SystemError,
923                        "Negative size passed to _PyUnicode_New");
924        return NULL;
925    }
926
927    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
928    if (unicode == NULL)
929        return NULL;
930    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
931
932    _PyUnicode_WSTR_LENGTH(unicode) = length;
933    _PyUnicode_HASH(unicode) = -1;
934    _PyUnicode_STATE(unicode).interned = 0;
935    _PyUnicode_STATE(unicode).kind = 0;
936    _PyUnicode_STATE(unicode).compact = 0;
937    _PyUnicode_STATE(unicode).ready = 0;
938    _PyUnicode_STATE(unicode).ascii = 0;
939    _PyUnicode_DATA_ANY(unicode) = NULL;
940    _PyUnicode_LENGTH(unicode) = 0;
941    _PyUnicode_UTF8(unicode) = NULL;
942    _PyUnicode_UTF8_LENGTH(unicode) = 0;
943
944    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
945    if (!_PyUnicode_WSTR(unicode)) {
946        Py_DECREF(unicode);
947        PyErr_NoMemory();
948        return NULL;
949    }
950
951    /* Initialize the first element to guard against cases where
952     * the caller fails before initializing str -- unicode_resize()
953     * reads str[0], and the Keep-Alive optimization can keep memory
954     * allocated for str alive across a call to unicode_dealloc(unicode).
955     * We don't want unicode_resize to read uninitialized memory in
956     * that case.
957     */
958    _PyUnicode_WSTR(unicode)[0] = 0;
959    _PyUnicode_WSTR(unicode)[length] = 0;
960
961    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
962    return unicode;
963}
964
965static const char*
966unicode_kind_name(PyObject *unicode)
967{
968    /* don't check consistency: unicode_kind_name() is called from
969       _PyUnicode_Dump() */
970    if (!PyUnicode_IS_COMPACT(unicode))
971    {
972        if (!PyUnicode_IS_READY(unicode))
973            return "wstr";
974        switch (PyUnicode_KIND(unicode))
975        {
976        case PyUnicode_1BYTE_KIND:
977            if (PyUnicode_IS_ASCII(unicode))
978                return "legacy ascii";
979            else
980                return "legacy latin1";
981        case PyUnicode_2BYTE_KIND:
982            return "legacy UCS2";
983        case PyUnicode_4BYTE_KIND:
984            return "legacy UCS4";
985        default:
986            return "<legacy invalid kind>";
987        }
988    }
989    assert(PyUnicode_IS_READY(unicode));
990    switch (PyUnicode_KIND(unicode)) {
991    case PyUnicode_1BYTE_KIND:
992        if (PyUnicode_IS_ASCII(unicode))
993            return "ascii";
994        else
995            return "latin1";
996    case PyUnicode_2BYTE_KIND:
997        return "UCS2";
998    case PyUnicode_4BYTE_KIND:
999        return "UCS4";
1000    default:
1001        return "<invalid compact kind>";
1002    }
1003}
1004
1005#ifdef Py_DEBUG
1006/* Functions wrapping macros for use in debugger */
/* Debugger helper: function form of the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1010
/* Debugger helper: function form of the _PyUnicode_COMPACT_DATA() macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1014void *_PyUnicode_data(void *unicode){
1015    printf("obj %p\n", unicode);
1016    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1017    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1018    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1019    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1020    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1021    return PyUnicode_DATA(unicode);
1022}
1023
1024void
1025_PyUnicode_Dump(PyObject *op)
1026{
1027    PyASCIIObject *ascii = (PyASCIIObject *)op;
1028    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1029    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1030    void *data;
1031
1032    if (ascii->state.compact)
1033    {
1034        if (ascii->state.ascii)
1035            data = (ascii + 1);
1036        else
1037            data = (compact + 1);
1038    }
1039    else
1040        data = unicode->data.any;
1041    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1042           unicode_kind_name(op), ascii->length);
1043
1044    if (ascii->wstr == data)
1045        printf("shared ");
1046    printf("wstr=%p", ascii->wstr);
1047
1048    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1049        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1050        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1051            printf("shared ");
1052        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1053               compact->utf8, compact->utf8_length);
1054    }
1055    printf(", data=%p\n", data);
1056}
1057#endif
1058
1059PyObject *
1060PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1061{
1062    PyObject *obj;
1063    PyCompactUnicodeObject *unicode;
1064    void *data;
1065    enum PyUnicode_Kind kind;
1066    int is_sharing, is_ascii;
1067    Py_ssize_t char_size;
1068    Py_ssize_t struct_size;
1069
1070    /* Optimization for empty strings */
1071    if (size == 0 && unicode_empty != NULL) {
1072        Py_INCREF(unicode_empty);
1073        return unicode_empty;
1074    }
1075
1076    is_ascii = 0;
1077    is_sharing = 0;
1078    struct_size = sizeof(PyCompactUnicodeObject);
1079    if (maxchar < 128) {
1080        kind = PyUnicode_1BYTE_KIND;
1081        char_size = 1;
1082        is_ascii = 1;
1083        struct_size = sizeof(PyASCIIObject);
1084    }
1085    else if (maxchar < 256) {
1086        kind = PyUnicode_1BYTE_KIND;
1087        char_size = 1;
1088    }
1089    else if (maxchar < 65536) {
1090        kind = PyUnicode_2BYTE_KIND;
1091        char_size = 2;
1092        if (sizeof(wchar_t) == 2)
1093            is_sharing = 1;
1094    }
1095    else {
1096        if (maxchar > MAX_UNICODE) {
1097            PyErr_SetString(PyExc_SystemError,
1098                            "invalid maximum character passed to PyUnicode_New");
1099            return NULL;
1100        }
1101        kind = PyUnicode_4BYTE_KIND;
1102        char_size = 4;
1103        if (sizeof(wchar_t) == 4)
1104            is_sharing = 1;
1105    }
1106
1107    /* Ensure we won't overflow the size. */
1108    if (size < 0) {
1109        PyErr_SetString(PyExc_SystemError,
1110                        "Negative size passed to PyUnicode_New");
1111        return NULL;
1112    }
1113    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1114        return PyErr_NoMemory();
1115
1116    /* Duplicated allocation code from _PyObject_New() instead of a call to
1117     * PyObject_New() so we are able to allocate space for the object and
1118     * it's data buffer.
1119     */
1120    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1121    if (obj == NULL)
1122        return PyErr_NoMemory();
1123    obj = PyObject_INIT(obj, &PyUnicode_Type);
1124    if (obj == NULL)
1125        return NULL;
1126
1127    unicode = (PyCompactUnicodeObject *)obj;
1128    if (is_ascii)
1129        data = ((PyASCIIObject*)obj) + 1;
1130    else
1131        data = unicode + 1;
1132    _PyUnicode_LENGTH(unicode) = size;
1133    _PyUnicode_HASH(unicode) = -1;
1134    _PyUnicode_STATE(unicode).interned = 0;
1135    _PyUnicode_STATE(unicode).kind = kind;
1136    _PyUnicode_STATE(unicode).compact = 1;
1137    _PyUnicode_STATE(unicode).ready = 1;
1138    _PyUnicode_STATE(unicode).ascii = is_ascii;
1139    if (is_ascii) {
1140        ((char*)data)[size] = 0;
1141        _PyUnicode_WSTR(unicode) = NULL;
1142    }
1143    else if (kind == PyUnicode_1BYTE_KIND) {
1144        ((char*)data)[size] = 0;
1145        _PyUnicode_WSTR(unicode) = NULL;
1146        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1147        unicode->utf8 = NULL;
1148        unicode->utf8_length = 0;
1149    }
1150    else {
1151        unicode->utf8 = NULL;
1152        unicode->utf8_length = 0;
1153        if (kind == PyUnicode_2BYTE_KIND)
1154            ((Py_UCS2*)data)[size] = 0;
1155        else /* kind == PyUnicode_4BYTE_KIND */
1156            ((Py_UCS4*)data)[size] = 0;
1157        if (is_sharing) {
1158            _PyUnicode_WSTR_LENGTH(unicode) = size;
1159            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1160        }
1161        else {
1162            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1163            _PyUnicode_WSTR(unicode) = NULL;
1164        }
1165    }
1166#ifdef Py_DEBUG
1167    unicode_fill_invalid((PyObject*)unicode, 0);
1168#endif
1169    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1170    return obj;
1171}
1172
1173#if SIZEOF_WCHAR_T == 2
1174/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1175   will decode surrogate pairs, the other conversions are implemented as macros
1176   for efficiency.
1177
1178   This function assumes that unicode can hold one more code point than wstr
1179   characters for a terminating null character. */
1180static void
1181unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1182                              PyObject *unicode)
1183{
1184    const wchar_t *iter;
1185    Py_UCS4 *ucs4_out;
1186
1187    assert(unicode != NULL);
1188    assert(_PyUnicode_CHECK(unicode));
1189    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1190    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1191
1192    for (iter = begin; iter < end; ) {
1193        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1194                           _PyUnicode_GET_LENGTH(unicode)));
1195        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1196            && (iter+1) < end
1197            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1198        {
1199            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1200            iter += 2;
1201        }
1202        else {
1203            *ucs4_out++ = *iter;
1204            iter++;
1205        }
1206    }
1207    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1208                        _PyUnicode_GET_LENGTH(unicode)));
1209
1210}
1211#endif
1212
1213static int
1214unicode_check_modifiable(PyObject *unicode)
1215{
1216    if (!unicode_modifiable(unicode)) {
1217        PyErr_SetString(PyExc_SystemError,
1218                        "Cannot modify a string currently used");
1219        return -1;
1220    }
1221    return 0;
1222}
1223
1224static int
1225_copy_characters(PyObject *to, Py_ssize_t to_start,
1226                 PyObject *from, Py_ssize_t from_start,
1227                 Py_ssize_t how_many, int check_maxchar)
1228{
1229    unsigned int from_kind, to_kind;
1230    void *from_data, *to_data;
1231
1232    assert(0 <= how_many);
1233    assert(0 <= from_start);
1234    assert(0 <= to_start);
1235    assert(PyUnicode_Check(from));
1236    assert(PyUnicode_IS_READY(from));
1237    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1238
1239    assert(PyUnicode_Check(to));
1240    assert(PyUnicode_IS_READY(to));
1241    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1242
1243    if (how_many == 0)
1244        return 0;
1245
1246    from_kind = PyUnicode_KIND(from);
1247    from_data = PyUnicode_DATA(from);
1248    to_kind = PyUnicode_KIND(to);
1249    to_data = PyUnicode_DATA(to);
1250
1251#ifdef Py_DEBUG
1252    if (!check_maxchar
1253        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1254    {
1255        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1256        Py_UCS4 ch;
1257        Py_ssize_t i;
1258        for (i=0; i < how_many; i++) {
1259            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1260            assert(ch <= to_maxchar);
1261        }
1262    }
1263#endif
1264
1265    if (from_kind == to_kind) {
1266        if (check_maxchar
1267            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1268        {
1269            /* Writing Latin-1 characters into an ASCII string requires to
1270               check that all written characters are pure ASCII */
1271            Py_UCS4 max_char;
1272            max_char = ucs1lib_find_max_char(from_data,
1273                                             (Py_UCS1*)from_data + how_many);
1274            if (max_char >= 128)
1275                return -1;
1276        }
1277        Py_MEMCPY((char*)to_data + to_kind * to_start,
1278                  (char*)from_data + from_kind * from_start,
1279                  to_kind * how_many);
1280    }
1281    else if (from_kind == PyUnicode_1BYTE_KIND
1282             && to_kind == PyUnicode_2BYTE_KIND)
1283    {
1284        _PyUnicode_CONVERT_BYTES(
1285            Py_UCS1, Py_UCS2,
1286            PyUnicode_1BYTE_DATA(from) + from_start,
1287            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1288            PyUnicode_2BYTE_DATA(to) + to_start
1289            );
1290    }
1291    else if (from_kind == PyUnicode_1BYTE_KIND
1292             && to_kind == PyUnicode_4BYTE_KIND)
1293    {
1294        _PyUnicode_CONVERT_BYTES(
1295            Py_UCS1, Py_UCS4,
1296            PyUnicode_1BYTE_DATA(from) + from_start,
1297            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1298            PyUnicode_4BYTE_DATA(to) + to_start
1299            );
1300    }
1301    else if (from_kind == PyUnicode_2BYTE_KIND
1302             && to_kind == PyUnicode_4BYTE_KIND)
1303    {
1304        _PyUnicode_CONVERT_BYTES(
1305            Py_UCS2, Py_UCS4,
1306            PyUnicode_2BYTE_DATA(from) + from_start,
1307            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1308            PyUnicode_4BYTE_DATA(to) + to_start
1309            );
1310    }
1311    else {
1312        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1313
1314        if (!check_maxchar) {
1315            if (from_kind == PyUnicode_2BYTE_KIND
1316                && to_kind == PyUnicode_1BYTE_KIND)
1317            {
1318                _PyUnicode_CONVERT_BYTES(
1319                    Py_UCS2, Py_UCS1,
1320                    PyUnicode_2BYTE_DATA(from) + from_start,
1321                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1322                    PyUnicode_1BYTE_DATA(to) + to_start
1323                    );
1324            }
1325            else if (from_kind == PyUnicode_4BYTE_KIND
1326                     && to_kind == PyUnicode_1BYTE_KIND)
1327            {
1328                _PyUnicode_CONVERT_BYTES(
1329                    Py_UCS4, Py_UCS1,
1330                    PyUnicode_4BYTE_DATA(from) + from_start,
1331                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1332                    PyUnicode_1BYTE_DATA(to) + to_start
1333                    );
1334            }
1335            else if (from_kind == PyUnicode_4BYTE_KIND
1336                     && to_kind == PyUnicode_2BYTE_KIND)
1337            {
1338                _PyUnicode_CONVERT_BYTES(
1339                    Py_UCS4, Py_UCS2,
1340                    PyUnicode_4BYTE_DATA(from) + from_start,
1341                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1342                    PyUnicode_2BYTE_DATA(to) + to_start
1343                    );
1344            }
1345            else {
1346                assert(0);
1347                return -1;
1348            }
1349        }
1350        else {
1351            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1352            Py_UCS4 ch;
1353            Py_ssize_t i;
1354
1355            for (i=0; i < how_many; i++) {
1356                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1357                if (ch > to_maxchar)
1358                    return -1;
1359                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1360            }
1361        }
1362    }
1363    return 0;
1364}
1365
1366void
1367_PyUnicode_FastCopyCharacters(
1368    PyObject *to, Py_ssize_t to_start,
1369    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1370{
1371    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1372}
1373
1374Py_ssize_t
1375PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1376                         PyObject *from, Py_ssize_t from_start,
1377                         Py_ssize_t how_many)
1378{
1379    int err;
1380
1381    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1382        PyErr_BadInternalCall();
1383        return -1;
1384    }
1385
1386    if (PyUnicode_READY(from) == -1)
1387        return -1;
1388    if (PyUnicode_READY(to) == -1)
1389        return -1;
1390
1391    if (from_start < 0) {
1392        PyErr_SetString(PyExc_IndexError, "string index out of range");
1393        return -1;
1394    }
1395    if (to_start < 0) {
1396        PyErr_SetString(PyExc_IndexError, "string index out of range");
1397        return -1;
1398    }
1399    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1400    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1401        PyErr_Format(PyExc_SystemError,
1402                     "Cannot write %zi characters at %zi "
1403                     "in a string of %zi characters",
1404                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1405        return -1;
1406    }
1407
1408    if (how_many == 0)
1409        return 0;
1410
1411    if (unicode_check_modifiable(to))
1412        return -1;
1413
1414    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1415    if (err) {
1416        PyErr_Format(PyExc_SystemError,
1417                     "Cannot copy %s characters "
1418                     "into a string of %s characters",
1419                     unicode_kind_name(from),
1420                     unicode_kind_name(to));
1421        return -1;
1422    }
1423    return how_many;
1424}
1425
1426/* Find the maximum code point and count the number of surrogate pairs so a
1427   correct string length can be computed before converting a string to UCS4.
1428   This function counts single surrogates as a character and not as a pair.
1429
1430   Return 0 on success, or -1 on error. */
1431static int
1432find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1433                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1434{
1435    const wchar_t *iter;
1436    Py_UCS4 ch;
1437
1438    assert(num_surrogates != NULL && maxchar != NULL);
1439    *num_surrogates = 0;
1440    *maxchar = 0;
1441
1442    for (iter = begin; iter < end; ) {
1443#if SIZEOF_WCHAR_T == 2
1444        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1445            && (iter+1) < end
1446            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1447        {
1448            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1449            ++(*num_surrogates);
1450            iter += 2;
1451        }
1452        else
1453#endif
1454        {
1455            ch = *iter;
1456            iter++;
1457        }
1458        if (ch > *maxchar) {
1459            *maxchar = ch;
1460            if (*maxchar > MAX_UNICODE) {
1461                PyErr_Format(PyExc_ValueError,
1462                             "character U+%x is not in range [U+0000; U+10ffff]",
1463                             ch);
1464                return -1;
1465            }
1466        }
1467    }
1468    return 0;
1469}
1470
/* Give a legacy string that only has a wchar_t buffer its canonical
   representation and mark it ready.

   The wchar_t buffer is scanned for its largest code point, which selects
   1-byte, 2-byte or 4-byte storage.  When the selected width equals
   sizeof(wchar_t) the data pointer simply shares the wchar_t buffer;
   otherwise a new buffer is allocated, the characters are converted into
   it, and the wchar_t buffer is freed.  For pure ASCII content the utf8
   pointer is set to the new data buffer as well.

   Return 0 on success.  Return -1 with an exception set if the buffer
   contains a code point outside the Unicode range (reported by
   find_maxchar_surrogates()) or if memory runs out. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* One pass over the buffer yields both the maximum code point and the
       number of surrogate pairs. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    /* Latin-1 range: always needs a new, narrower 1-byte buffer. */
    if (maxchar < 256) {
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* Pure ASCII: the data buffer doubles as the UTF-8 form. */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        /* The wchar_t buffer is no longer needed. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* The narrowed copy replaces the wchar_t buffer. */
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* Each surrogate pair collapses into a single code point. */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* Guard the 4 * (length + 1) byte count against overflow. */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the buffer. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* Terminate the 4-byte data in both configurations. */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1599
1600static void
1601unicode_dealloc(PyObject *unicode)
1602{
1603    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1604    case SSTATE_NOT_INTERNED:
1605        break;
1606
1607    case SSTATE_INTERNED_MORTAL:
1608        /* revive dead object temporarily for DelItem */
1609        Py_REFCNT(unicode) = 3;
1610        if (PyDict_DelItem(interned, unicode) != 0)
1611            Py_FatalError(
1612                "deletion of interned string failed");
1613        break;
1614
1615    case SSTATE_INTERNED_IMMORTAL:
1616        Py_FatalError("Immortal interned string died.");
1617
1618    default:
1619        Py_FatalError("Inconsistent interned string state.");
1620    }
1621
1622    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1623        PyObject_DEL(_PyUnicode_WSTR(unicode));
1624    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1625        PyObject_DEL(_PyUnicode_UTF8(unicode));
1626    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1627        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1628
1629    Py_TYPE(unicode)->tp_free(unicode);
1630}
1631
1632#ifdef Py_DEBUG
1633static int
1634unicode_is_singleton(PyObject *unicode)
1635{
1636    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1637    if (unicode == unicode_empty)
1638        return 1;
1639    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1640    {
1641        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1642        if (ch < 256 && unicode_latin1[ch] == unicode)
1643            return 1;
1644    }
1645    return 0;
1646}
1647#endif
1648
1649static int
1650unicode_modifiable(PyObject *unicode)
1651{
1652    assert(_PyUnicode_CHECK(unicode));
1653    if (Py_REFCNT(unicode) != 1)
1654        return 0;
1655    if (_PyUnicode_HASH(unicode) != -1)
1656        return 0;
1657    if (PyUnicode_CHECK_INTERNED(unicode))
1658        return 0;
1659    if (!PyUnicode_CheckExact(unicode))
1660        return 0;
1661#ifdef Py_DEBUG
1662    /* singleton refcount is greater than 1 */
1663    assert(!unicode_is_singleton(unicode));
1664#endif
1665    return 1;
1666}
1667
1668static int
1669unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1670{
1671    PyObject *unicode;
1672    Py_ssize_t old_length;
1673
1674    assert(p_unicode != NULL);
1675    unicode = *p_unicode;
1676
1677    assert(unicode != NULL);
1678    assert(PyUnicode_Check(unicode));
1679    assert(0 <= length);
1680
1681    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1682        old_length = PyUnicode_WSTR_LENGTH(unicode);
1683    else
1684        old_length = PyUnicode_GET_LENGTH(unicode);
1685    if (old_length == length)
1686        return 0;
1687
1688    if (length == 0) {
1689        _Py_INCREF_UNICODE_EMPTY();
1690        if (!unicode_empty)
1691            return -1;
1692        Py_DECREF(*p_unicode);
1693        *p_unicode = unicode_empty;
1694        return 0;
1695    }
1696
1697    if (!unicode_modifiable(unicode)) {
1698        PyObject *copy = resize_copy(unicode, length);
1699        if (copy == NULL)
1700            return -1;
1701        Py_DECREF(*p_unicode);
1702        *p_unicode = copy;
1703        return 0;
1704    }
1705
1706    if (PyUnicode_IS_COMPACT(unicode)) {
1707        PyObject *new_unicode = resize_compact(unicode, length);
1708        if (new_unicode == NULL)
1709            return -1;
1710        *p_unicode = new_unicode;
1711        return 0;
1712    }
1713    return resize_inplace(unicode, length);
1714}
1715
1716int
1717PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1718{
1719    PyObject *unicode;
1720    if (p_unicode == NULL) {
1721        PyErr_BadInternalCall();
1722        return -1;
1723    }
1724    unicode = *p_unicode;
1725    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1726    {
1727        PyErr_BadInternalCall();
1728        return -1;
1729    }
1730    return unicode_resize(p_unicode, length);
1731}
1732
1733/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1734
1735   WARNING: The function doesn't copy the terminating null character and
1736   doesn't check the maximum character (may write a latin1 character in an
1737   ASCII string). */
1738static void
1739unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1740                   const char *str, Py_ssize_t len)
1741{
1742    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1743    void *data = PyUnicode_DATA(unicode);
1744    const char *end = str + len;
1745
1746    switch (kind) {
1747    case PyUnicode_1BYTE_KIND: {
1748        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1749#ifdef Py_DEBUG
1750        if (PyUnicode_IS_ASCII(unicode)) {
1751            Py_UCS4 maxchar = ucs1lib_find_max_char(
1752                (const Py_UCS1*)str,
1753                (const Py_UCS1*)str + len);
1754            assert(maxchar < 128);
1755        }
1756#endif
1757        memcpy((char *) data + index, str, len);
1758        break;
1759    }
1760    case PyUnicode_2BYTE_KIND: {
1761        Py_UCS2 *start = (Py_UCS2 *)data + index;
1762        Py_UCS2 *ucs2 = start;
1763        assert(index <= PyUnicode_GET_LENGTH(unicode));
1764
1765        for (; str < end; ++ucs2, ++str)
1766            *ucs2 = (Py_UCS2)*str;
1767
1768        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1769        break;
1770    }
1771    default: {
1772        Py_UCS4 *start = (Py_UCS4 *)data + index;
1773        Py_UCS4 *ucs4 = start;
1774        assert(kind == PyUnicode_4BYTE_KIND);
1775        assert(index <= PyUnicode_GET_LENGTH(unicode));
1776
1777        for (; str < end; ++ucs4, ++str)
1778            *ucs4 = (Py_UCS4)*str;
1779
1780        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1781    }
1782    }
1783}
1784
1785static PyObject*
1786get_latin1_char(unsigned char ch)
1787{
1788    PyObject *unicode = unicode_latin1[ch];
1789    if (!unicode) {
1790        unicode = PyUnicode_New(1, ch);
1791        if (!unicode)
1792            return NULL;
1793        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1794        assert(_PyUnicode_CheckConsistency(unicode, 1));
1795        unicode_latin1[ch] = unicode;
1796    }
1797    Py_INCREF(unicode);
1798    return unicode;
1799}
1800
1801static PyObject*
1802unicode_char(Py_UCS4 ch)
1803{
1804    PyObject *unicode;
1805
1806    assert(ch <= MAX_UNICODE);
1807
1808    if (ch < 256)
1809        return get_latin1_char(ch);
1810
1811    unicode = PyUnicode_New(1, ch);
1812    if (unicode == NULL)
1813        return NULL;
1814    switch (PyUnicode_KIND(unicode)) {
1815    case PyUnicode_1BYTE_KIND:
1816        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1817        break;
1818    case PyUnicode_2BYTE_KIND:
1819        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1820        break;
1821    default:
1822        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1823        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1824    }
1825    assert(_PyUnicode_CheckConsistency(unicode, 1));
1826    return unicode;
1827}
1828
1829PyObject *
1830PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1831{
1832    PyObject *unicode;
1833    Py_UCS4 maxchar = 0;
1834    Py_ssize_t num_surrogates;
1835
1836    if (u == NULL)
1837        return (PyObject*)_PyUnicode_New(size);
1838
1839    /* If the Unicode data is known at construction time, we can apply
1840       some optimizations which share commonly used objects. */
1841
1842    /* Optimization for empty strings */
1843    if (size == 0)
1844        _Py_RETURN_UNICODE_EMPTY();
1845
1846    /* Single character Unicode objects in the Latin-1 range are
1847       shared when using this constructor */
1848    if (size == 1 && (Py_UCS4)*u < 256)
1849        return get_latin1_char((unsigned char)*u);
1850
1851    /* If not empty and not single character, copy the Unicode data
1852       into the new object */
1853    if (find_maxchar_surrogates(u, u + size,
1854                                &maxchar, &num_surrogates) == -1)
1855        return NULL;
1856
1857    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1858    if (!unicode)
1859        return NULL;
1860
1861    switch (PyUnicode_KIND(unicode)) {
1862    case PyUnicode_1BYTE_KIND:
1863        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1864                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1865        break;
1866    case PyUnicode_2BYTE_KIND:
1867#if Py_UNICODE_SIZE == 2
1868        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1869#else
1870        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1871                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1872#endif
1873        break;
1874    case PyUnicode_4BYTE_KIND:
1875#if SIZEOF_WCHAR_T == 2
1876        /* This is the only case which has to process surrogates, thus
1877           a simple copy loop is not enough and we need a function. */
1878        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1879#else
1880        assert(num_surrogates == 0);
1881        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1882#endif
1883        break;
1884    default:
1885        assert(0 && "Impossible state");
1886    }
1887
1888    return unicode_result(unicode);
1889}
1890
1891PyObject *
1892PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1893{
1894    if (size < 0) {
1895        PyErr_SetString(PyExc_SystemError,
1896                        "Negative size passed to PyUnicode_FromStringAndSize");
1897        return NULL;
1898    }
1899    if (u != NULL)
1900        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1901    else
1902        return (PyObject *)_PyUnicode_New(size);
1903}
1904
1905PyObject *
1906PyUnicode_FromString(const char *u)
1907{
1908    size_t size = strlen(u);
1909    if (size > PY_SSIZE_T_MAX) {
1910        PyErr_SetString(PyExc_OverflowError, "input too long");
1911        return NULL;
1912    }
1913    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1914}
1915
1916PyObject *
1917_PyUnicode_FromId(_Py_Identifier *id)
1918{
1919    if (!id->object) {
1920        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1921                                                  strlen(id->string),
1922                                                  NULL, NULL);
1923        if (!id->object)
1924            return NULL;
1925        PyUnicode_InternInPlace(&id->object);
1926        assert(!id->next);
1927        id->next = static_strings;
1928        static_strings = id;
1929    }
1930    return id->object;
1931}
1932
1933void
1934_PyUnicode_ClearStaticStrings()
1935{
1936    _Py_Identifier *tmp, *s = static_strings;
1937    while (s) {
1938        Py_CLEAR(s->object);
1939        tmp = s->next;
1940        s->next = NULL;
1941        s = tmp;
1942    }
1943    static_strings = NULL;
1944}
1945
1946/* Internal function, doesn't check maximum character */
1947
1948PyObject*
1949_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1950{
1951    const unsigned char *s = (const unsigned char *)buffer;
1952    PyObject *unicode;
1953    if (size == 1) {
1954#ifdef Py_DEBUG
1955        assert((unsigned char)s[0] < 128);
1956#endif
1957        return get_latin1_char(s[0]);
1958    }
1959    unicode = PyUnicode_New(size, 127);
1960    if (!unicode)
1961        return NULL;
1962    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1963    assert(_PyUnicode_CheckConsistency(unicode, 1));
1964    return unicode;
1965}
1966
1967static Py_UCS4
1968kind_maxchar_limit(unsigned int kind)
1969{
1970    switch (kind) {
1971    case PyUnicode_1BYTE_KIND:
1972        return 0x80;
1973    case PyUnicode_2BYTE_KIND:
1974        return 0x100;
1975    case PyUnicode_4BYTE_KIND:
1976        return 0x10000;
1977    default:
1978        assert(0 && "invalid kind");
1979        return MAX_UNICODE;
1980    }
1981}
1982
1983Py_LOCAL_INLINE(Py_UCS4)
1984align_maxchar(Py_UCS4 maxchar)
1985{
1986    if (maxchar <= 127)
1987        return 127;
1988    else if (maxchar <= 255)
1989        return 255;
1990    else if (maxchar <= 65535)
1991        return 65535;
1992    else
1993        return MAX_UNICODE;
1994}
1995
1996static PyObject*
1997_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1998{
1999    PyObject *res;
2000    unsigned char max_char;
2001
2002    if (size == 0)
2003        _Py_RETURN_UNICODE_EMPTY();
2004    assert(size > 0);
2005    if (size == 1)
2006        return get_latin1_char(u[0]);
2007
2008    max_char = ucs1lib_find_max_char(u, u + size);
2009    res = PyUnicode_New(size, max_char);
2010    if (!res)
2011        return NULL;
2012    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2013    assert(_PyUnicode_CheckConsistency(res, 1));
2014    return res;
2015}
2016
2017static PyObject*
2018_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2019{
2020    PyObject *res;
2021    Py_UCS2 max_char;
2022
2023    if (size == 0)
2024        _Py_RETURN_UNICODE_EMPTY();
2025    assert(size > 0);
2026    if (size == 1)
2027        return unicode_char(u[0]);
2028
2029    max_char = ucs2lib_find_max_char(u, u + size);
2030    res = PyUnicode_New(size, max_char);
2031    if (!res)
2032        return NULL;
2033    if (max_char >= 256)
2034        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2035    else {
2036        _PyUnicode_CONVERT_BYTES(
2037            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2038    }
2039    assert(_PyUnicode_CheckConsistency(res, 1));
2040    return res;
2041}
2042
2043static PyObject*
2044_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2045{
2046    PyObject *res;
2047    Py_UCS4 max_char;
2048
2049    if (size == 0)
2050        _Py_RETURN_UNICODE_EMPTY();
2051    assert(size > 0);
2052    if (size == 1)
2053        return unicode_char(u[0]);
2054
2055    max_char = ucs4lib_find_max_char(u, u + size);
2056    res = PyUnicode_New(size, max_char);
2057    if (!res)
2058        return NULL;
2059    if (max_char < 256)
2060        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2061                                 PyUnicode_1BYTE_DATA(res));
2062    else if (max_char < 0x10000)
2063        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2064                                 PyUnicode_2BYTE_DATA(res));
2065    else
2066        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2067    assert(_PyUnicode_CheckConsistency(res, 1));
2068    return res;
2069}
2070
2071PyObject*
2072PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2073{
2074    if (size < 0) {
2075        PyErr_SetString(PyExc_ValueError, "size must be positive");
2076        return NULL;
2077    }
2078    switch (kind) {
2079    case PyUnicode_1BYTE_KIND:
2080        return _PyUnicode_FromUCS1(buffer, size);
2081    case PyUnicode_2BYTE_KIND:
2082        return _PyUnicode_FromUCS2(buffer, size);
2083    case PyUnicode_4BYTE_KIND:
2084        return _PyUnicode_FromUCS4(buffer, size);
2085    default:
2086        PyErr_SetString(PyExc_SystemError, "invalid kind");
2087        return NULL;
2088    }
2089}
2090
2091Py_UCS4
2092_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2093{
2094    enum PyUnicode_Kind kind;
2095    void *startptr, *endptr;
2096
2097    assert(PyUnicode_IS_READY(unicode));
2098    assert(0 <= start);
2099    assert(end <= PyUnicode_GET_LENGTH(unicode));
2100    assert(start <= end);
2101
2102    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2103        return PyUnicode_MAX_CHAR_VALUE(unicode);
2104
2105    if (start == end)
2106        return 127;
2107
2108    if (PyUnicode_IS_ASCII(unicode))
2109        return 127;
2110
2111    kind = PyUnicode_KIND(unicode);
2112    startptr = PyUnicode_DATA(unicode);
2113    endptr = (char *)startptr + end * kind;
2114    startptr = (char *)startptr + start * kind;
2115    switch(kind) {
2116    case PyUnicode_1BYTE_KIND:
2117        return ucs1lib_find_max_char(startptr, endptr);
2118    case PyUnicode_2BYTE_KIND:
2119        return ucs2lib_find_max_char(startptr, endptr);
2120    case PyUnicode_4BYTE_KIND:
2121        return ucs4lib_find_max_char(startptr, endptr);
2122    default:
2123        assert(0);
2124        return 0;
2125    }
2126}
2127
2128/* Ensure that a string uses the most efficient storage, if it is not the
2129   case: create a new string with of the right kind. Write NULL into *p_unicode
2130   on error. */
2131static void
2132unicode_adjust_maxchar(PyObject **p_unicode)
2133{
2134    PyObject *unicode, *copy;
2135    Py_UCS4 max_char;
2136    Py_ssize_t len;
2137    unsigned int kind;
2138
2139    assert(p_unicode != NULL);
2140    unicode = *p_unicode;
2141    assert(PyUnicode_IS_READY(unicode));
2142    if (PyUnicode_IS_ASCII(unicode))
2143        return;
2144
2145    len = PyUnicode_GET_LENGTH(unicode);
2146    kind = PyUnicode_KIND(unicode);
2147    if (kind == PyUnicode_1BYTE_KIND) {
2148        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2149        max_char = ucs1lib_find_max_char(u, u + len);
2150        if (max_char >= 128)
2151            return;
2152    }
2153    else if (kind == PyUnicode_2BYTE_KIND) {
2154        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2155        max_char = ucs2lib_find_max_char(u, u + len);
2156        if (max_char >= 256)
2157            return;
2158    }
2159    else {
2160        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2161        assert(kind == PyUnicode_4BYTE_KIND);
2162        max_char = ucs4lib_find_max_char(u, u + len);
2163        if (max_char >= 0x10000)
2164            return;
2165    }
2166    copy = PyUnicode_New(len, max_char);
2167    if (copy != NULL)
2168        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2169    Py_DECREF(unicode);
2170    *p_unicode = copy;
2171}
2172
2173PyObject*
2174_PyUnicode_Copy(PyObject *unicode)
2175{
2176    Py_ssize_t length;
2177    PyObject *copy;
2178
2179    if (!PyUnicode_Check(unicode)) {
2180        PyErr_BadInternalCall();
2181        return NULL;
2182    }
2183    if (PyUnicode_READY(unicode) == -1)
2184        return NULL;
2185
2186    length = PyUnicode_GET_LENGTH(unicode);
2187    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2188    if (!copy)
2189        return NULL;
2190    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2191
2192    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2193              length * PyUnicode_KIND(unicode));
2194    assert(_PyUnicode_CheckConsistency(copy, 1));
2195    return copy;
2196}
2197
2198
2199/* Widen Unicode objects to larger buffers. Don't write terminating null
2200   character. Return NULL on error. */
2201
2202void*
2203_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2204{
2205    Py_ssize_t len;
2206    void *result;
2207    unsigned int skind;
2208
2209    if (PyUnicode_READY(s) == -1)
2210        return NULL;
2211
2212    len = PyUnicode_GET_LENGTH(s);
2213    skind = PyUnicode_KIND(s);
2214    if (skind >= kind) {
2215        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2216        return NULL;
2217    }
2218    switch (kind) {
2219    case PyUnicode_2BYTE_KIND:
2220        result = PyMem_New(Py_UCS2, len);
2221        if (!result)
2222            return PyErr_NoMemory();
2223        assert(skind == PyUnicode_1BYTE_KIND);
2224        _PyUnicode_CONVERT_BYTES(
2225            Py_UCS1, Py_UCS2,
2226            PyUnicode_1BYTE_DATA(s),
2227            PyUnicode_1BYTE_DATA(s) + len,
2228            result);
2229        return result;
2230    case PyUnicode_4BYTE_KIND:
2231        result = PyMem_New(Py_UCS4, len);
2232        if (!result)
2233            return PyErr_NoMemory();
2234        if (skind == PyUnicode_2BYTE_KIND) {
2235            _PyUnicode_CONVERT_BYTES(
2236                Py_UCS2, Py_UCS4,
2237                PyUnicode_2BYTE_DATA(s),
2238                PyUnicode_2BYTE_DATA(s) + len,
2239                result);
2240        }
2241        else {
2242            assert(skind == PyUnicode_1BYTE_KIND);
2243            _PyUnicode_CONVERT_BYTES(
2244                Py_UCS1, Py_UCS4,
2245                PyUnicode_1BYTE_DATA(s),
2246                PyUnicode_1BYTE_DATA(s) + len,
2247                result);
2248        }
2249        return result;
2250    default:
2251        break;
2252    }
2253    PyErr_SetString(PyExc_SystemError, "invalid kind");
2254    return NULL;
2255}
2256
2257static Py_UCS4*
2258as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2259        int copy_null)
2260{
2261    int kind;
2262    void *data;
2263    Py_ssize_t len, targetlen;
2264    if (PyUnicode_READY(string) == -1)
2265        return NULL;
2266    kind = PyUnicode_KIND(string);
2267    data = PyUnicode_DATA(string);
2268    len = PyUnicode_GET_LENGTH(string);
2269    targetlen = len;
2270    if (copy_null)
2271        targetlen++;
2272    if (!target) {
2273        target = PyMem_New(Py_UCS4, targetlen);
2274        if (!target) {
2275            PyErr_NoMemory();
2276            return NULL;
2277        }
2278    }
2279    else {
2280        if (targetsize < targetlen) {
2281            PyErr_Format(PyExc_SystemError,
2282                         "string is longer than the buffer");
2283            if (copy_null && 0 < targetsize)
2284                target[0] = 0;
2285            return NULL;
2286        }
2287    }
2288    if (kind == PyUnicode_1BYTE_KIND) {
2289        Py_UCS1 *start = (Py_UCS1 *) data;
2290        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2291    }
2292    else if (kind == PyUnicode_2BYTE_KIND) {
2293        Py_UCS2 *start = (Py_UCS2 *) data;
2294        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2295    }
2296    else {
2297        assert(kind == PyUnicode_4BYTE_KIND);
2298        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2299    }
2300    if (copy_null)
2301        target[len] = 0;
2302    return target;
2303}
2304
2305Py_UCS4*
2306PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2307                 int copy_null)
2308{
2309    if (target == NULL || targetsize < 0) {
2310        PyErr_BadInternalCall();
2311        return NULL;
2312    }
2313    return as_ucs4(string, target, targetsize, copy_null);
2314}
2315
2316Py_UCS4*
2317PyUnicode_AsUCS4Copy(PyObject *string)
2318{
2319    return as_ucs4(string, NULL, 0, 1);
2320}
2321
2322#ifdef HAVE_WCHAR_H
2323
2324PyObject *
2325PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2326{
2327    if (w == NULL) {
2328        if (size == 0)
2329            _Py_RETURN_UNICODE_EMPTY();
2330        PyErr_BadInternalCall();
2331        return NULL;
2332    }
2333
2334    if (size == -1) {
2335        size = wcslen(w);
2336    }
2337
2338    return PyUnicode_FromUnicode(w, size);
2339}
2340
2341#endif /* HAVE_WCHAR_H */
2342
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2347
2348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350                             Py_ssize_t width, Py_ssize_t precision)
2351{
2352    Py_ssize_t length, fill, arglen;
2353    Py_UCS4 maxchar;
2354
2355    if (PyUnicode_READY(str) == -1)
2356        return -1;
2357
2358    length = PyUnicode_GET_LENGTH(str);
2359    if ((precision == -1 || precision >= length)
2360        && width <= length)
2361        return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363    if (precision != -1)
2364        length = Py_MIN(precision, length);
2365
2366    arglen = Py_MAX(length, width);
2367    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369    else
2370        maxchar = writer->maxchar;
2371
2372    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373        return -1;
2374
2375    if (width > length) {
2376        fill = width - length;
2377        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378            return -1;
2379        writer->pos += fill;
2380    }
2381
2382    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383                                  str, 0, length);
2384    writer->pos += length;
2385    return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390                              Py_ssize_t width, Py_ssize_t precision)
2391{
2392    /* UTF-8 */
2393    Py_ssize_t length;
2394    PyObject *unicode;
2395    int res;
2396
2397    length = strlen(str);
2398    if (precision != -1)
2399        length = Py_MIN(length, precision);
2400    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401    if (unicode == NULL)
2402        return -1;
2403
2404    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405    Py_DECREF(unicode);
2406    return res;
2407}
2408
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' that starts the directive.  The optional '0' flag,
   width, ".precision" and the 'l', 'll' (if HAVE_LONG_LONG) and 'z' size
   modifiers (recognized only in front of 'd', 'u' or 'i') are parsed,
   then the argument(s) required by the conversion are fetched from *vargs
   and the formatted text is appended to writer.

   Supported conversions: c, i, d, u, x, p, s, U, V, S, R, A and %.

   Return a pointer to the first character after the directive.  For an
   unknown conversion, the rest of the format string (starting at the '%')
   is copied to the output and a pointer to its end is returned.
   Return NULL on error. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* p keeps pointing at the '%' so that the default case below can copy
       the directive verbatim. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Reject the next digit if appending it would overflow
               Py_ssize_t. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format
       string: turn off overallocation in the writer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single character given as an int code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* Format the bare number with sprintf(); the argument type read
           from *vargs depends on the size modifier parsed above.  Width
           and precision are applied afterwards. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* The precision is a minimum here: it never truncates the
           number. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the width, with '0' if the '0' flag was given and
           with spaces otherwise. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with zeros. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (including its null
               terminator) right by two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A str object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are consumed: a str object and a C string.  The
           object is used if it is not NULL, the C string otherwise. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2712
2713PyObject *
2714PyUnicode_FromFormatV(const char *format, va_list vargs)
2715{
2716    va_list vargs2;
2717    const char *f;
2718    _PyUnicodeWriter writer;
2719
2720    _PyUnicodeWriter_Init(&writer);
2721    writer.min_length = strlen(format) + 100;
2722    writer.overallocate = 1;
2723
2724    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2725       Copy it to be able to pass a reference to a subfunction. */
2726    Py_VA_COPY(vargs2, vargs);
2727
2728    for (f = format; *f; ) {
2729        if (*f == '%') {
2730            f = unicode_fromformat_arg(&writer, f, &vargs2);
2731            if (f == NULL)
2732                goto fail;
2733        }
2734        else {
2735            const char *p;
2736            Py_ssize_t len;
2737
2738            p = f;
2739            do
2740            {
2741                if ((unsigned char)*p > 127) {
2742                    PyErr_Format(PyExc_ValueError,
2743                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2744                        "string, got a non-ASCII byte: 0x%02x",
2745                        (unsigned char)*p);
2746                    return NULL;
2747                }
2748                p++;
2749            }
2750            while (*p != '\0' && *p != '%');
2751            len = p - f;
2752
2753            if (*p == '\0')
2754                writer.overallocate = 0;
2755
2756            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2757                goto fail;
2758
2759            f = p;
2760        }
2761    }
2762    return _PyUnicodeWriter_Finish(&writer);
2763
2764  fail:
2765    _PyUnicodeWriter_Dealloc(&writer);
2766    return NULL;
2767}
2768
2769PyObject *
2770PyUnicode_FromFormat(const char *format, ...)
2771{
2772    PyObject* ret;
2773    va_list vargs;
2774
2775#ifdef HAVE_STDARG_PROTOTYPES
2776    va_start(vargs, format);
2777#else
2778    va_start(vargs);
2779#endif
2780    ret = PyUnicode_FromFormatV(format, vargs);
2781    va_end(vargs);
2782    return ret;
2783}
2784
2785#ifdef HAVE_WCHAR_H
2786
2787/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2788   convert a Unicode object to a wide character string.
2789
2790   - If w is NULL: return the number of wide characters (including the null
2791     character) required to convert the unicode object. Ignore size argument.
2792
2793   - Otherwise: return the number of wide characters (excluding the null
2794     character) written into w. Write at most size wide characters (including
2795     the null character). */
2796static Py_ssize_t
2797unicode_aswidechar(PyObject *unicode,
2798                   wchar_t *w,
2799                   Py_ssize_t size)
2800{
2801    Py_ssize_t res;
2802    const wchar_t *wstr;
2803
2804    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2805    if (wstr == NULL)
2806        return -1;
2807
2808    if (w != NULL) {
2809        if (size > res)
2810            size = res + 1;
2811        else
2812            res = size;
2813        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2814        return res;
2815    }
2816    else
2817        return res + 1;
2818}
2819
2820Py_ssize_t
2821PyUnicode_AsWideChar(PyObject *unicode,
2822                     wchar_t *w,
2823                     Py_ssize_t size)
2824{
2825    if (unicode == NULL) {
2826        PyErr_BadInternalCall();
2827        return -1;
2828    }
2829    return unicode_aswidechar(unicode, w, size);
2830}
2831
2832wchar_t*
2833PyUnicode_AsWideCharString(PyObject *unicode,
2834                           Py_ssize_t *size)
2835{
2836    wchar_t* buffer;
2837    Py_ssize_t buflen;
2838
2839    if (unicode == NULL) {
2840        PyErr_BadInternalCall();
2841        return NULL;
2842    }
2843
2844    buflen = unicode_aswidechar(unicode, NULL, 0);
2845    if (buflen == -1)
2846        return NULL;
2847    buffer = PyMem_NEW(wchar_t, buflen);
2848    if (buffer == NULL) {
2849        PyErr_NoMemory();
2850        return NULL;
2851    }
2852    buflen = unicode_aswidechar(unicode, buffer, buflen);
2853    if (buflen == -1) {
2854        PyMem_FREE(buffer);
2855        return NULL;
2856    }
2857    if (size != NULL)
2858        *size = buflen;
2859    return buffer;
2860}
2861
2862#endif /* HAVE_WCHAR_H */
2863
2864PyObject *
2865PyUnicode_FromOrdinal(int ordinal)
2866{
2867    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2868        PyErr_SetString(PyExc_ValueError,
2869                        "chr() arg not in range(0x110000)");
2870        return NULL;
2871    }
2872
2873    return unicode_char((Py_UCS4)ordinal);
2874}
2875
2876PyObject *
2877PyUnicode_FromObject(PyObject *obj)
2878{
2879    /* XXX Perhaps we should make this API an alias of
2880       PyObject_Str() instead ?! */
2881    if (PyUnicode_CheckExact(obj)) {
2882        if (PyUnicode_READY(obj) == -1)
2883            return NULL;
2884        Py_INCREF(obj);
2885        return obj;
2886    }
2887    if (PyUnicode_Check(obj)) {
2888        /* For a Unicode subtype that's not a Unicode object,
2889           return a true Unicode object with the same data. */
2890        return _PyUnicode_Copy(obj);
2891    }
2892    PyErr_Format(PyExc_TypeError,
2893                 "Can't convert '%.100s' object to str implicitly",
2894                 Py_TYPE(obj)->tp_name);
2895    return NULL;
2896}
2897
2898PyObject *
2899PyUnicode_FromEncodedObject(PyObject *obj,
2900                            const char *encoding,
2901                            const char *errors)
2902{
2903    Py_buffer buffer;
2904    PyObject *v;
2905
2906    if (obj == NULL) {
2907        PyErr_BadInternalCall();
2908        return NULL;
2909    }
2910
2911    /* Decoding bytes objects is the most common case and should be fast */
2912    if (PyBytes_Check(obj)) {
2913        if (PyBytes_GET_SIZE(obj) == 0)
2914            _Py_RETURN_UNICODE_EMPTY();
2915        v = PyUnicode_Decode(
2916                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2917                encoding, errors);
2918        return v;
2919    }
2920
2921    if (PyUnicode_Check(obj)) {
2922        PyErr_SetString(PyExc_TypeError,
2923                        "decoding str is not supported");
2924        return NULL;
2925    }
2926
2927    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2928    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2929        PyErr_Format(PyExc_TypeError,
2930                     "coercing to str: need a bytes-like object, %.80s found",
2931                     Py_TYPE(obj)->tp_name);
2932        return NULL;
2933    }
2934
2935    if (buffer.len == 0) {
2936        PyBuffer_Release(&buffer);
2937        _Py_RETURN_UNICODE_EMPTY();
2938    }
2939
2940    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2941    PyBuffer_Release(&buffer);
2942    return v;
2943}
2944
/* Normalize an encoding name into the caller-supplied buffer: upper case
   ASCII letters are converted to lower case and '_' is replaced with '-',
   so that e.g. "UTF_8" becomes "utf-8".  A NULL encoding is treated as
   "utf-8".

   encoding:  name to normalize, or NULL.
   lower:     output buffer, null-terminated on success.
   lower_len: size of lower in bytes, including room for the terminator.

   Return 1 on success, 0 if the normalized name (plus terminator) does not
   fit in lower_len bytes; in that case the contents of lower are
   unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-sized buffer cannot even hold the terminator.  Bail out before
       computing lower_len - 1, which would wrap around and put the end
       sentinel out of bounds. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot; it is reserved for the null terminator. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2984
2985PyObject *
2986PyUnicode_Decode(const char *s,
2987                 Py_ssize_t size,
2988                 const char *encoding,
2989                 const char *errors)
2990{
2991    PyObject *buffer = NULL, *unicode;
2992    Py_buffer info;
2993    char lower[11];  /* Enough for any encoding shortcut */
2994
2995    /* Shortcuts for common default encodings */
2996    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
2997        if ((strcmp(lower, "utf-8") == 0) ||
2998            (strcmp(lower, "utf8") == 0))
2999            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3000        else if ((strcmp(lower, "latin-1") == 0) ||
3001                 (strcmp(lower, "latin1") == 0) ||
3002                 (strcmp(lower, "iso-8859-1") == 0) ||
3003                 (strcmp(lower, "iso8859-1") == 0))
3004            return PyUnicode_DecodeLatin1(s, size, errors);
3005#ifdef HAVE_MBCS
3006        else if (strcmp(lower, "mbcs") == 0)
3007            return PyUnicode_DecodeMBCS(s, size, errors);
3008#endif
3009        else if (strcmp(lower, "ascii") == 0)
3010            return PyUnicode_DecodeASCII(s, size, errors);
3011        else if (strcmp(lower, "utf-16") == 0)
3012            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3013        else if (strcmp(lower, "utf-32") == 0)
3014            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3015    }
3016
3017    /* Decode via the codec registry */
3018    buffer = NULL;
3019    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3020        goto onError;
3021    buffer = PyMemoryView_FromBuffer(&info);
3022    if (buffer == NULL)
3023        goto onError;
3024    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3025    if (unicode == NULL)
3026        goto onError;
3027    if (!PyUnicode_Check(unicode)) {
3028        PyErr_Format(PyExc_TypeError,
3029                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3030                     "use codecs.decode() to decode to arbitrary types",
3031                     encoding,
3032                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3033        Py_DECREF(unicode);
3034        goto onError;
3035    }
3036    Py_DECREF(buffer);
3037    return unicode_result(unicode);
3038
3039  onError:
3040    Py_XDECREF(buffer);
3041    return NULL;
3042}
3043
3044PyObject *
3045PyUnicode_AsDecodedObject(PyObject *unicode,
3046                          const char *encoding,
3047                          const char *errors)
3048{
3049    PyObject *v;
3050
3051    if (!PyUnicode_Check(unicode)) {
3052        PyErr_BadArgument();
3053        goto onError;
3054    }
3055
3056    if (encoding == NULL)
3057        encoding = PyUnicode_GetDefaultEncoding();
3058
3059    /* Decode via the codec registry */
3060    v = PyCodec_Decode(unicode, encoding, errors);
3061    if (v == NULL)
3062        goto onError;
3063    return unicode_result(v);
3064
3065  onError:
3066    return NULL;
3067}
3068
3069PyObject *
3070PyUnicode_AsDecodedUnicode(PyObject *unicode,
3071                           const char *encoding,
3072                           const char *errors)
3073{
3074    PyObject *v;
3075
3076    if (!PyUnicode_Check(unicode)) {
3077        PyErr_BadArgument();
3078        goto onError;
3079    }
3080
3081    if (encoding == NULL)
3082        encoding = PyUnicode_GetDefaultEncoding();
3083
3084    /* Decode via the codec registry */
3085    v = PyCodec_Decode(unicode, encoding, errors);
3086    if (v == NULL)
3087        goto onError;
3088    if (!PyUnicode_Check(v)) {
3089        PyErr_Format(PyExc_TypeError,
3090                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3091                     "use codecs.decode() to decode to arbitrary types",
3092                     encoding,
3093                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3094        Py_DECREF(v);
3095        goto onError;
3096    }
3097    return unicode_result(v);
3098
3099  onError:
3100    return NULL;
3101}
3102
3103PyObject *
3104PyUnicode_Encode(const Py_UNICODE *s,
3105                 Py_ssize_t size,
3106                 const char *encoding,
3107                 const char *errors)
3108{
3109    PyObject *v, *unicode;
3110
3111    unicode = PyUnicode_FromUnicode(s, size);
3112    if (unicode == NULL)
3113        return NULL;
3114    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3115    Py_DECREF(unicode);
3116    return v;
3117}
3118
3119PyObject *
3120PyUnicode_AsEncodedObject(PyObject *unicode,
3121                          const char *encoding,
3122                          const char *errors)
3123{
3124    PyObject *v;
3125
3126    if (!PyUnicode_Check(unicode)) {
3127        PyErr_BadArgument();
3128        goto onError;
3129    }
3130
3131    if (encoding == NULL)
3132        encoding = PyUnicode_GetDefaultEncoding();
3133
3134    /* Encode via the codec registry */
3135    v = PyCodec_Encode(unicode, encoding, errors);
3136    if (v == NULL)
3137        goto onError;
3138    return v;
3139
3140  onError:
3141    return NULL;
3142}
3143
/* Locate the first wide character of wstr that wcstombs() cannot convert in
   the current locale, by converting the string one character at a time.
   Return its index in wstr, or 0 if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* Holds one character (two code units for a surrogate pair when wchar_t
       is 16 bits wide) followed by a terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif
    while (*cursor != L'\0') {
        const wchar_t *current = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor++;
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return current - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3190
3191static int
3192locale_error_handler(const char *errors, int *surrogateescape)
3193{
3194    _Py_error_handler error_handler = get_error_handler(errors);
3195    switch (error_handler)
3196    {
3197    case _Py_ERROR_STRICT:
3198        *surrogateescape = 0;
3199        return 0;
3200    case _Py_ERROR_SURROGATEESCAPE:
3201        *surrogateescape = 1;
3202        return 0;
3203    default:
3204        PyErr_Format(PyExc_ValueError,
3205                     "only 'strict' and 'surrogateescape' error handlers "
3206                     "are supported, not '%s'",
3207                     errors);
3208        return -1;
3209    }
3210}
3211
3212PyObject *
3213PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3214{
3215    Py_ssize_t wlen, wlen2;
3216    wchar_t *wstr;
3217    PyObject *bytes = NULL;
3218    char *errmsg;
3219    PyObject *reason = NULL;
3220    PyObject *exc;
3221    size_t error_pos;
3222    int surrogateescape;
3223
3224    if (locale_error_handler(errors, &surrogateescape) < 0)
3225        return NULL;
3226
3227    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3228    if (wstr == NULL)
3229        return NULL;
3230
3231    wlen2 = wcslen(wstr);
3232    if (wlen2 != wlen) {
3233        PyMem_Free(wstr);
3234        PyErr_SetString(PyExc_ValueError, "embedded null character");
3235        return NULL;
3236    }
3237
3238    if (surrogateescape) {
3239        /* "surrogateescape" error handler */
3240        char *str;
3241
3242        str = Py_EncodeLocale(wstr, &error_pos);
3243        if (str == NULL) {
3244            if (error_pos == (size_t)-1) {
3245                PyErr_NoMemory();
3246                PyMem_Free(wstr);
3247                return NULL;
3248            }
3249            else {
3250                goto encode_error;
3251            }
3252        }
3253        PyMem_Free(wstr);
3254
3255        bytes = PyBytes_FromString(str);
3256        PyMem_Free(str);
3257    }
3258    else {
3259        /* strict mode */
3260        size_t len, len2;
3261
3262        len = wcstombs(NULL, wstr, 0);
3263        if (len == (size_t)-1) {
3264            error_pos = (size_t)-1;
3265            goto encode_error;
3266        }
3267
3268        bytes = PyBytes_FromStringAndSize(NULL, len);
3269        if (bytes == NULL) {
3270            PyMem_Free(wstr);
3271            return NULL;
3272        }
3273
3274        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3275        if (len2 == (size_t)-1 || len2 > len) {
3276            error_pos = (size_t)-1;
3277            goto encode_error;
3278        }
3279        PyMem_Free(wstr);
3280    }
3281    return bytes;
3282
3283encode_error:
3284    errmsg = strerror(errno);
3285    assert(errmsg != NULL);
3286
3287    if (error_pos == (size_t)-1)
3288        error_pos = wcstombs_errorpos(wstr);
3289
3290    PyMem_Free(wstr);
3291    Py_XDECREF(bytes);
3292
3293    if (errmsg != NULL) {
3294        size_t errlen;
3295        wstr = Py_DecodeLocale(errmsg, &errlen);
3296        if (wstr != NULL) {
3297            reason = PyUnicode_FromWideChar(wstr, errlen);
3298            PyMem_RawFree(wstr);
3299        } else
3300            errmsg = NULL;
3301    }
3302    if (errmsg == NULL)
3303        reason = PyUnicode_FromString(
3304            "wcstombs() encountered an unencodable "
3305            "wide character");
3306    if (reason == NULL)
3307        return NULL;
3308
3309    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3310                                "locale", unicode,
3311                                (Py_ssize_t)error_pos,
3312                                (Py_ssize_t)(error_pos+1),
3313                                reason);
3314    Py_DECREF(reason);
3315    if (exc != NULL) {
3316        PyCodec_StrictErrors(exc);
3317        Py_XDECREF(exc);
3318    }
3319    return NULL;
3320}
3321
3322PyObject *
3323PyUnicode_EncodeFSDefault(PyObject *unicode)
3324{
3325#ifdef HAVE_MBCS
3326    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3327#elif defined(__APPLE__)
3328    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3329#else
3330    PyInterpreterState *interp = PyThreadState_GET()->interp;
3331    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3332       cannot use it to encode and decode filenames before it is loaded. Load
3333       the Python codec requires to encode at least its own filename. Use the C
3334       version of the locale codec until the codec registry is initialized and
3335       the Python codec is loaded.
3336
3337       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3338       cannot only rely on it: check also interp->fscodec_initialized for
3339       subinterpreters. */
3340    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3341        return PyUnicode_AsEncodedString(unicode,
3342                                         Py_FileSystemDefaultEncoding,
3343                                         "surrogateescape");
3344    }
3345    else {
3346        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3347    }
3348#endif
3349}
3350
3351PyObject *
3352PyUnicode_AsEncodedString(PyObject *unicode,
3353                          const char *encoding,
3354                          const char *errors)
3355{
3356    PyObject *v;
3357    char lower[11];  /* Enough for any encoding shortcut */
3358
3359    if (!PyUnicode_Check(unicode)) {
3360        PyErr_BadArgument();
3361        return NULL;
3362    }
3363
3364    /* Shortcuts for common default encodings */
3365    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3366        if ((strcmp(lower, "utf-8") == 0) ||
3367            (strcmp(lower, "utf8") == 0))
3368        {
3369            if (errors == NULL || strcmp(errors, "strict") == 0)
3370                return _PyUnicode_AsUTF8String(unicode, NULL);
3371            else
3372                return _PyUnicode_AsUTF8String(unicode, errors);
3373        }
3374        else if ((strcmp(lower, "latin-1") == 0) ||
3375                 (strcmp(lower, "latin1") == 0) ||
3376                 (strcmp(lower, "iso-8859-1") == 0) ||
3377                 (strcmp(lower, "iso8859-1") == 0))
3378            return _PyUnicode_AsLatin1String(unicode, errors);
3379#ifdef HAVE_MBCS
3380        else if (strcmp(lower, "mbcs") == 0)
3381            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3382#endif
3383        else if (strcmp(lower, "ascii") == 0)
3384            return _PyUnicode_AsASCIIString(unicode, errors);
3385    }
3386
3387    /* Encode via the codec registry */
3388    v = _PyCodec_EncodeText(unicode, encoding, errors);
3389    if (v == NULL)
3390        return NULL;
3391
3392    /* The normal path */
3393    if (PyBytes_Check(v))
3394        return v;
3395
3396    /* If the codec returns a buffer, raise a warning and convert to bytes */
3397    if (PyByteArray_Check(v)) {
3398        int error;
3399        PyObject *b;
3400
3401        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3402            "encoder %s returned bytearray instead of bytes; "
3403            "use codecs.encode() to encode to arbitrary types",
3404            encoding);
3405        if (error) {
3406            Py_DECREF(v);
3407            return NULL;
3408        }
3409
3410        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3411        Py_DECREF(v);
3412        return b;
3413    }
3414
3415    PyErr_Format(PyExc_TypeError,
3416                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3417                 "use codecs.encode() to encode to arbitrary types",
3418                 encoding,
3419                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3420    Py_DECREF(v);
3421    return NULL;
3422}
3423
3424PyObject *
3425PyUnicode_AsEncodedUnicode(PyObject *unicode,
3426                           const char *encoding,
3427                           const char *errors)
3428{
3429    PyObject *v;
3430
3431    if (!PyUnicode_Check(unicode)) {
3432        PyErr_BadArgument();
3433        goto onError;
3434    }
3435
3436    if (encoding == NULL)
3437        encoding = PyUnicode_GetDefaultEncoding();
3438
3439    /* Encode via the codec registry */
3440    v = PyCodec_Encode(unicode, encoding, errors);
3441    if (v == NULL)
3442        goto onError;
3443    if (!PyUnicode_Check(v)) {
3444        PyErr_Format(PyExc_TypeError,
3445                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3446                     "use codecs.encode() to encode to arbitrary types",
3447                     encoding,
3448                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3449        Py_DECREF(v);
3450        goto onError;
3451    }
3452    return v;
3453
3454  onError:
3455    return NULL;
3456}
3457
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   reports as invalid or incomplete in the current locale.  Return its byte
   offset, or 0 if none is found.  Always returns 0 when mbrtowc() is not
   available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, p, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return p - str;
        }
        p += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3488
3489PyObject*
3490PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3491                              const char *errors)
3492{
3493    wchar_t smallbuf[256];
3494    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3495    wchar_t *wstr;
3496    size_t wlen, wlen2;
3497    PyObject *unicode;
3498    int surrogateescape;
3499    size_t error_pos;
3500    char *errmsg;
3501    PyObject *reason = NULL;   /* initialize to prevent gcc warning */
3502    PyObject *exc;
3503
3504    if (locale_error_handler(errors, &surrogateescape) < 0)
3505        return NULL;
3506
3507    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3508        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3509        return NULL;
3510    }
3511
3512    if (surrogateescape) {
3513        /* "surrogateescape" error handler */
3514        wstr = Py_DecodeLocale(str, &wlen);
3515        if (wstr == NULL) {
3516            if (wlen == (size_t)-1)
3517                PyErr_NoMemory();
3518            else
3519                PyErr_SetFromErrno(PyExc_OSError);
3520            return NULL;
3521        }
3522
3523        unicode = PyUnicode_FromWideChar(wstr, wlen);
3524        PyMem_RawFree(wstr);
3525    }
3526    else {
3527        /* strict mode */
3528#ifndef HAVE_BROKEN_MBSTOWCS
3529        wlen = mbstowcs(NULL, str, 0);
3530#else
3531        wlen = len;
3532#endif
3533        if (wlen == (size_t)-1)
3534            goto decode_error;
3535        if (wlen+1 <= smallbuf_len) {
3536            wstr = smallbuf;
3537        }
3538        else {
3539            wstr = PyMem_New(wchar_t, wlen+1);
3540            if (!wstr)
3541                return PyErr_NoMemory();
3542        }
3543
3544        wlen2 = mbstowcs(wstr, str, wlen+1);
3545        if (wlen2 == (size_t)-1) {
3546            if (wstr != smallbuf)
3547                PyMem_Free(wstr);
3548            goto decode_error;
3549        }
3550#ifdef HAVE_BROKEN_MBSTOWCS
3551        assert(wlen2 == wlen);
3552#endif
3553        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3554        if (wstr != smallbuf)
3555            PyMem_Free(wstr);
3556    }
3557    return unicode;
3558
3559decode_error:
3560    reason = NULL;
3561    errmsg = strerror(errno);
3562    assert(errmsg != NULL);
3563
3564    error_pos = mbstowcs_errorpos(str, len);
3565    if (errmsg != NULL) {
3566        size_t errlen;
3567        wstr = Py_DecodeLocale(errmsg, &errlen);
3568        if (wstr != NULL) {
3569            reason = PyUnicode_FromWideChar(wstr, errlen);
3570            PyMem_RawFree(wstr);
3571        }
3572    }
3573    if (reason == NULL)
3574        reason = PyUnicode_FromString(
3575            "mbstowcs() encountered an invalid multibyte sequence");
3576    if (reason == NULL)
3577        return NULL;
3578
3579    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3580                                "locale", str, len,
3581                                (Py_ssize_t)error_pos,
3582                                (Py_ssize_t)(error_pos+1),
3583                                reason);
3584    Py_DECREF(reason);
3585    if (exc != NULL) {
3586        PyCodec_StrictErrors(exc);
3587        Py_XDECREF(exc);
3588    }
3589    return NULL;
3590}
3591
3592PyObject*
3593PyUnicode_DecodeLocale(const char *str, const char *errors)
3594{
3595    Py_ssize_t size = (Py_ssize_t)strlen(str);
3596    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3597}
3598
3599
3600PyObject*
3601PyUnicode_DecodeFSDefault(const char *s) {
3602    Py_ssize_t size = (Py_ssize_t)strlen(s);
3603    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3604}
3605
3606PyObject*
3607PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3608{
3609#ifdef HAVE_MBCS
3610    return PyUnicode_DecodeMBCS(s, size, NULL);
3611#elif defined(__APPLE__)
3612    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3613#else
3614    PyInterpreterState *interp = PyThreadState_GET()->interp;
3615    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3616       cannot use it to encode and decode filenames before it is loaded. Load
3617       the Python codec requires to encode at least its own filename. Use the C
3618       version of the locale codec until the codec registry is initialized and
3619       the Python codec is loaded.
3620
3621       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3622       cannot only rely on it: check also interp->fscodec_initialized for
3623       subinterpreters. */
3624    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3625        return PyUnicode_Decode(s, size,
3626                                Py_FileSystemDefaultEncoding,
3627                                "surrogateescape");
3628    }
3629    else {
3630        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3631    }
3632#endif
3633}
3634
3635
3636int
3637PyUnicode_FSConverter(PyObject* arg, void* addr)
3638{
3639    PyObject *output = NULL;
3640    Py_ssize_t size;
3641    void *data;
3642    if (arg == NULL) {
3643        Py_DECREF(*(PyObject**)addr);
3644        return 1;
3645    }
3646    if (PyBytes_Check(arg)) {
3647        output = arg;
3648        Py_INCREF(output);
3649    }
3650    else {
3651        arg = PyUnicode_FromObject(arg);
3652        if (!arg)
3653            return 0;
3654        output = PyUnicode_EncodeFSDefault(arg);
3655        Py_DECREF(arg);
3656        if (!output)
3657            return 0;
3658        if (!PyBytes_Check(output)) {
3659            Py_DECREF(output);
3660            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3661            return 0;
3662        }
3663    }
3664    size = PyBytes_GET_SIZE(output);
3665    data = PyBytes_AS_STRING(output);
3666    if ((size_t)size != strlen(data)) {
3667        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3668        Py_DECREF(output);
3669        return 0;
3670    }
3671    *(PyObject**)addr = output;
3672    return Py_CLEANUP_SUPPORTED;
3673}
3674
3675
3676int
3677PyUnicode_FSDecoder(PyObject* arg, void* addr)
3678{
3679    PyObject *output = NULL;
3680    if (arg == NULL) {
3681        Py_DECREF(*(PyObject**)addr);
3682        return 1;
3683    }
3684    if (PyUnicode_Check(arg)) {
3685        if (PyUnicode_READY(arg) == -1)
3686            return 0;
3687        output = arg;
3688        Py_INCREF(output);
3689    }
3690    else {
3691        arg = PyBytes_FromObject(arg);
3692        if (!arg)
3693            return 0;
3694        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3695                                                  PyBytes_GET_SIZE(arg));
3696        Py_DECREF(arg);
3697        if (!output)
3698            return 0;
3699        if (!PyUnicode_Check(output)) {
3700            Py_DECREF(output);
3701            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3702            return 0;
3703        }
3704    }
3705    if (PyUnicode_READY(output) == -1) {
3706        Py_DECREF(output);
3707        return 0;
3708    }
3709    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3710                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3711        PyErr_SetString(PyExc_ValueError, "embedded null character");
3712        Py_DECREF(output);
3713        return 0;
3714    }
3715    *(PyObject**)addr = output;
3716    return Py_CLEANUP_SUPPORTED;
3717}
3718
3719
3720char*
3721PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3722{
3723    PyObject *bytes;
3724
3725    if (!PyUnicode_Check(unicode)) {
3726        PyErr_BadArgument();
3727        return NULL;
3728    }
3729    if (PyUnicode_READY(unicode) == -1)
3730        return NULL;
3731
3732    if (PyUnicode_UTF8(unicode) == NULL) {
3733        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3734        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3735        if (bytes == NULL)
3736            return NULL;
3737        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3738        if (_PyUnicode_UTF8(unicode) == NULL) {
3739            PyErr_NoMemory();
3740            Py_DECREF(bytes);
3741            return NULL;
3742        }
3743        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3744        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3745                  PyBytes_AS_STRING(bytes),
3746                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3747        Py_DECREF(bytes);
3748    }
3749
3750    if (psize)
3751        *psize = PyUnicode_UTF8_LENGTH(unicode);
3752    return PyUnicode_UTF8(unicode);
3753}
3754
3755char*
3756PyUnicode_AsUTF8(PyObject *unicode)
3757{
3758    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3759}
3760
/* Return the wchar_t (Py_UNICODE) representation stored in the str object,
   building and storing it first when _PyUnicode_WSTR(unicode) is NULL.

   unicode: must pass PyUnicode_Check(), otherwise a bad-argument error is
            raised and NULL is returned.
   size:    when not NULL, receives PyUnicode_WSTR_LENGTH(unicode).

   The pointer returned is the one stored in the object, not a fresh copy.
   Returns NULL with MemoryError set when the buffer cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    /* Only build the wchar_t buffer when it is not already present. */
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: every code point above 0xFFFF takes two
               wchar_t units (a surrogate pair), so count those first to
               size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* NOTE(review): unlike the 1-/2-byte branch below, there is no
               overflow check on this allocation size -- confirm whether
               one is needed. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass: write the characters, splitting non-BMP code
               points into high/low surrogates. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Debug-only sanity check on the write position. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            /* null-terminate the wstr */
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte kind: one wchar_t per character.  Refuse
               lengths whose buffer size in bytes (terminator included)
               would exceed PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* NOTE(review): presumably compact ASCII objects have no
               separate wstr_length field to store into -- confirm against
               the object layout. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to a 32-bit wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                /* Release the buffer allocated above before reporting the
                   fatal error. */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3877
3878Py_UNICODE *
3879PyUnicode_AsUnicode(PyObject *unicode)
3880{
3881    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3882}
3883
3884
3885Py_ssize_t
3886PyUnicode_GetSize(PyObject *unicode)
3887{
3888    if (!PyUnicode_Check(unicode)) {
3889        PyErr_BadArgument();
3890        goto onError;
3891    }
3892    return PyUnicode_GET_SIZE(unicode);
3893
3894  onError:
3895    return -1;
3896}
3897
3898Py_ssize_t
3899PyUnicode_GetLength(PyObject *unicode)
3900{
3901    if (!PyUnicode_Check(unicode)) {
3902        PyErr_BadArgument();
3903        return -1;
3904    }
3905    if (PyUnicode_READY(unicode) == -1)
3906        return -1;
3907    return PyUnicode_GET_LENGTH(unicode);
3908}
3909
3910Py_UCS4
3911PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3912{
3913    void *data;
3914    int kind;
3915
3916    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3917        PyErr_BadArgument();
3918        return (Py_UCS4)-1;
3919    }
3920    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3921        PyErr_SetString(PyExc_IndexError, "string index out of range");
3922        return (Py_UCS4)-1;
3923    }
3924    data = PyUnicode_DATA(unicode);
3925    kind = PyUnicode_KIND(unicode);
3926    return PyUnicode_READ(kind, data, index);
3927}
3928
3929int
3930PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3931{
3932    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3933        PyErr_BadArgument();
3934        return -1;
3935    }
3936    assert(PyUnicode_IS_READY(unicode));
3937    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3938        PyErr_SetString(PyExc_IndexError, "string index out of range");
3939        return -1;
3940    }
3941    if (unicode_check_modifiable(unicode))
3942        return -1;
3943    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3944        PyErr_SetString(PyExc_ValueError, "character out of range");
3945        return -1;
3946    }
3947    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3948                    index, ch);
3949    return 0;
3950}
3951
/* Return the name of the default encoding, which is always the constant
   string "utf-8".  The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3957
3958/* create or adjust a UnicodeDecodeError */
3959static void
3960make_decode_exception(PyObject **exceptionObject,
3961                      const char *encoding,
3962                      const char *input, Py_ssize_t length,
3963                      Py_ssize_t startpos, Py_ssize_t endpos,
3964                      const char *reason)
3965{
3966    if (*exceptionObject == NULL) {
3967        *exceptionObject = PyUnicodeDecodeError_Create(
3968            encoding, input, length, startpos, endpos, reason);
3969    }
3970    else {
3971        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3972            goto onError;
3973        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3974            goto onError;
3975        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3976            goto onError;
3977    }
3978    return;
3979
3980onError:
3981    Py_CLEAR(*exceptionObject);
3982}
3983
#ifdef HAVE_MBCS
/* error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors, errorHandler: name of the error handler and a cache for the
       looked-up callable (*errorHandler is filled in on first use).
   encoding, reason: used to build/update the UnicodeDecodeError.
   input, inend: start and end of the input buffer; both are re-read from
       the exception's object after the callback, since the callback may
       have replaced it.
   startinpos, endinpos: offsets of the undecodable range; *endinpos is set
       to the position returned by the callback.
   exceptionObject: cached exception, created or updated here.
   inptr: set to the input position where decoding should resume.
   output, outpos: output string (PyUnicode_WCHAR_KIND) and the write
       position in it; the string is resized when the replacement plus the
       remaining input would not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* The text after "O!n;" doubles as the TypeError message below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        /* Bail out: the PyBytes_* accessors below are only valid on bytes
           objects, and an exception is now pending. */
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position counts from the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically to limit the number of resizes. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    /* Copy exactly repwlen characters.  memcpy() is used rather than
       wcsncpy() because the latter stops at the first null character and
       zero-fills the remainder, which would corrupt a replacement string
       that contains an embedded null character. */
    memcpy(_PyUnicode_WSTR(*output) + *outpos, repwstr,
           (size_t)repwlen * sizeof(wchar_t));
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4099
4100static int
4101unicode_decode_call_errorhandler_writer(
4102    const char *errors, PyObject **errorHandler,
4103    const char *encoding, const char *reason,
4104    const char **input, const char **inend, Py_ssize_t *startinpos,
4105    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4106    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4107{
4108    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4109
4110    PyObject *restuple = NULL;
4111    PyObject *repunicode = NULL;
4112    Py_ssize_t insize;
4113    Py_ssize_t newpos;
4114    Py_ssize_t replen;
4115    PyObject *inputobj = NULL;
4116
4117    if (*errorHandler == NULL) {
4118        *errorHandler = PyCodec_LookupError(errors);
4119        if (*errorHandler == NULL)
4120            goto onError;
4121    }
4122
4123    make_decode_exception(exceptionObject,
4124        encoding,
4125        *input, *inend - *input,
4126        *startinpos, *endinpos,
4127        reason);
4128    if (*exceptionObject == NULL)
4129        goto onError;
4130
4131    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4132    if (restuple == NULL)
4133        goto onError;
4134    if (!PyTuple_Check(restuple)) {
4135        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4136        goto onError;
4137    }
4138    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4139        goto onError;
4140
4141    /* Copy back the bytes variables, which might have been modified by the
4142       callback */
4143    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4144    if (!inputobj)
4145        goto onError;
4146    if (!PyBytes_Check(inputobj)) {
4147        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4148    }
4149    *input = PyBytes_AS_STRING(inputobj);
4150    insize = PyBytes_GET_SIZE(inputobj);
4151    *inend = *input + insize;
4152    /* we can DECREF safely, as the exception has another reference,
4153       so the object won't go away. */
4154    Py_DECREF(inputobj);
4155
4156    if (newpos<0)
4157        newpos = insize+newpos;
4158    if (newpos<0 || newpos>insize) {
4159        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4160        goto onError;
4161    }
4162
4163    if (PyUnicode_READY(repunicode) < 0)
4164        goto onError;
4165    replen = PyUnicode_GET_LENGTH(repunicode);
4166    if (replen > 1) {
4167        writer->min_length += replen - 1;
4168        writer->overallocate = 1;
4169        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4170                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4171            goto onError;
4172    }
4173    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4174        goto onError;
4175
4176    *endinpos = newpos;
4177    *inptr = *input + newpos;
4178
4179    /* we made it! */
4180    Py_XDECREF(restuple);
4181    return 0;
4182
4183  onError:
4184    Py_XDECREF(restuple);
4185    return -1;
4186}
4187
4188/* --- UTF-7 Codec -------------------------------------------------------- */
4189
4190/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4191
/* Base-64 helper macros for the UTF-7 codec.  IS_BASE64, FROM_BASE64 and
 * DECODE_DIRECT evaluate their argument more than once, so only pass
 * expressions without side effects. */

/* IS_BASE64: true if c belongs to the base-64 alphabet
 * (letters, digits, '+' and '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ((c) >= '0' && (c) <= '9') ||          \
     ((c) >= 'a' && (c) <= 'z') ||          \
     ((c) >= 'A' && (c) <= 'Z'))

/* FROM_BASE64: the 6-bit value of base-64 character c.  The result is only
 * meaningful when IS_BASE64(c) holds; every other input yields 63. */

#define FROM_BASE64(c)                                  \
    ((c) == '+' ? 62 :                                  \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     63)

/* TO_BASE64: the base-64 character for the bottom 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  Decoding is permissive: every ASCII byte decodes
 * to itself except '+', which begins a base64 string. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4222
4223/* The UTF-7 encoder treats ASCII characters differently according to
4224 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4225 * the above).  See RFC2152.  This array identifies these different
4226 * sets:
4227 * 0 : "Set D"
4228 *     alphanumeric and '(),-./:?
4229 * 1 : "Set O"
4230 *     !"#$%&*;<=>@[]^_`{|}
4231 * 2 : "whitespace"
4232 *     ht nl cr sp
4233 * 3 : special (must be base64 encoded)
4234 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4235 */
4236
/* Category (0 = Set D, 1 = Set O, 2 = whitespace, 3 = special) of each of
 * the 128 ASCII codes, indexed by character code.  The table is only ever
 * read, so it is declared const. */
static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is evaluated several times and must be free of side effects.  The
 * range test on c comes first so that the table is never indexed out of
 * bounds; code 0 is never encoded directly.  directO and directWS are
 * parenthesized so that any expression can be passed for them. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4268
4269PyObject *
4270PyUnicode_DecodeUTF7(const char *s,
4271                     Py_ssize_t size,
4272                     const char *errors)
4273{
4274    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4275}
4276
4277/* The decoder.  The only state we preserve is our read position,
4278 * i.e. how many characters we have consumed.  So if we end in the
4279 * middle of a shift sequence we have to back off the read position
4280 * and the output to the beginning of the sequence, otherwise we lose
4281 * all the shift state (seen bits, number of bits seen, high
4282 * surrogate). */
4283
4284PyObject *
4285PyUnicode_DecodeUTF7Stateful(const char *s,
4286                             Py_ssize_t size,
4287                             const char *errors,
4288                             Py_ssize_t *consumed)
4289{
4290    const char *starts = s;
4291    Py_ssize_t startinpos;
4292    Py_ssize_t endinpos;
4293    const char *e;
4294    _PyUnicodeWriter writer;
4295    const char *errmsg = "";
4296    int inShift = 0;
4297    Py_ssize_t shiftOutStart;
4298    unsigned int base64bits = 0;
4299    unsigned long base64buffer = 0;
4300    Py_UCS4 surrogate = 0;
4301    PyObject *errorHandler = NULL;
4302    PyObject *exc = NULL;
4303
4304    if (size == 0) {
4305        if (consumed)
4306            *consumed = 0;
4307        _Py_RETURN_UNICODE_EMPTY();
4308    }
4309
4310    /* Start off assuming it's all ASCII. Widen later as necessary. */
4311    _PyUnicodeWriter_Init(&writer);
4312    writer.min_length = size;
4313
4314    shiftOutStart = 0;
4315    e = s + size;
4316
4317    while (s < e) {
4318        Py_UCS4 ch;
4319      restart:
4320        ch = (unsigned char) *s;
4321
4322        if (inShift) { /* in a base-64 section */
4323            if (IS_BASE64(ch)) { /* consume a base-64 character */
4324                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4325                base64bits += 6;
4326                s++;
4327                if (base64bits >= 16) {
4328                    /* we have enough bits for a UTF-16 value */
4329                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4330                    base64bits -= 16;
4331                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4332                    assert(outCh <= 0xffff);
4333                    if (surrogate) {
4334                        /* expecting a second surrogate */
4335                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4336                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4337                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4338                                goto onError;
4339                            surrogate = 0;
4340                            continue;
4341                        }
4342                        else {
4343                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4344                                goto onError;
4345                            surrogate = 0;
4346                        }
4347                    }
4348                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4349                        /* first surrogate */
4350                        surrogate = outCh;
4351                    }
4352                    else {
4353                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4354                            goto onError;
4355                    }
4356                }
4357            }
4358            else { /* now leaving a base-64 section */
4359                inShift = 0;
4360                s++;
4361                if (surrogate) {
4362                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4363                        goto onError;
4364                    surrogate = 0;
4365                }
4366                if (base64bits > 0) { /* left-over bits */
4367                    if (base64bits >= 6) {
4368                        /* We've seen at least one base-64 character */
4369                        errmsg = "partial character in shift sequence";
4370                        goto utf7Error;
4371                    }
4372                    else {
4373                        /* Some bits remain; they should be zero */
4374                        if (base64buffer != 0) {
4375                            errmsg = "non-zero padding bits in shift sequence";
4376                            goto utf7Error;
4377                        }
4378                    }
4379                }
4380                if (ch != '-') {
4381                    /* '-' is absorbed; other terminating
4382                       characters are preserved */
4383                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4384                        goto onError;
4385                }
4386            }
4387        }
4388        else if ( ch == '+' ) {
4389            startinpos = s-starts;
4390            s++; /* consume '+' */
4391            if (s < e && *s == '-') { /* '+-' encodes '+' */
4392                s++;
4393                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4394                    goto onError;
4395            }
4396            else { /* begin base64-encoded section */
4397                inShift = 1;
4398                shiftOutStart = writer.pos;
4399                base64bits = 0;
4400                base64buffer = 0;
4401            }
4402        }
4403        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4404            s++;
4405            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4406                goto onError;
4407        }
4408        else {
4409            startinpos = s-starts;
4410            s++;
4411            errmsg = "unexpected special character";
4412            goto utf7Error;
4413        }
4414        continue;
4415utf7Error:
4416        endinpos = s-starts;
4417        if (unicode_decode_call_errorhandler_writer(
4418                errors, &errorHandler,
4419                "utf7", errmsg,
4420                &starts, &e, &startinpos, &endinpos, &exc, &s,
4421                &writer))
4422            goto onError;
4423    }
4424
4425    /* end of string */
4426
4427    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4428        /* if we're in an inconsistent state, that's an error */
4429        if (surrogate ||
4430                (base64bits >= 6) ||
4431                (base64bits > 0 && base64buffer != 0)) {
4432            endinpos = size;
4433            if (unicode_decode_call_errorhandler_writer(
4434                    errors, &errorHandler,
4435                    "utf7", "unterminated shift sequence",
4436                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4437                    &writer))
4438                goto onError;
4439            if (s < e)
4440                goto restart;
4441        }
4442    }
4443
4444    /* return state */
4445    if (consumed) {
4446        if (inShift) {
4447            *consumed = startinpos;
4448            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4449                PyObject *result = PyUnicode_FromKindAndData(
4450                        writer.kind, writer.data, shiftOutStart);
4451                Py_XDECREF(errorHandler);
4452                Py_XDECREF(exc);
4453                _PyUnicodeWriter_Dealloc(&writer);
4454                return result;
4455            }
4456            writer.pos = shiftOutStart; /* back off output */
4457        }
4458        else {
4459            *consumed = s-starts;
4460        }
4461    }
4462
4463    Py_XDECREF(errorHandler);
4464    Py_XDECREF(exc);
4465    return _PyUnicodeWriter_Finish(&writer);
4466
4467  onError:
4468    Py_XDECREF(errorHandler);
4469    Py_XDECREF(exc);
4470    _PyUnicodeWriter_Dealloc(&writer);
4471    return NULL;
4472}
4473
4474
4475PyObject *
4476_PyUnicode_EncodeUTF7(PyObject *str,
4477                      int base64SetO,
4478                      int base64WhiteSpace,
4479                      const char *errors)
4480{
4481    int kind;
4482    void *data;
4483    Py_ssize_t len;
4484    PyObject *v;
4485    int inShift = 0;
4486    Py_ssize_t i;
4487    unsigned int base64bits = 0;
4488    unsigned long base64buffer = 0;
4489    char * out;
4490    char * start;
4491
4492    if (PyUnicode_READY(str) == -1)
4493        return NULL;
4494    kind = PyUnicode_KIND(str);
4495    data = PyUnicode_DATA(str);
4496    len = PyUnicode_GET_LENGTH(str);
4497
4498    if (len == 0)
4499        return PyBytes_FromStringAndSize(NULL, 0);
4500
4501    /* It might be possible to tighten this worst case */
4502    if (len > PY_SSIZE_T_MAX / 8)
4503        return PyErr_NoMemory();
4504    v = PyBytes_FromStringAndSize(NULL, len * 8);
4505    if (v == NULL)
4506        return NULL;
4507
4508    start = out = PyBytes_AS_STRING(v);
4509    for (i = 0; i < len; ++i) {
4510        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4511
4512        if (inShift) {
4513            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4514                /* shifting out */
4515                if (base64bits) { /* output remaining bits */
4516                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4517                    base64buffer = 0;
4518                    base64bits = 0;
4519                }
4520                inShift = 0;
4521                /* Characters not in the BASE64 set implicitly unshift the sequence
4522                   so no '-' is required, except if the character is itself a '-' */
4523                if (IS_BASE64(ch) || ch == '-') {
4524                    *out++ = '-';
4525                }
4526                *out++ = (char) ch;
4527            }
4528            else {
4529                goto encode_char;
4530            }
4531        }
4532        else { /* not in a shift sequence */
4533            if (ch == '+') {
4534                *out++ = '+';
4535                        *out++ = '-';
4536            }
4537            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4538                *out++ = (char) ch;
4539            }
4540            else {
4541                *out++ = '+';
4542                inShift = 1;
4543                goto encode_char;
4544            }
4545        }
4546        continue;
4547encode_char:
4548        if (ch >= 0x10000) {
4549            assert(ch <= MAX_UNICODE);
4550
4551            /* code first surrogate */
4552            base64bits += 16;
4553            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4554            while (base64bits >= 6) {
4555                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4556                base64bits -= 6;
4557            }
4558            /* prepare second surrogate */
4559            ch = Py_UNICODE_LOW_SURROGATE(ch);
4560        }
4561        base64bits += 16;
4562        base64buffer = (base64buffer << 16) | ch;
4563        while (base64bits >= 6) {
4564            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4565            base64bits -= 6;
4566        }
4567    }
4568    if (base64bits)
4569        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4570    if (inShift)
4571        *out++ = '-';
4572    if (_PyBytes_Resize(&v, out - start) < 0)
4573        return NULL;
4574    return v;
4575}
4576PyObject *
4577PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4578                     Py_ssize_t size,
4579                     int base64SetO,
4580                     int base64WhiteSpace,
4581                     const char *errors)
4582{
4583    PyObject *result;
4584    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4585    if (tmp == NULL)
4586        return NULL;
4587    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4588                                   base64WhiteSpace, errors);
4589    Py_DECREF(tmp);
4590    return result;
4591}
4592
/* The UTF-7 helper macros are not needed past this point; remove their
   definitions so the names do not stay defined for the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
4598
4599/* --- UTF-8 Codec -------------------------------------------------------- */
4600
4601PyObject *
4602PyUnicode_DecodeUTF8(const char *s,
4603                     Py_ssize_t size,
4604                     const char *errors)
4605{
4606    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4607}
4608
4609#include "stringlib/asciilib.h"
4610#include "stringlib/codecs.h"
4611#include "stringlib/undef.h"
4612
4613#include "stringlib/ucs1lib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
4617#include "stringlib/ucs2lib.h"
4618#include "stringlib/codecs.h"
4619#include "stringlib/undef.h"
4620
4621#include "stringlib/ucs4lib.h"
4622#include "stringlib/codecs.h"
4623#include "stringlib/undef.h"
4624
4625/* Mask to quickly check whether a C 'long' contains a
4626   non-ASCII, UTF8-encoded char. */
4627#if (SIZEOF_LONG == 8)
4628# define ASCII_CHAR_MASK 0x8080808080808080UL
4629#elif (SIZEOF_LONG == 4)
4630# define ASCII_CHAR_MASK 0x80808080UL
4631#else
4632# error C 'long' size should be either 4 or 8!
4633#endif
4634
4635static Py_ssize_t
4636ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4637{
4638    const char *p = start;
4639    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4640
4641    /*
4642     * Issue #17237: m68k is a bit different from most architectures in
4643     * that objects do not use "natural alignment" - for example, int and
4644     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4645     * won't work; also, tests have shown that skipping the "optimised
4646     * version" will even speed up m68k.
4647     */
4648#if !defined(__m68k__)
4649#if SIZEOF_LONG <= SIZEOF_VOID_P
4650    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4651    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4652        /* Fast path, see in STRINGLIB(utf8_decode) for
4653           an explanation. */
4654        /* Help allocation */
4655        const char *_p = p;
4656        Py_UCS1 * q = dest;
4657        while (_p < aligned_end) {
4658            unsigned long value = *(const unsigned long *) _p;
4659            if (value & ASCII_CHAR_MASK)
4660                break;
4661            *((unsigned long *)q) = value;
4662            _p += SIZEOF_LONG;
4663            q += SIZEOF_LONG;
4664        }
4665        p = _p;
4666        while (p < end) {
4667            if ((unsigned char)*p & 0x80)
4668                break;
4669            *q++ = *p++;
4670        }
4671        return p - start;
4672    }
4673#endif
4674#endif
4675    while (p < end) {
4676        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4677           for an explanation. */
4678        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4679            /* Help allocation */
4680            const char *_p = p;
4681            while (_p < aligned_end) {
4682                unsigned long value = *(unsigned long *) _p;
4683                if (value & ASCII_CHAR_MASK)
4684                    break;
4685                _p += SIZEOF_LONG;
4686            }
4687            p = _p;
4688            if (_p == end)
4689                break;
4690        }
4691        if ((unsigned char)*p & 0x80)
4692            break;
4693        ++p;
4694    }
4695    memcpy(dest, start, p - start);
4696    return p - start;
4697}
4698
4699PyObject *
4700PyUnicode_DecodeUTF8Stateful(const char *s,
4701                             Py_ssize_t size,
4702                             const char *errors,
4703                             Py_ssize_t *consumed)
4704{
4705    _PyUnicodeWriter writer;
4706    const char *starts = s;
4707    const char *end = s + size;
4708
4709    Py_ssize_t startinpos;
4710    Py_ssize_t endinpos;
4711    const char *errmsg = "";
4712    PyObject *errorHandler = NULL;
4713    PyObject *exc = NULL;
4714
4715    if (size == 0) {
4716        if (consumed)
4717            *consumed = 0;
4718        _Py_RETURN_UNICODE_EMPTY();
4719    }
4720
4721    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4722    if (size == 1 && (unsigned char)s[0] < 128) {
4723        if (consumed)
4724            *consumed = 1;
4725        return get_latin1_char((unsigned char)s[0]);
4726    }
4727
4728    _PyUnicodeWriter_Init(&writer);
4729    writer.min_length = size;
4730    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4731        goto onError;
4732
4733    writer.pos = ascii_decode(s, end, writer.data);
4734    s += writer.pos;
4735    while (s < end) {
4736        Py_UCS4 ch;
4737        int kind = writer.kind;
4738        if (kind == PyUnicode_1BYTE_KIND) {
4739            if (PyUnicode_IS_ASCII(writer.buffer))
4740                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4741            else
4742                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4743        } else if (kind == PyUnicode_2BYTE_KIND) {
4744            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4745        } else {
4746            assert(kind == PyUnicode_4BYTE_KIND);
4747            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4748        }
4749
4750        switch (ch) {
4751        case 0:
4752            if (s == end || consumed)
4753                goto End;
4754            errmsg = "unexpected end of data";
4755            startinpos = s - starts;
4756            endinpos = end - starts;
4757            break;
4758        case 1:
4759            errmsg = "invalid start byte";
4760            startinpos = s - starts;
4761            endinpos = startinpos + 1;
4762            break;
4763        case 2:
4764        case 3:
4765        case 4:
4766            errmsg = "invalid continuation byte";
4767            startinpos = s - starts;
4768            endinpos = startinpos + ch - 1;
4769            break;
4770        default:
4771            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4772                goto onError;
4773            continue;
4774        }
4775
4776        if (unicode_decode_call_errorhandler_writer(
4777                errors, &errorHandler,
4778                "utf-8", errmsg,
4779                &starts, &end, &startinpos, &endinpos, &exc, &s,
4780                &writer))
4781            goto onError;
4782    }
4783
4784End:
4785    if (consumed)
4786        *consumed = s - starts;
4787
4788    Py_XDECREF(errorHandler);
4789    Py_XDECREF(exc);
4790    return _PyUnicodeWriter_Finish(&writer);
4791
4792onError:
4793    Py_XDECREF(errorHandler);
4794    Py_XDECREF(exc);
4795    _PyUnicodeWriter_Dealloc(&writer);
4796    return NULL;
4797}
4798
4799#ifdef __APPLE__
4800
4801/* Simplified UTF-8 decoder using surrogateescape error handler,
4802   used to decode the command line arguments on Mac OS X.
4803
4804   Return a pointer to a newly allocated wide character string (use
4805   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4806
4807wchar_t*
4808_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4809{
4810    const char *e;
4811    wchar_t *unicode;
4812    Py_ssize_t outpos;
4813
4814    /* Note: size will always be longer than the resulting Unicode
4815       character count */
4816    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
4817        return NULL;
4818    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4819    if (!unicode)
4820        return NULL;
4821
4822    /* Unpack UTF-8 encoded data */
4823    e = s + size;
4824    outpos = 0;
4825    while (s < e) {
4826        Py_UCS4 ch;
4827#if SIZEOF_WCHAR_T == 4
4828        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4829#else
4830        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4831#endif
4832        if (ch > 0xFF) {
4833#if SIZEOF_WCHAR_T == 4
4834            assert(0);
4835#else
4836            assert(Py_UNICODE_IS_SURROGATE(ch));
4837            /*  compute and append the two surrogates: */
4838            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4839            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4840#endif
4841        }
4842        else {
4843            if (!ch && s == e)
4844                break;
4845            /* surrogateescape */
4846            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4847        }
4848    }
4849    unicode[outpos] = L'\0';
4850    return unicode;
4851}
4852
4853#endif /* __APPLE__ */
4854
4855/* Primary internal function which creates utf8 encoded bytes objects.
4856
4857   Allocation strategy:  if the string is short, convert into a stack buffer
4858   and allocate exactly as much space needed at the end.  Else allocate the
4859   maximum possible needed (4 result bytes per Unicode character), and return
4860   the excess memory at the end.
4861*/
4862PyObject *
4863_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4864{
4865    enum PyUnicode_Kind kind;
4866    void *data;
4867    Py_ssize_t size;
4868
4869    if (!PyUnicode_Check(unicode)) {
4870        PyErr_BadArgument();
4871        return NULL;
4872    }
4873
4874    if (PyUnicode_READY(unicode) == -1)
4875        return NULL;
4876
4877    if (PyUnicode_UTF8(unicode))
4878        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4879                                         PyUnicode_UTF8_LENGTH(unicode));
4880
4881    kind = PyUnicode_KIND(unicode);
4882    data = PyUnicode_DATA(unicode);
4883    size = PyUnicode_GET_LENGTH(unicode);
4884
4885    switch (kind) {
4886    default:
4887        assert(0);
4888    case PyUnicode_1BYTE_KIND:
4889        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4890        assert(!PyUnicode_IS_ASCII(unicode));
4891        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4892    case PyUnicode_2BYTE_KIND:
4893        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4894    case PyUnicode_4BYTE_KIND:
4895        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4896    }
4897}
4898
4899PyObject *
4900PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4901                     Py_ssize_t size,
4902                     const char *errors)
4903{
4904    PyObject *v, *unicode;
4905
4906    unicode = PyUnicode_FromUnicode(s, size);
4907    if (unicode == NULL)
4908        return NULL;
4909    v = _PyUnicode_AsUTF8String(unicode, errors);
4910    Py_DECREF(unicode);
4911    return v;
4912}
4913
4914PyObject *
4915PyUnicode_AsUTF8String(PyObject *unicode)
4916{
4917    return _PyUnicode_AsUTF8String(unicode, NULL);
4918}
4919
4920/* --- UTF-32 Codec ------------------------------------------------------- */
4921
4922PyObject *
4923PyUnicode_DecodeUTF32(const char *s,
4924                      Py_ssize_t size,
4925                      const char *errors,
4926                      int *byteorder)
4927{
4928    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4929}
4930
4931PyObject *
4932PyUnicode_DecodeUTF32Stateful(const char *s,
4933                              Py_ssize_t size,
4934                              const char *errors,
4935                              int *byteorder,
4936                              Py_ssize_t *consumed)
4937{
4938    const char *starts = s;
4939    Py_ssize_t startinpos;
4940    Py_ssize_t endinpos;
4941    _PyUnicodeWriter writer;
4942    const unsigned char *q, *e;
4943    int le, bo = 0;       /* assume native ordering by default */
4944    const char *encoding;
4945    const char *errmsg = "";
4946    PyObject *errorHandler = NULL;
4947    PyObject *exc = NULL;
4948
4949    q = (unsigned char *)s;
4950    e = q + size;
4951
4952    if (byteorder)
4953        bo = *byteorder;
4954
4955    /* Check for BOM marks (U+FEFF) in the input and adjust current
4956       byte order setting accordingly. In native mode, the leading BOM
4957       mark is skipped, in all other modes, it is copied to the output
4958       stream as-is (giving a ZWNBSP character). */
4959    if (bo == 0 && size >= 4) {
4960        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4961        if (bom == 0x0000FEFF) {
4962            bo = -1;
4963            q += 4;
4964        }
4965        else if (bom == 0xFFFE0000) {
4966            bo = 1;
4967            q += 4;
4968        }
4969        if (byteorder)
4970            *byteorder = bo;
4971    }
4972
4973    if (q == e) {
4974        if (consumed)
4975            *consumed = size;
4976        _Py_RETURN_UNICODE_EMPTY();
4977    }
4978
4979#ifdef WORDS_BIGENDIAN
4980    le = bo < 0;
4981#else
4982    le = bo <= 0;
4983#endif
4984    encoding = le ? "utf-32-le" : "utf-32-be";
4985
4986    _PyUnicodeWriter_Init(&writer);
4987    writer.min_length = (e - q + 3) / 4;
4988    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4989        goto onError;
4990
4991    while (1) {
4992        Py_UCS4 ch = 0;
4993        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
4994
4995        if (e - q >= 4) {
4996            enum PyUnicode_Kind kind = writer.kind;
4997            void *data = writer.data;
4998            const unsigned char *last = e - 4;
4999            Py_ssize_t pos = writer.pos;
5000            if (le) {
5001                do {
5002                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5003                    if (ch > maxch)
5004                        break;
5005                    if (kind != PyUnicode_1BYTE_KIND &&
5006                        Py_UNICODE_IS_SURROGATE(ch))
5007                        break;
5008                    PyUnicode_WRITE(kind, data, pos++, ch);
5009                    q += 4;
5010                } while (q <= last);
5011            }
5012            else {
5013                do {
5014                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5015                    if (ch > maxch)
5016                        break;
5017                    if (kind != PyUnicode_1BYTE_KIND &&
5018                        Py_UNICODE_IS_SURROGATE(ch))
5019                        break;
5020                    PyUnicode_WRITE(kind, data, pos++, ch);
5021                    q += 4;
5022                } while (q <= last);
5023            }
5024            writer.pos = pos;
5025        }
5026
5027        if (Py_UNICODE_IS_SURROGATE(ch)) {
5028            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5029            startinpos = ((const char *)q) - starts;
5030            endinpos = startinpos + 4;
5031        }
5032        else if (ch <= maxch) {
5033            if (q == e || consumed)
5034                break;
5035            /* remaining bytes at the end? (size should be divisible by 4) */
5036            errmsg = "truncated data";
5037            startinpos = ((const char *)q) - starts;
5038            endinpos = ((const char *)e) - starts;
5039        }
5040        else {
5041            if (ch < 0x110000) {
5042                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5043                    goto onError;
5044                q += 4;
5045                continue;
5046            }
5047            errmsg = "code point not in range(0x110000)";
5048            startinpos = ((const char *)q) - starts;
5049            endinpos = startinpos + 4;
5050        }
5051
5052        /* The remaining input chars are ignored if the callback
5053           chooses to skip the input */
5054        if (unicode_decode_call_errorhandler_writer(
5055                errors, &errorHandler,
5056                encoding, errmsg,
5057                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5058                &writer))
5059            goto onError;
5060    }
5061
5062    if (consumed)
5063        *consumed = (const char *)q-starts;
5064
5065    Py_XDECREF(errorHandler);
5066    Py_XDECREF(exc);
5067    return _PyUnicodeWriter_Finish(&writer);
5068
5069  onError:
5070    _PyUnicodeWriter_Dealloc(&writer);
5071    Py_XDECREF(errorHandler);
5072    Py_XDECREF(exc);
5073    return NULL;
5074}
5075
5076PyObject *
5077_PyUnicode_EncodeUTF32(PyObject *str,
5078                       const char *errors,
5079                       int byteorder)
5080{
5081    enum PyUnicode_Kind kind;
5082    const void *data;
5083    Py_ssize_t len;
5084    PyObject *v;
5085    PY_UINT32_T *out;
5086#if PY_LITTLE_ENDIAN
5087    int native_ordering = byteorder <= 0;
5088#else
5089    int native_ordering = byteorder >= 0;
5090#endif
5091    const char *encoding;
5092    Py_ssize_t nsize, pos;
5093    PyObject *errorHandler = NULL;
5094    PyObject *exc = NULL;
5095    PyObject *rep = NULL;
5096
5097    if (!PyUnicode_Check(str)) {
5098        PyErr_BadArgument();
5099        return NULL;
5100    }
5101    if (PyUnicode_READY(str) == -1)
5102        return NULL;
5103    kind = PyUnicode_KIND(str);
5104    data = PyUnicode_DATA(str);
5105    len = PyUnicode_GET_LENGTH(str);
5106
5107    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5108        return PyErr_NoMemory();
5109    nsize = len + (byteorder == 0);
5110    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5111    if (v == NULL)
5112        return NULL;
5113
5114    /* output buffer is 4-bytes aligned */
5115    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5116    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
5117    if (byteorder == 0)
5118        *out++ = 0xFEFF;
5119    if (len == 0)
5120        goto done;
5121
5122    if (byteorder == -1)
5123        encoding = "utf-32-le";
5124    else if (byteorder == 1)
5125        encoding = "utf-32-be";
5126    else
5127        encoding = "utf-32";
5128
5129    if (kind == PyUnicode_1BYTE_KIND) {
5130        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5131        goto done;
5132    }
5133
5134    pos = 0;
5135    while (pos < len) {
5136        Py_ssize_t repsize, moreunits;
5137
5138        if (kind == PyUnicode_2BYTE_KIND) {
5139            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5140                                        &out, native_ordering);
5141        }
5142        else {
5143            assert(kind == PyUnicode_4BYTE_KIND);
5144            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5145                                        &out, native_ordering);
5146        }
5147        if (pos == len)
5148            break;
5149
5150        rep = unicode_encode_call_errorhandler(
5151                errors, &errorHandler,
5152                encoding, "surrogates not allowed",
5153                str, &exc, pos, pos + 1, &pos);
5154        if (!rep)
5155            goto error;
5156
5157        if (PyBytes_Check(rep)) {
5158            repsize = PyBytes_GET_SIZE(rep);
5159            if (repsize & 3) {
5160                raise_encode_exception(&exc, encoding,
5161                                       str, pos - 1, pos,
5162                                       "surrogates not allowed");
5163                goto error;
5164            }
5165            moreunits = repsize / 4;
5166        }
5167        else {
5168            assert(PyUnicode_Check(rep));
5169            if (PyUnicode_READY(rep) < 0)
5170                goto error;
5171            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5172            if (!PyUnicode_IS_ASCII(rep)) {
5173                raise_encode_exception(&exc, encoding,
5174                                       str, pos - 1, pos,
5175                                       "surrogates not allowed");
5176                goto error;
5177            }
5178        }
5179
5180        /* four bytes are reserved for each surrogate */
5181        if (moreunits > 1) {
5182            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
5183            Py_ssize_t morebytes = 4 * (moreunits - 1);
5184            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5185                /* integer overflow */
5186                PyErr_NoMemory();
5187                goto error;
5188            }
5189            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5190                goto error;
5191            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
5192        }
5193
5194        if (PyBytes_Check(rep)) {
5195            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5196            out += moreunits;
5197        } else /* rep is unicode */ {
5198            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5199            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5200                                 &out, native_ordering);
5201        }
5202
5203        Py_CLEAR(rep);
5204    }
5205
5206    /* Cut back to size actually needed. This is necessary for, for example,
5207       encoding of a string containing isolated surrogates and the 'ignore'
5208       handler is used. */
5209    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5210    if (nsize != PyBytes_GET_SIZE(v))
5211      _PyBytes_Resize(&v, nsize);
5212    Py_XDECREF(errorHandler);
5213    Py_XDECREF(exc);
5214  done:
5215    return v;
5216  error:
5217    Py_XDECREF(rep);
5218    Py_XDECREF(errorHandler);
5219    Py_XDECREF(exc);
5220    Py_XDECREF(v);
5221    return NULL;
5222}
5223
5224PyObject *
5225PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5226                      Py_ssize_t size,
5227                      const char *errors,
5228                      int byteorder)
5229{
5230    PyObject *result;
5231    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5232    if (tmp == NULL)
5233        return NULL;
5234    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5235    Py_DECREF(tmp);
5236    return result;
5237}
5238
5239PyObject *
5240PyUnicode_AsUTF32String(PyObject *unicode)
5241{
5242    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5243}
5244
5245/* --- UTF-16 Codec ------------------------------------------------------- */
5246
5247PyObject *
5248PyUnicode_DecodeUTF16(const char *s,
5249                      Py_ssize_t size,
5250                      const char *errors,
5251                      int *byteorder)
5252{
5253    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5254}
5255
5256PyObject *
5257PyUnicode_DecodeUTF16Stateful(const char *s,
5258                              Py_ssize_t size,
5259                              const char *errors,
5260                              int *byteorder,
5261                              Py_ssize_t *consumed)
5262{
5263    const char *starts = s;
5264    Py_ssize_t startinpos;
5265    Py_ssize_t endinpos;
5266    _PyUnicodeWriter writer;
5267    const unsigned char *q, *e;
5268    int bo = 0;       /* assume native ordering by default */
5269    int native_ordering;
5270    const char *errmsg = "";
5271    PyObject *errorHandler = NULL;
5272    PyObject *exc = NULL;
5273    const char *encoding;
5274
5275    q = (unsigned char *)s;
5276    e = q + size;
5277
5278    if (byteorder)
5279        bo = *byteorder;
5280
5281    /* Check for BOM marks (U+FEFF) in the input and adjust current
5282       byte order setting accordingly. In native mode, the leading BOM
5283       mark is skipped, in all other modes, it is copied to the output
5284       stream as-is (giving a ZWNBSP character). */
5285    if (bo == 0 && size >= 2) {
5286        const Py_UCS4 bom = (q[1] << 8) | q[0];
5287        if (bom == 0xFEFF) {
5288            q += 2;
5289            bo = -1;
5290        }
5291        else if (bom == 0xFFFE) {
5292            q += 2;
5293            bo = 1;
5294        }
5295        if (byteorder)
5296            *byteorder = bo;
5297    }
5298
5299    if (q == e) {
5300        if (consumed)
5301            *consumed = size;
5302        _Py_RETURN_UNICODE_EMPTY();
5303    }
5304
5305#if PY_LITTLE_ENDIAN
5306    native_ordering = bo <= 0;
5307    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5308#else
5309    native_ordering = bo >= 0;
5310    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5311#endif
5312
5313    /* Note: size will always be longer than the resulting Unicode
5314       character count */
5315    _PyUnicodeWriter_Init(&writer);
5316    writer.min_length = (e - q + 1) / 2;
5317    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5318        goto onError;
5319
5320    while (1) {
5321        Py_UCS4 ch = 0;
5322        if (e - q >= 2) {
5323            int kind = writer.kind;
5324            if (kind == PyUnicode_1BYTE_KIND) {
5325                if (PyUnicode_IS_ASCII(writer.buffer))
5326                    ch = asciilib_utf16_decode(&q, e,
5327                            (Py_UCS1*)writer.data, &writer.pos,
5328                            native_ordering);
5329                else
5330                    ch = ucs1lib_utf16_decode(&q, e,
5331                            (Py_UCS1*)writer.data, &writer.pos,
5332                            native_ordering);
5333            } else if (kind == PyUnicode_2BYTE_KIND) {
5334                ch = ucs2lib_utf16_decode(&q, e,
5335                        (Py_UCS2*)writer.data, &writer.pos,
5336                        native_ordering);
5337            } else {
5338                assert(kind == PyUnicode_4BYTE_KIND);
5339                ch = ucs4lib_utf16_decode(&q, e,
5340                        (Py_UCS4*)writer.data, &writer.pos,
5341                        native_ordering);
5342            }
5343        }
5344
5345        switch (ch)
5346        {
5347        case 0:
5348            /* remaining byte at the end? (size should be even) */
5349            if (q == e || consumed)
5350                goto End;
5351            errmsg = "truncated data";
5352            startinpos = ((const char *)q) - starts;
5353            endinpos = ((const char *)e) - starts;
5354            break;
5355            /* The remaining input chars are ignored if the callback
5356               chooses to skip the input */
5357        case 1:
5358            q -= 2;
5359            if (consumed)
5360                goto End;
5361            errmsg = "unexpected end of data";
5362            startinpos = ((const char *)q) - starts;
5363            endinpos = ((const char *)e) - starts;
5364            break;
5365        case 2:
5366            errmsg = "illegal encoding";
5367            startinpos = ((const char *)q) - 2 - starts;
5368            endinpos = startinpos + 2;
5369            break;
5370        case 3:
5371            errmsg = "illegal UTF-16 surrogate";
5372            startinpos = ((const char *)q) - 4 - starts;
5373            endinpos = startinpos + 2;
5374            break;
5375        default:
5376            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5377                goto onError;
5378            continue;
5379        }
5380
5381        if (unicode_decode_call_errorhandler_writer(
5382                errors,
5383                &errorHandler,
5384                encoding, errmsg,
5385                &starts,
5386                (const char **)&e,
5387                &startinpos,
5388                &endinpos,
5389                &exc,
5390                (const char **)&q,
5391                &writer))
5392            goto onError;
5393    }
5394
5395End:
5396    if (consumed)
5397        *consumed = (const char *)q-starts;
5398
5399    Py_XDECREF(errorHandler);
5400    Py_XDECREF(exc);
5401    return _PyUnicodeWriter_Finish(&writer);
5402
5403  onError:
5404    _PyUnicodeWriter_Dealloc(&writer);
5405    Py_XDECREF(errorHandler);
5406    Py_XDECREF(exc);
5407    return NULL;
5408}
5409
5410PyObject *
5411_PyUnicode_EncodeUTF16(PyObject *str,
5412                       const char *errors,
5413                       int byteorder)
5414{
5415    enum PyUnicode_Kind kind;
5416    const void *data;
5417    Py_ssize_t len;
5418    PyObject *v;
5419    unsigned short *out;
5420    Py_ssize_t pairs;
5421#if PY_BIG_ENDIAN
5422    int native_ordering = byteorder >= 0;
5423#else
5424    int native_ordering = byteorder <= 0;
5425#endif
5426    const char *encoding;
5427    Py_ssize_t nsize, pos;
5428    PyObject *errorHandler = NULL;
5429    PyObject *exc = NULL;
5430    PyObject *rep = NULL;
5431
5432    if (!PyUnicode_Check(str)) {
5433        PyErr_BadArgument();
5434        return NULL;
5435    }
5436    if (PyUnicode_READY(str) == -1)
5437        return NULL;
5438    kind = PyUnicode_KIND(str);
5439    data = PyUnicode_DATA(str);
5440    len = PyUnicode_GET_LENGTH(str);
5441
5442    pairs = 0;
5443    if (kind == PyUnicode_4BYTE_KIND) {
5444        const Py_UCS4 *in = (const Py_UCS4 *)data;
5445        const Py_UCS4 *end = in + len;
5446        while (in < end)
5447            if (*in++ >= 0x10000)
5448                pairs++;
5449    }
5450    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5451        return PyErr_NoMemory();
5452    nsize = len + pairs + (byteorder == 0);
5453    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5454    if (v == NULL)
5455        return NULL;
5456
5457    /* output buffer is 2-bytes aligned */
5458    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5459    out = (unsigned short *)PyBytes_AS_STRING(v);
5460    if (byteorder == 0)
5461        *out++ = 0xFEFF;
5462    if (len == 0)
5463        goto done;
5464
5465    if (kind == PyUnicode_1BYTE_KIND) {
5466        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5467        goto done;
5468    }
5469
5470    if (byteorder < 0)
5471        encoding = "utf-16-le";
5472    else if (byteorder > 0)
5473        encoding = "utf-16-be";
5474    else
5475        encoding = "utf-16";
5476
5477    pos = 0;
5478    while (pos < len) {
5479        Py_ssize_t repsize, moreunits;
5480
5481        if (kind == PyUnicode_2BYTE_KIND) {
5482            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5483                                        &out, native_ordering);
5484        }
5485        else {
5486            assert(kind == PyUnicode_4BYTE_KIND);
5487            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5488                                        &out, native_ordering);
5489        }
5490        if (pos == len)
5491            break;
5492
5493        rep = unicode_encode_call_errorhandler(
5494                errors, &errorHandler,
5495                encoding, "surrogates not allowed",
5496                str, &exc, pos, pos + 1, &pos);
5497        if (!rep)
5498            goto error;
5499
5500        if (PyBytes_Check(rep)) {
5501            repsize = PyBytes_GET_SIZE(rep);
5502            if (repsize & 1) {
5503                raise_encode_exception(&exc, encoding,
5504                                       str, pos - 1, pos,
5505                                       "surrogates not allowed");
5506                goto error;
5507            }
5508            moreunits = repsize / 2;
5509        }
5510        else {
5511            assert(PyUnicode_Check(rep));
5512            if (PyUnicode_READY(rep) < 0)
5513                goto error;
5514            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5515            if (!PyUnicode_IS_ASCII(rep)) {
5516                raise_encode_exception(&exc, encoding,
5517                                       str, pos - 1, pos,
5518                                       "surrogates not allowed");
5519                goto error;
5520            }
5521        }
5522
5523        /* two bytes are reserved for each surrogate */
5524        if (moreunits > 1) {
5525            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5526            Py_ssize_t morebytes = 2 * (moreunits - 1);
5527            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5528                /* integer overflow */
5529                PyErr_NoMemory();
5530                goto error;
5531            }
5532            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5533                goto error;
5534            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5535        }
5536
5537        if (PyBytes_Check(rep)) {
5538            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5539            out += moreunits;
5540        } else /* rep is unicode */ {
5541            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5542            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5543                                 &out, native_ordering);
5544        }
5545
5546        Py_CLEAR(rep);
5547    }
5548
5549    /* Cut back to size actually needed. This is necessary for, for example,
5550    encoding of a string containing isolated surrogates and the 'ignore' handler
5551    is used. */
5552    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5553    if (nsize != PyBytes_GET_SIZE(v))
5554      _PyBytes_Resize(&v, nsize);
5555    Py_XDECREF(errorHandler);
5556    Py_XDECREF(exc);
5557  done:
5558    return v;
5559  error:
5560    Py_XDECREF(rep);
5561    Py_XDECREF(errorHandler);
5562    Py_XDECREF(exc);
5563    Py_XDECREF(v);
5564    return NULL;
5565#undef STORECHAR
5566}
5567
5568PyObject *
5569PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5570                      Py_ssize_t size,
5571                      const char *errors,
5572                      int byteorder)
5573{
5574    PyObject *result;
5575    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5576    if (tmp == NULL)
5577        return NULL;
5578    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5579    Py_DECREF(tmp);
5580    return result;
5581}
5582
5583PyObject *
5584PyUnicode_AsUTF16String(PyObject *unicode)
5585{
5586    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5587}
5588
5589/* --- Unicode Escape Codec ----------------------------------------------- */
5590
/* Helper function for PyUnicode_DecodeUnicodeEscape.

   Scans the size bytes at s and decides whether the text, once its
   backslash escapes are resolved, is still plain ASCII.

   Returns the number of characters the decoded string will have, or -1
   when that cannot be guaranteed: size is negative, a non-ASCII byte is
   present, a backslash is the last byte, or an octal, \x, \u, \U or \N
   escape is used.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t count = 0;
    Py_ssize_t i = 0;

    if (size < 0)
        return -1;

    while (i < size) {
        unsigned char c = bytes[i++];

        if (c > 127)
            return -1;          /* non-ASCII */
        if (c != '\\') {
            count++;            /* ordinary character */
            continue;
        }

        /* A backslash must be followed by an ASCII byte. */
        if (i >= size)
            return -1;
        c = bytes[i++];
        if (c > 127)
            return -1;

        switch (c) {
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* numeric and named escapes may produce non-ASCII */
            return -1;
        case '\n':
            /* line continuation: contributes nothing */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* simple escape: one resulting character */
            count++;
            break;
        default:
            /* unknown escape: backslash and character are both kept */
            count += 2;
            break;
        }
    }
    return count;
}
5645
/* File-local pointer to a _PyUnicode_Name_CAPI table, NULL until something
   assigns it.  NOTE(review): presumably filled in on demand by the \N{...}
   escape handling of the unicode-escape decoder -- the code that sets and
   uses it is not visible from this definition; confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5647
5648PyObject *
5649PyUnicode_DecodeUnicodeEscape(const char *s,
5650                              Py_ssize_t size,
5651                              const char *errors)
5652{
5653    const char *starts = s;
5654    Py_ssize_t startinpos;
5655    Py_ssize_t endinpos;
5656    _PyUnicodeWriter writer;
5657    const char *end;
5658    char* message;
5659    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5660    PyObject *errorHandler = NULL;
5661    PyObject *exc = NULL;
5662    Py_ssize_t len;
5663
5664    len = length_of_escaped_ascii_string(s, size);
5665    if (len == 0)
5666        _Py_RETURN_UNICODE_EMPTY();
5667
5668    /* After length_of_escaped_ascii_string() there are two alternatives,
5669       either the string is pure ASCII with named escapes like \n, etc.
5670       and we determined it's exact size (common case)
5671       or it contains \x, \u, ... escape sequences.  then we create a
5672       legacy wchar string and resize it at the end of this function. */
5673    _PyUnicodeWriter_Init(&writer);
5674    if (len > 0) {
5675        writer.min_length = len;
5676    }
5677    else {
5678        /* Escaped strings will always be longer than the resulting
5679           Unicode string, so we start with size here and then reduce the
5680           length after conversion to the true value.
5681           (but if the error callback returns a long replacement string
5682           we'll have to allocate more space) */
5683        writer.min_length = size;
5684    }
5685
5686    if (size == 0)
5687        return _PyUnicodeWriter_Finish(&writer);
5688    end = s + size;
5689
5690    while (s < end) {
5691        unsigned char c;
5692        Py_UCS4 x;
5693        int digits;
5694
5695        /* Non-escape characters are interpreted as Unicode ordinals */
5696        if (*s != '\\') {
5697            x = (unsigned char)*s;
5698            s++;
5699            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5700                goto onError;
5701            continue;
5702        }
5703
5704        startinpos = s-starts;
5705        /* \ - Escapes */
5706        s++;
5707        c = *s++;
5708        if (s > end)
5709            c = '\0'; /* Invalid after \ */
5710
5711        switch (c) {
5712
5713            /* \x escapes */
5714#define WRITECHAR(ch)                                                      \
5715            do {                                                           \
5716                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5717                    goto onError;                                          \
5718            } while(0)
5719
5720        case '\n': break;
5721        case '\\': WRITECHAR('\\'); break;
5722        case '\'': WRITECHAR('\''); break;
5723        case '\"': WRITECHAR('\"'); break;
5724        case 'b': WRITECHAR('\b'); break;
5725        /* FF */
5726        case 'f': WRITECHAR('\014'); break;
5727        case 't': WRITECHAR('\t'); break;
5728        case 'n': WRITECHAR('\n'); break;
5729        case 'r': WRITECHAR('\r'); break;
5730        /* VT */
5731        case 'v': WRITECHAR('\013'); break;
5732        /* BEL, not classic C */
5733        case 'a': WRITECHAR('\007'); break;
5734
5735            /* \OOO (octal) escapes */
5736        case '0': case '1': case '2': case '3':
5737        case '4': case '5': case '6': case '7':
5738            x = s[-1] - '0';
5739            if (s < end && '0' <= *s && *s <= '7') {
5740                x = (x<<3) + *s++ - '0';
5741                if (s < end && '0' <= *s && *s <= '7')
5742                    x = (x<<3) + *s++ - '0';
5743            }
5744            WRITECHAR(x);
5745            break;
5746
5747            /* hex escapes */
5748            /* \xXX */
5749        case 'x':
5750            digits = 2;
5751            message = "truncated \\xXX escape";
5752            goto hexescape;
5753
5754            /* \uXXXX */
5755        case 'u':
5756            digits = 4;
5757            message = "truncated \\uXXXX escape";
5758            goto hexescape;
5759
5760            /* \UXXXXXXXX */
5761        case 'U':
5762            digits = 8;
5763            message = "truncated \\UXXXXXXXX escape";
5764        hexescape:
5765            chr = 0;
5766            if (end - s < digits) {
5767                /* count only hex digits */
5768                for (; s < end; ++s) {
5769                    c = (unsigned char)*s;
5770                    if (!Py_ISXDIGIT(c))
5771                        goto error;
5772                }
5773                goto error;
5774            }
5775            for (; digits--; ++s) {
5776                c = (unsigned char)*s;
5777                if (!Py_ISXDIGIT(c))
5778                    goto error;
5779                chr = (chr<<4) & ~0xF;
5780                if (c >= '0' && c <= '9')
5781                    chr += c - '0';
5782                else if (c >= 'a' && c <= 'f')
5783                    chr += 10 + c - 'a';
5784                else
5785                    chr += 10 + c - 'A';
5786            }
5787            if (chr == 0xffffffff && PyErr_Occurred())
5788                /* _decoding_error will have already written into the
5789                   target buffer. */
5790                break;
5791        store:
5792            /* when we get here, chr is a 32-bit unicode character */
5793            message = "illegal Unicode character";
5794            if (chr > MAX_UNICODE)
5795                goto error;
5796            WRITECHAR(chr);
5797            break;
5798
5799            /* \N{name} */
5800        case 'N':
5801            message = "malformed \\N character escape";
5802            if (ucnhash_CAPI == NULL) {
5803                /* load the unicode data module */
5804                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5805                                                PyUnicodeData_CAPSULE_NAME, 1);
5806                if (ucnhash_CAPI == NULL)
5807                    goto ucnhashError;
5808            }
5809            if (*s == '{') {
5810                const char *start = s+1;
5811                /* look for the closing brace */
5812                while (*s != '}' && s < end)
5813                    s++;
5814                if (s > start && s < end && *s == '}') {
5815                    /* found a name.  look it up in the unicode database */
5816                    message = "unknown Unicode character name";
5817                    s++;
5818                    if (s - start - 1 <= INT_MAX &&
5819                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5820                                              &chr, 0))
5821                        goto store;
5822                }
5823            }
5824            goto error;
5825
5826        default:
5827            if (s > end) {
5828                message = "\\ at end of string";
5829                s--;
5830                goto error;
5831            }
5832            else {
5833                WRITECHAR('\\');
5834                WRITECHAR((unsigned char)s[-1]);
5835            }
5836            break;
5837        }
5838        continue;
5839
5840      error:
5841        endinpos = s-starts;
5842        if (unicode_decode_call_errorhandler_writer(
5843                errors, &errorHandler,
5844                "unicodeescape", message,
5845                &starts, &end, &startinpos, &endinpos, &exc, &s,
5846                &writer))
5847            goto onError;
5848        continue;
5849    }
5850#undef WRITECHAR
5851
5852    Py_XDECREF(errorHandler);
5853    Py_XDECREF(exc);
5854    return _PyUnicodeWriter_Finish(&writer);
5855
5856  ucnhashError:
5857    PyErr_SetString(
5858        PyExc_UnicodeError,
5859        "\\N escapes not supported (can't load unicodedata module)"
5860        );
5861    _PyUnicodeWriter_Dealloc(&writer);
5862    Py_XDECREF(errorHandler);
5863    Py_XDECREF(exc);
5864    return NULL;
5865
5866  onError:
5867    _PyUnicodeWriter_Dealloc(&writer);
5868    Py_XDECREF(errorHandler);
5869    Py_XDECREF(exc);
5870    return NULL;
5871}
5872
5873/* Return a Unicode-Escape string version of the Unicode object.
5874
5875   If quotes is true, the string is enclosed in u"" or u'' quotes as
5876   appropriate.
5877
5878*/
5879
5880PyObject *
5881PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5882{
5883    Py_ssize_t i, len;
5884    PyObject *repr;
5885    char *p;
5886    int kind;
5887    void *data;
5888    Py_ssize_t expandsize = 0;
5889
5890    /* Initial allocation is based on the longest-possible character
5891       escape.
5892
5893       For UCS1 strings it's '\xxx', 4 bytes per source character.
5894       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5895       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5896    */
5897
5898    if (!PyUnicode_Check(unicode)) {
5899        PyErr_BadArgument();
5900        return NULL;
5901    }
5902    if (PyUnicode_READY(unicode) == -1)
5903        return NULL;
5904    len = PyUnicode_GET_LENGTH(unicode);
5905    kind = PyUnicode_KIND(unicode);
5906    data = PyUnicode_DATA(unicode);
5907    switch (kind) {
5908    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5909    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5910    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5911    }
5912
5913    if (len == 0)
5914        return PyBytes_FromStringAndSize(NULL, 0);
5915
5916    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5917        return PyErr_NoMemory();
5918
5919    repr = PyBytes_FromStringAndSize(NULL,
5920                                     2
5921                                     + expandsize*len
5922                                     + 1);
5923    if (repr == NULL)
5924        return NULL;
5925
5926    p = PyBytes_AS_STRING(repr);
5927
5928    for (i = 0; i < len; i++) {
5929        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5930
5931        /* Escape backslashes */
5932        if (ch == '\\') {
5933            *p++ = '\\';
5934            *p++ = (char) ch;
5935            continue;
5936        }
5937
5938        /* Map 21-bit characters to '\U00xxxxxx' */
5939        else if (ch >= 0x10000) {
5940            assert(ch <= MAX_UNICODE);
5941            *p++ = '\\';
5942            *p++ = 'U';
5943            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5944            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5945            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5946            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5947            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5948            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5949            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5950            *p++ = Py_hexdigits[ch & 0x0000000F];
5951            continue;
5952        }
5953
5954        /* Map 16-bit characters to '\uxxxx' */
5955        if (ch >= 256) {
5956            *p++ = '\\';
5957            *p++ = 'u';
5958            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5959            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5960            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5961            *p++ = Py_hexdigits[ch & 0x000F];
5962        }
5963
5964        /* Map special whitespace to '\t', \n', '\r' */
5965        else if (ch == '\t') {
5966            *p++ = '\\';
5967            *p++ = 't';
5968        }
5969        else if (ch == '\n') {
5970            *p++ = '\\';
5971            *p++ = 'n';
5972        }
5973        else if (ch == '\r') {
5974            *p++ = '\\';
5975            *p++ = 'r';
5976        }
5977
5978        /* Map non-printable US ASCII to '\xhh' */
5979        else if (ch < ' ' || ch >= 0x7F) {
5980            *p++ = '\\';
5981            *p++ = 'x';
5982            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5983            *p++ = Py_hexdigits[ch & 0x000F];
5984        }
5985
5986        /* Copy everything else as-is */
5987        else
5988            *p++ = (char) ch;
5989    }
5990
5991    assert(p - PyBytes_AS_STRING(repr) > 0);
5992    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5993        return NULL;
5994    return repr;
5995}
5996
5997PyObject *
5998PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5999                              Py_ssize_t size)
6000{
6001    PyObject *result;
6002    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6003    if (tmp == NULL)
6004        return NULL;
6005    result = PyUnicode_AsUnicodeEscapeString(tmp);
6006    Py_DECREF(tmp);
6007    return result;
6008}
6009
6010/* --- Raw Unicode Escape Codec ------------------------------------------- */
6011
6012PyObject *
6013PyUnicode_DecodeRawUnicodeEscape(const char *s,
6014                                 Py_ssize_t size,
6015                                 const char *errors)
6016{
6017    const char *starts = s;
6018    Py_ssize_t startinpos;
6019    Py_ssize_t endinpos;
6020    _PyUnicodeWriter writer;
6021    const char *end;
6022    const char *bs;
6023    PyObject *errorHandler = NULL;
6024    PyObject *exc = NULL;
6025
6026    if (size == 0)
6027        _Py_RETURN_UNICODE_EMPTY();
6028
6029    /* Escaped strings will always be longer than the resulting
6030       Unicode string, so we start with size here and then reduce the
6031       length after conversion to the true value. (But decoding error
6032       handler might have to resize the string) */
6033    _PyUnicodeWriter_Init(&writer);
6034    writer.min_length = size;
6035
6036    end = s + size;
6037    while (s < end) {
6038        unsigned char c;
6039        Py_UCS4 x;
6040        int i;
6041        int count;
6042
6043        /* Non-escape characters are interpreted as Unicode ordinals */
6044        if (*s != '\\') {
6045            x = (unsigned char)*s++;
6046            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6047                goto onError;
6048            continue;
6049        }
6050        startinpos = s-starts;
6051
6052        /* \u-escapes are only interpreted iff the number of leading
6053           backslashes if odd */
6054        bs = s;
6055        for (;s < end;) {
6056            if (*s != '\\')
6057                break;
6058            x = (unsigned char)*s++;
6059            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6060                goto onError;
6061        }
6062        if (((s - bs) & 1) == 0 ||
6063            s >= end ||
6064            (*s != 'u' && *s != 'U')) {
6065            continue;
6066        }
6067        writer.pos--;
6068        count = *s=='u' ? 4 : 8;
6069        s++;
6070
6071        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6072        for (x = 0, i = 0; i < count; ++i, ++s) {
6073            c = (unsigned char)*s;
6074            if (!Py_ISXDIGIT(c)) {
6075                endinpos = s-starts;
6076                if (unicode_decode_call_errorhandler_writer(
6077                        errors, &errorHandler,
6078                        "rawunicodeescape", "truncated \\uXXXX",
6079                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6080                        &writer))
6081                    goto onError;
6082                goto nextByte;
6083            }
6084            x = (x<<4) & ~0xF;
6085            if (c >= '0' && c <= '9')
6086                x += c - '0';
6087            else if (c >= 'a' && c <= 'f')
6088                x += 10 + c - 'a';
6089            else
6090                x += 10 + c - 'A';
6091        }
6092        if (x <= MAX_UNICODE) {
6093            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6094                goto onError;
6095        }
6096        else {
6097            endinpos = s-starts;
6098            if (unicode_decode_call_errorhandler_writer(
6099                    errors, &errorHandler,
6100                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6101                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6102                    &writer))
6103                goto onError;
6104        }
6105      nextByte:
6106        ;
6107    }
6108    Py_XDECREF(errorHandler);
6109    Py_XDECREF(exc);
6110    return _PyUnicodeWriter_Finish(&writer);
6111
6112  onError:
6113    _PyUnicodeWriter_Dealloc(&writer);
6114    Py_XDECREF(errorHandler);
6115    Py_XDECREF(exc);
6116    return NULL;
6117}
6118
6119
6120PyObject *
6121PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6122{
6123    PyObject *repr;
6124    char *p;
6125    char *q;
6126    Py_ssize_t expandsize, pos;
6127    int kind;
6128    void *data;
6129    Py_ssize_t len;
6130
6131    if (!PyUnicode_Check(unicode)) {
6132        PyErr_BadArgument();
6133        return NULL;
6134    }
6135    if (PyUnicode_READY(unicode) == -1)
6136        return NULL;
6137    kind = PyUnicode_KIND(unicode);
6138    data = PyUnicode_DATA(unicode);
6139    len = PyUnicode_GET_LENGTH(unicode);
6140    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6141       bytes, and 1 byte characters 4. */
6142    expandsize = kind * 2 + 2;
6143
6144    if (len > PY_SSIZE_T_MAX / expandsize)
6145        return PyErr_NoMemory();
6146
6147    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6148    if (repr == NULL)
6149        return NULL;
6150    if (len == 0)
6151        return repr;
6152
6153    p = q = PyBytes_AS_STRING(repr);
6154    for (pos = 0; pos < len; pos++) {
6155        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6156        /* Map 32-bit characters to '\Uxxxxxxxx' */
6157        if (ch >= 0x10000) {
6158            assert(ch <= MAX_UNICODE);
6159            *p++ = '\\';
6160            *p++ = 'U';
6161            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6162            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6163            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6164            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6165            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6166            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6167            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6168            *p++ = Py_hexdigits[ch & 15];
6169        }
6170        /* Map 16-bit characters to '\uxxxx' */
6171        else if (ch >= 256) {
6172            *p++ = '\\';
6173            *p++ = 'u';
6174            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6175            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6176            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6177            *p++ = Py_hexdigits[ch & 15];
6178        }
6179        /* Copy everything else as-is */
6180        else
6181            *p++ = (char) ch;
6182    }
6183
6184    assert(p > q);
6185    if (_PyBytes_Resize(&repr, p - q) < 0)
6186        return NULL;
6187    return repr;
6188}
6189
6190PyObject *
6191PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6192                                 Py_ssize_t size)
6193{
6194    PyObject *result;
6195    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6196    if (tmp == NULL)
6197        return NULL;
6198    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6199    Py_DECREF(tmp);
6200    return result;
6201}
6202
6203/* --- Unicode Internal Codec ------------------------------------------- */
6204
6205PyObject *
6206_PyUnicode_DecodeUnicodeInternal(const char *s,
6207                                 Py_ssize_t size,
6208                                 const char *errors)
6209{
6210    const char *starts = s;
6211    Py_ssize_t startinpos;
6212    Py_ssize_t endinpos;
6213    _PyUnicodeWriter writer;
6214    const char *end;
6215    const char *reason;
6216    PyObject *errorHandler = NULL;
6217    PyObject *exc = NULL;
6218
6219    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6220                     "unicode_internal codec has been deprecated",
6221                     1))
6222        return NULL;
6223
6224    if (size == 0)
6225        _Py_RETURN_UNICODE_EMPTY();
6226
6227    _PyUnicodeWriter_Init(&writer);
6228    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6229        PyErr_NoMemory();
6230        goto onError;
6231    }
6232    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6233
6234    end = s + size;
6235    while (s < end) {
6236        Py_UNICODE uch;
6237        Py_UCS4 ch;
6238        if (end - s < Py_UNICODE_SIZE) {
6239            endinpos = end-starts;
6240            reason = "truncated input";
6241            goto error;
6242        }
6243        /* We copy the raw representation one byte at a time because the
6244           pointer may be unaligned (see test_codeccallbacks). */
6245        ((char *) &uch)[0] = s[0];
6246        ((char *) &uch)[1] = s[1];
6247#ifdef Py_UNICODE_WIDE
6248        ((char *) &uch)[2] = s[2];
6249        ((char *) &uch)[3] = s[3];
6250#endif
6251        ch = uch;
6252#ifdef Py_UNICODE_WIDE
6253        /* We have to sanity check the raw data, otherwise doom looms for
6254           some malformed UCS-4 data. */
6255        if (ch > 0x10ffff) {
6256            endinpos = s - starts + Py_UNICODE_SIZE;
6257            reason = "illegal code point (> 0x10FFFF)";
6258            goto error;
6259        }
6260#endif
6261        s += Py_UNICODE_SIZE;
6262#ifndef Py_UNICODE_WIDE
6263        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6264        {
6265            Py_UNICODE uch2;
6266            ((char *) &uch2)[0] = s[0];
6267            ((char *) &uch2)[1] = s[1];
6268            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6269            {
6270                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6271                s += Py_UNICODE_SIZE;
6272            }
6273        }
6274#endif
6275
6276        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6277            goto onError;
6278        continue;
6279
6280  error:
6281        startinpos = s - starts;
6282        if (unicode_decode_call_errorhandler_writer(
6283                errors, &errorHandler,
6284                "unicode_internal", reason,
6285                &starts, &end, &startinpos, &endinpos, &exc, &s,
6286                &writer))
6287            goto onError;
6288    }
6289
6290    Py_XDECREF(errorHandler);
6291    Py_XDECREF(exc);
6292    return _PyUnicodeWriter_Finish(&writer);
6293
6294  onError:
6295    _PyUnicodeWriter_Dealloc(&writer);
6296    Py_XDECREF(errorHandler);
6297    Py_XDECREF(exc);
6298    return NULL;
6299}
6300
6301/* --- Latin-1 Codec ------------------------------------------------------ */
6302
6303PyObject *
6304PyUnicode_DecodeLatin1(const char *s,
6305                       Py_ssize_t size,
6306                       const char *errors)
6307{
6308    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6309    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6310}
6311
6312/* create or adjust a UnicodeEncodeError */
6313static void
6314make_encode_exception(PyObject **exceptionObject,
6315                      const char *encoding,
6316                      PyObject *unicode,
6317                      Py_ssize_t startpos, Py_ssize_t endpos,
6318                      const char *reason)
6319{
6320    if (*exceptionObject == NULL) {
6321        *exceptionObject = PyObject_CallFunction(
6322            PyExc_UnicodeEncodeError, "sOnns",
6323            encoding, unicode, startpos, endpos, reason);
6324    }
6325    else {
6326        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6327            goto onError;
6328        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6329            goto onError;
6330        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6331            goto onError;
6332        return;
6333      onError:
6334        Py_CLEAR(*exceptionObject);
6335    }
6336}
6337
6338/* raises a UnicodeEncodeError */
6339static void
6340raise_encode_exception(PyObject **exceptionObject,
6341                       const char *encoding,
6342                       PyObject *unicode,
6343                       Py_ssize_t startpos, Py_ssize_t endpos,
6344                       const char *reason)
6345{
6346    make_encode_exception(exceptionObject,
6347                          encoding, unicode, startpos, endpos, reason);
6348    if (*exceptionObject != NULL)
6349        PyCodec_StrictErrors(*exceptionObject);
6350}
6351
6352/* error handling callback helper:
6353   build arguments, call the callback and check the arguments,
6354   put the result into newpos and return the replacement string, which
6355   has to be freed by the caller */
6356static PyObject *
6357unicode_encode_call_errorhandler(const char *errors,
6358                                 PyObject **errorHandler,
6359                                 const char *encoding, const char *reason,
6360                                 PyObject *unicode, PyObject **exceptionObject,
6361                                 Py_ssize_t startpos, Py_ssize_t endpos,
6362                                 Py_ssize_t *newpos)
6363{
6364    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6365    Py_ssize_t len;
6366    PyObject *restuple;
6367    PyObject *resunicode;
6368
6369    if (*errorHandler == NULL) {
6370        *errorHandler = PyCodec_LookupError(errors);
6371        if (*errorHandler == NULL)
6372            return NULL;
6373    }
6374
6375    if (PyUnicode_READY(unicode) == -1)
6376        return NULL;
6377    len = PyUnicode_GET_LENGTH(unicode);
6378
6379    make_encode_exception(exceptionObject,
6380                          encoding, unicode, startpos, endpos, reason);
6381    if (*exceptionObject == NULL)
6382        return NULL;
6383
6384    restuple = PyObject_CallFunctionObjArgs(
6385        *errorHandler, *exceptionObject, NULL);
6386    if (restuple == NULL)
6387        return NULL;
6388    if (!PyTuple_Check(restuple)) {
6389        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6390        Py_DECREF(restuple);
6391        return NULL;
6392    }
6393    if (!PyArg_ParseTuple(restuple, argparse,
6394                          &resunicode, newpos)) {
6395        Py_DECREF(restuple);
6396        return NULL;
6397    }
6398    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6399        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6400        Py_DECREF(restuple);
6401        return NULL;
6402    }
6403    if (*newpos<0)
6404        *newpos = len + *newpos;
6405    if (*newpos<0 || *newpos>len) {
6406        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6407        Py_DECREF(restuple);
6408        return NULL;
6409    }
6410    Py_INCREF(resunicode);
6411    Py_DECREF(restuple);
6412    return resunicode;
6413}
6414
6415static PyObject *
6416unicode_encode_ucs1(PyObject *unicode,
6417                    const char *errors,
6418                    const Py_UCS4 limit)
6419{
6420    /* input state */
6421    Py_ssize_t pos=0, size;
6422    int kind;
6423    void *data;
6424    /* output object */
6425    PyObject *res;
6426    /* pointer into the output */
6427    char *str;
6428    /* current output position */
6429    Py_ssize_t ressize;
6430    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6431    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6432    PyObject *error_handler_obj = NULL;
6433    PyObject *exc = NULL;
6434    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6435
6436    if (PyUnicode_READY(unicode) == -1)
6437        return NULL;
6438    size = PyUnicode_GET_LENGTH(unicode);
6439    kind = PyUnicode_KIND(unicode);
6440    data = PyUnicode_DATA(unicode);
6441    /* allocate enough for a simple encoding without
6442       replacements, if we need more, we'll resize */
6443    if (size == 0)
6444        return PyBytes_FromStringAndSize(NULL, 0);
6445    res = PyBytes_FromStringAndSize(NULL, size);
6446    if (res == NULL)
6447        return NULL;
6448    str = PyBytes_AS_STRING(res);
6449    ressize = size;
6450
6451    while (pos < size) {
6452        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6453
6454        /* can we encode this? */
6455        if (ch < limit) {
6456            /* no overflow check, because we know that the space is enough */
6457            *str++ = (char)ch;
6458            ++pos;
6459        }
6460        else {
6461            Py_ssize_t requiredsize;
6462            PyObject *repunicode;
6463            Py_ssize_t repsize, newpos, respos, i;
6464            /* startpos for collecting unencodable chars */
6465            Py_ssize_t collstart = pos;
6466            Py_ssize_t collend = pos;
6467            /* find all unecodable characters */
6468
6469            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6470                ++collend;
6471
6472            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6473            if (error_handler == _Py_ERROR_UNKNOWN)
6474                error_handler = get_error_handler(errors);
6475
6476            switch (error_handler) {
6477            case _Py_ERROR_STRICT:
6478                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6479                goto onError;
6480
6481            case _Py_ERROR_REPLACE:
6482                while (collstart++ < collend)
6483                    *str++ = '?';
6484                /* fall through ignore error handler */
6485            case _Py_ERROR_IGNORE:
6486                pos = collend;
6487                break;
6488
6489            case _Py_ERROR_XMLCHARREFREPLACE:
6490                respos = str - PyBytes_AS_STRING(res);
6491                requiredsize = respos;
6492                /* determine replacement size */
6493                for (i = collstart; i < collend; ++i) {
6494                    Py_ssize_t incr;
6495
6496                    ch = PyUnicode_READ(kind, data, i);
6497                    if (ch < 10)
6498                        incr = 2+1+1;
6499                    else if (ch < 100)
6500                        incr = 2+2+1;
6501                    else if (ch < 1000)
6502                        incr = 2+3+1;
6503                    else if (ch < 10000)
6504                        incr = 2+4+1;
6505                    else if (ch < 100000)
6506                        incr = 2+5+1;
6507                    else if (ch < 1000000)
6508                        incr = 2+6+1;
6509                    else {
6510                        assert(ch <= MAX_UNICODE);
6511                        incr = 2+7+1;
6512                    }
6513                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6514                        goto overflow;
6515                    requiredsize += incr;
6516                }
6517                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6518                    goto overflow;
6519                requiredsize += size - collend;
6520                if (requiredsize > ressize) {
6521                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6522                        requiredsize = 2*ressize;
6523                    if (_PyBytes_Resize(&res, requiredsize))
6524                        goto onError;
6525                    str = PyBytes_AS_STRING(res) + respos;
6526                    ressize = requiredsize;
6527                }
6528                /* generate replacement */
6529                for (i = collstart; i < collend; ++i) {
6530                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6531                }
6532                pos = collend;
6533                break;
6534
6535            default:
6536                repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6537                                                              encoding, reason, unicode, &exc,
6538                                                              collstart, collend, &newpos);
6539                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6540                                           PyUnicode_READY(repunicode) == -1))
6541                    goto onError;
6542
6543                if (PyBytes_Check(repunicode)) {
6544                    /* Directly copy bytes result to output. */
6545                    repsize = PyBytes_Size(repunicode);
6546                    if (repsize > 1) {
6547                        /* Make room for all additional bytes. */
6548                        respos = str - PyBytes_AS_STRING(res);
6549                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6550                            Py_DECREF(repunicode);
6551                            goto overflow;
6552                        }
6553                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6554                            Py_DECREF(repunicode);
6555                            goto onError;
6556                        }
6557                        str = PyBytes_AS_STRING(res) + respos;
6558                        ressize += repsize-1;
6559                    }
6560                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6561                    str += repsize;
6562                    pos = newpos;
6563                    Py_DECREF(repunicode);
6564                    break;
6565                }
6566
6567                /* need more space? (at least enough for what we
6568                   have+the replacement+the rest of the string, so
6569                   we won't have to check space for encodable characters) */
6570                respos = str - PyBytes_AS_STRING(res);
6571                repsize = PyUnicode_GET_LENGTH(repunicode);
6572                requiredsize = respos;
6573                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6574                    goto overflow;
6575                requiredsize += repsize;
6576                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6577                    goto overflow;
6578                requiredsize += size - collend;
6579                if (requiredsize > ressize) {
6580                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6581                        requiredsize = 2*ressize;
6582                    if (_PyBytes_Resize(&res, requiredsize)) {
6583                        Py_DECREF(repunicode);
6584                        goto onError;
6585                    }
6586                    str = PyBytes_AS_STRING(res) + respos;
6587                    ressize = requiredsize;
6588                }
6589
6590                /* check if there is anything unencodable in the replacement
6591                   and copy it to the output */
6592                for (i = 0; repsize-->0; ++i, ++str) {
6593                    ch = PyUnicode_READ_CHAR(repunicode, i);
6594                    if (ch >= limit) {
6595                        raise_encode_exception(&exc, encoding, unicode,
6596                                               pos, pos+1, reason);
6597                        Py_DECREF(repunicode);
6598                        goto onError;
6599                    }
6600                    *str = (char)ch;
6601                }
6602                pos = newpos;
6603                Py_DECREF(repunicode);
6604            }
6605        }
6606    }
6607    /* Resize if we allocated to much */
6608    size = str - PyBytes_AS_STRING(res);
6609    if (size < ressize) { /* If this falls res will be NULL */
6610        assert(size >= 0);
6611        if (_PyBytes_Resize(&res, size) < 0)
6612            goto onError;
6613    }
6614
6615    Py_XDECREF(error_handler_obj);
6616    Py_XDECREF(exc);
6617    return res;
6618
6619  overflow:
6620    PyErr_SetString(PyExc_OverflowError,
6621                    "encoded result is too long for a Python string");
6622
6623  onError:
6624    Py_XDECREF(res);
6625    Py_XDECREF(error_handler_obj);
6626    Py_XDECREF(exc);
6627    return NULL;
6628}
6629
6630/* Deprecated */
6631PyObject *
6632PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6633                       Py_ssize_t size,
6634                       const char *errors)
6635{
6636    PyObject *result;
6637    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6638    if (unicode == NULL)
6639        return NULL;
6640    result = unicode_encode_ucs1(unicode, errors, 256);
6641    Py_DECREF(unicode);
6642    return result;
6643}
6644
6645PyObject *
6646_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6647{
6648    if (!PyUnicode_Check(unicode)) {
6649        PyErr_BadArgument();
6650        return NULL;
6651    }
6652    if (PyUnicode_READY(unicode) == -1)
6653        return NULL;
6654    /* Fast path: if it is a one-byte string, construct
6655       bytes object directly. */
6656    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6657        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6658                                         PyUnicode_GET_LENGTH(unicode));
6659    /* Non-Latin-1 characters present. Defer to above function to
6660       raise the exception. */
6661    return unicode_encode_ucs1(unicode, errors, 256);
6662}
6663
6664PyObject*
6665PyUnicode_AsLatin1String(PyObject *unicode)
6666{
6667    return _PyUnicode_AsLatin1String(unicode, NULL);
6668}
6669
6670/* --- 7-bit ASCII Codec -------------------------------------------------- */
6671
6672PyObject *
6673PyUnicode_DecodeASCII(const char *s,
6674                      Py_ssize_t size,
6675                      const char *errors)
6676{
6677    const char *starts = s;
6678    _PyUnicodeWriter writer;
6679    int kind;
6680    void *data;
6681    Py_ssize_t startinpos;
6682    Py_ssize_t endinpos;
6683    Py_ssize_t outpos;
6684    const char *e;
6685    PyObject *error_handler_obj = NULL;
6686    PyObject *exc = NULL;
6687    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6688
6689    if (size == 0)
6690        _Py_RETURN_UNICODE_EMPTY();
6691
6692    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6693    if (size == 1 && (unsigned char)s[0] < 128)
6694        return get_latin1_char((unsigned char)s[0]);
6695
6696    _PyUnicodeWriter_Init(&writer);
6697    writer.min_length = size;
6698    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6699        return NULL;
6700
6701    e = s + size;
6702    data = writer.data;
6703    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6704    writer.pos = outpos;
6705    if (writer.pos == size)
6706        return _PyUnicodeWriter_Finish(&writer);
6707
6708    s += writer.pos;
6709    kind = writer.kind;
6710    while (s < e) {
6711        unsigned char c = (unsigned char)*s;
6712        if (c < 128) {
6713            PyUnicode_WRITE(kind, data, writer.pos, c);
6714            writer.pos++;
6715            ++s;
6716            continue;
6717        }
6718
6719        /* byte outsize range 0x00..0x7f: call the error handler */
6720
6721        if (error_handler == _Py_ERROR_UNKNOWN)
6722            error_handler = get_error_handler(errors);
6723
6724        switch (error_handler)
6725        {
6726        case _Py_ERROR_REPLACE:
6727        case _Py_ERROR_SURROGATEESCAPE:
6728            /* Fast-path: the error handler only writes one character,
6729               but we may switch to UCS2 at the first write */
6730            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6731                goto onError;
6732            kind = writer.kind;
6733            data = writer.data;
6734
6735            if (error_handler == _Py_ERROR_REPLACE)
6736                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6737            else
6738                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6739            writer.pos++;
6740            ++s;
6741            break;
6742
6743        case _Py_ERROR_IGNORE:
6744            ++s;
6745            break;
6746
6747        default:
6748            startinpos = s-starts;
6749            endinpos = startinpos + 1;
6750            if (unicode_decode_call_errorhandler_writer(
6751                    errors, &error_handler_obj,
6752                    "ascii", "ordinal not in range(128)",
6753                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6754                    &writer))
6755                goto onError;
6756            kind = writer.kind;
6757            data = writer.data;
6758        }
6759    }
6760    Py_XDECREF(error_handler_obj);
6761    Py_XDECREF(exc);
6762    return _PyUnicodeWriter_Finish(&writer);
6763
6764  onError:
6765    _PyUnicodeWriter_Dealloc(&writer);
6766    Py_XDECREF(error_handler_obj);
6767    Py_XDECREF(exc);
6768    return NULL;
6769}
6770
6771/* Deprecated */
6772PyObject *
6773PyUnicode_EncodeASCII(const Py_UNICODE *p,
6774                      Py_ssize_t size,
6775                      const char *errors)
6776{
6777    PyObject *result;
6778    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6779    if (unicode == NULL)
6780        return NULL;
6781    result = unicode_encode_ucs1(unicode, errors, 128);
6782    Py_DECREF(unicode);
6783    return result;
6784}
6785
6786PyObject *
6787_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6788{
6789    if (!PyUnicode_Check(unicode)) {
6790        PyErr_BadArgument();
6791        return NULL;
6792    }
6793    if (PyUnicode_READY(unicode) == -1)
6794        return NULL;
6795    /* Fast path: if it is an ASCII-only string, construct bytes object
6796       directly. Else defer to above function to raise the exception. */
6797    if (PyUnicode_IS_ASCII(unicode))
6798        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6799                                         PyUnicode_GET_LENGTH(unicode));
6800    return unicode_encode_ucs1(unicode, errors, 128);
6801}
6802
6803PyObject *
6804PyUnicode_AsASCIIString(PyObject *unicode)
6805{
6806    return _PyUnicode_AsASCIIString(unicode, NULL);
6807}
6808
6809#ifdef HAVE_MBCS
6810
6811/* --- MBCS codecs for Windows -------------------------------------------- */
6812
/* NEED_RETRY is defined when int is narrower than size_t.  It enables the
   chunking branches in decode_code_page_stateful() (chunks of INT_MAX
   bytes) and encode_code_page() (chunks of INT_MAX/2 characters), which
   hand the input to the int-sized helpers piece by piece. */
6813#if SIZEOF_INT < SIZEOF_SIZE_T
6814#define NEED_RETRY
6815#endif
6816
/* Fallback definition for build environments whose headers do not
   provide WC_ERR_INVALID_CHARS; encode_code_page_flags() returns it for
   CP_UTF8. */
6817#ifndef WC_ERR_INVALID_CHARS
6818#  define WC_ERR_INVALID_CHARS 0x0080
6819#endif
6820
6821static char*
6822code_page_name(UINT code_page, PyObject **obj)
6823{
6824    *obj = NULL;
6825    if (code_page == CP_ACP)
6826        return "mbcs";
6827    if (code_page == CP_UTF7)
6828        return "CP_UTF7";
6829    if (code_page == CP_UTF8)
6830        return "CP_UTF8";
6831
6832    *obj = PyBytes_FromFormat("cp%u", code_page);
6833    if (*obj == NULL)
6834        return NULL;
6835    return PyBytes_AS_STRING(*obj);
6836}
6837
6838static DWORD
6839decode_code_page_flags(UINT code_page)
6840{
6841    if (code_page == CP_UTF7) {
6842        /* The CP_UTF7 decoder only supports flags=0 */
6843        return 0;
6844    }
6845    else
6846        return MB_ERR_INVALID_CHARS;
6847}
6848
6849/*
6850 * Decode a byte string from a Windows code page into unicode object in strict
6851 * mode.
6852 *
6853 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6854 * OSError and returns -1 on other error.
6855 */
6856static int
6857decode_code_page_strict(UINT code_page,
6858                        PyObject **v,
6859                        const char *in,
6860                        int insize)
6861{
6862    const DWORD flags = decode_code_page_flags(code_page);
6863    wchar_t *out;
6864    DWORD outsize;
6865
6866    /* First get the size of the result */
6867    assert(insize > 0);
6868    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6869    if (outsize <= 0)
6870        goto error;
6871
6872    if (*v == NULL) {
6873        /* Create unicode object */
6874        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6875        *v = (PyObject*)_PyUnicode_New(outsize);
6876        if (*v == NULL)
6877            return -1;
6878        out = PyUnicode_AS_UNICODE(*v);
6879    }
6880    else {
6881        /* Extend unicode object */
6882        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6883        if (unicode_resize(v, n + outsize) < 0)
6884            return -1;
6885        out = PyUnicode_AS_UNICODE(*v) + n;
6886    }
6887
6888    /* Do the conversion */
6889    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6890    if (outsize <= 0)
6891        goto error;
6892    return insize;
6893
6894error:
6895    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6896        return -2;
6897    PyErr_SetFromWindowsErr(0);
6898    return -1;
6899}
6900
6901/*
6902 * Decode a byte string from a code page into unicode object with an error
6903 * handler.
6904 *
6905 * Returns consumed size if succeed, or raise an OSError or
6906 * UnicodeDecodeError exception and returns -1 on error.
6907 */
6908static int
6909decode_code_page_errors(UINT code_page,
6910                        PyObject **v,
6911                        const char *in, const int size,
6912                        const char *errors, int final)
6913{
6914    const char *startin = in;
6915    const char *endin = in + size;
6916    const DWORD flags = decode_code_page_flags(code_page);
6917    /* Ideally, we should get reason from FormatMessage. This is the Windows
6918       2000 English version of the message. */
6919    const char *reason = "No mapping for the Unicode character exists "
6920                         "in the target code page.";
6921    /* each step cannot decode more than 1 character, but a character can be
6922       represented as a surrogate pair */
6923    wchar_t buffer[2], *startout, *out;
6924    int insize;
6925    Py_ssize_t outsize;
6926    PyObject *errorHandler = NULL;
6927    PyObject *exc = NULL;
6928    PyObject *encoding_obj = NULL;
6929    char *encoding;
6930    DWORD err;
6931    int ret = -1;
6932
6933    assert(size > 0);
6934
6935    encoding = code_page_name(code_page, &encoding_obj);
6936    if (encoding == NULL)
6937        return -1;
6938
6939    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6940        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6941           UnicodeDecodeError. */
6942        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6943        if (exc != NULL) {
6944            PyCodec_StrictErrors(exc);
6945            Py_CLEAR(exc);
6946        }
6947        goto error;
6948    }
6949
6950    if (*v == NULL) {
6951        /* Create unicode object */
6952        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6953            PyErr_NoMemory();
6954            goto error;
6955        }
6956        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6957        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6958        if (*v == NULL)
6959            goto error;
6960        startout = PyUnicode_AS_UNICODE(*v);
6961    }
6962    else {
6963        /* Extend unicode object */
6964        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6965        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6966            PyErr_NoMemory();
6967            goto error;
6968        }
6969        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6970            goto error;
6971        startout = PyUnicode_AS_UNICODE(*v) + n;
6972    }
6973
6974    /* Decode the byte string character per character */
6975    out = startout;
6976    while (in < endin)
6977    {
6978        /* Decode a character */
6979        insize = 1;
6980        do
6981        {
6982            outsize = MultiByteToWideChar(code_page, flags,
6983                                          in, insize,
6984                                          buffer, Py_ARRAY_LENGTH(buffer));
6985            if (outsize > 0)
6986                break;
6987            err = GetLastError();
6988            if (err != ERROR_NO_UNICODE_TRANSLATION
6989                && err != ERROR_INSUFFICIENT_BUFFER)
6990            {
6991                PyErr_SetFromWindowsErr(0);
6992                goto error;
6993            }
6994            insize++;
6995        }
6996        /* 4=maximum length of a UTF-8 sequence */
6997        while (insize <= 4 && (in + insize) <= endin);
6998
6999        if (outsize <= 0) {
7000            Py_ssize_t startinpos, endinpos, outpos;
7001
7002            /* last character in partial decode? */
7003            if (in + insize >= endin && !final)
7004                break;
7005
7006            startinpos = in - startin;
7007            endinpos = startinpos + 1;
7008            outpos = out - PyUnicode_AS_UNICODE(*v);
7009            if (unicode_decode_call_errorhandler_wchar(
7010                    errors, &errorHandler,
7011                    encoding, reason,
7012                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7013                    v, &outpos))
7014            {
7015                goto error;
7016            }
7017            out = PyUnicode_AS_UNICODE(*v) + outpos;
7018        }
7019        else {
7020            in += insize;
7021            memcpy(out, buffer, outsize * sizeof(wchar_t));
7022            out += outsize;
7023        }
7024    }
7025
7026    /* write a NUL character at the end */
7027    *out = 0;
7028
7029    /* Extend unicode object */
7030    outsize = out - startout;
7031    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7032    if (unicode_resize(v, outsize) < 0)
7033        goto error;
7034    /* (in - startin) <= size and size is an int */
7035    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7036
7037error:
7038    Py_XDECREF(encoding_obj);
7039    Py_XDECREF(errorHandler);
7040    Py_XDECREF(exc);
7041    return ret;
7042}
7043
7044static PyObject *
7045decode_code_page_stateful(int code_page,
7046                          const char *s, Py_ssize_t size,
7047                          const char *errors, Py_ssize_t *consumed)
7048{
7049    PyObject *v = NULL;
7050    int chunk_size, final, converted, done;
7051
7052    if (code_page < 0) {
7053        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7054        return NULL;
7055    }
7056
7057    if (consumed)
7058        *consumed = 0;
7059
7060    do
7061    {
7062#ifdef NEED_RETRY
7063        if (size > INT_MAX) {
7064            chunk_size = INT_MAX;
7065            final = 0;
7066            done = 0;
7067        }
7068        else
7069#endif
7070        {
7071            chunk_size = (int)size;
7072            final = (consumed == NULL);
7073            done = 1;
7074        }
7075
7076        if (chunk_size == 0 && done) {
7077            if (v != NULL)
7078                break;
7079            _Py_RETURN_UNICODE_EMPTY();
7080        }
7081
7082        converted = decode_code_page_strict(code_page, &v,
7083                                            s, chunk_size);
7084        if (converted == -2)
7085            converted = decode_code_page_errors(code_page, &v,
7086                                                s, chunk_size,
7087                                                errors, final);
7088        assert(converted != 0 || done);
7089
7090        if (converted < 0) {
7091            Py_XDECREF(v);
7092            return NULL;
7093        }
7094
7095        if (consumed)
7096            *consumed += converted;
7097
7098        s += converted;
7099        size -= converted;
7100    } while (!done);
7101
7102    return unicode_result(v);
7103}
7104
7105PyObject *
7106PyUnicode_DecodeCodePageStateful(int code_page,
7107                                 const char *s,
7108                                 Py_ssize_t size,
7109                                 const char *errors,
7110                                 Py_ssize_t *consumed)
7111{
7112    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7113}
7114
7115PyObject *
7116PyUnicode_DecodeMBCSStateful(const char *s,
7117                             Py_ssize_t size,
7118                             const char *errors,
7119                             Py_ssize_t *consumed)
7120{
7121    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7122}
7123
7124PyObject *
7125PyUnicode_DecodeMBCS(const char *s,
7126                     Py_ssize_t size,
7127                     const char *errors)
7128{
7129    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7130}
7131
7132static DWORD
7133encode_code_page_flags(UINT code_page, const char *errors)
7134{
7135    if (code_page == CP_UTF8) {
7136        return WC_ERR_INVALID_CHARS;
7137    }
7138    else if (code_page == CP_UTF7) {
7139        /* CP_UTF7 only supports flags=0 */
7140        return 0;
7141    }
7142    else {
7143        if (errors != NULL && strcmp(errors, "replace") == 0)
7144            return 0;
7145        else
7146            return WC_NO_BEST_FIT_CHARS;
7147    }
7148}
7149
7150/*
7151 * Encode a Unicode string to a Windows code page into a byte string in strict
7152 * mode.
7153 *
7154 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7155 * an OSError and returns -1 on other error.
7156 */
7157static int
7158encode_code_page_strict(UINT code_page, PyObject **outbytes,
7159                        PyObject *unicode, Py_ssize_t offset, int len,
7160                        const char* errors)
7161{
7162    BOOL usedDefaultChar = FALSE;
7163    BOOL *pusedDefaultChar = &usedDefaultChar;
7164    int outsize;
7165    PyObject *exc = NULL;
7166    wchar_t *p;
7167    Py_ssize_t size;
7168    const DWORD flags = encode_code_page_flags(code_page, NULL);
7169    char *out;
7170    /* Create a substring so that we can get the UTF-16 representation
7171       of just the slice under consideration. */
7172    PyObject *substring;
7173
7174    assert(len > 0);
7175
7176    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7177        pusedDefaultChar = &usedDefaultChar;
7178    else
7179        pusedDefaultChar = NULL;
7180
7181    substring = PyUnicode_Substring(unicode, offset, offset+len);
7182    if (substring == NULL)
7183        return -1;
7184    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7185    if (p == NULL) {
7186        Py_DECREF(substring);
7187        return -1;
7188    }
7189    assert(size <= INT_MAX);
7190
7191    /* First get the size of the result */
7192    outsize = WideCharToMultiByte(code_page, flags,
7193                                  p, (int)size,
7194                                  NULL, 0,
7195                                  NULL, pusedDefaultChar);
7196    if (outsize <= 0)
7197        goto error;
7198    /* If we used a default char, then we failed! */
7199    if (pusedDefaultChar && *pusedDefaultChar) {
7200        Py_DECREF(substring);
7201        return -2;
7202    }
7203
7204    if (*outbytes == NULL) {
7205        /* Create string object */
7206        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7207        if (*outbytes == NULL) {
7208            Py_DECREF(substring);
7209            return -1;
7210        }
7211        out = PyBytes_AS_STRING(*outbytes);
7212    }
7213    else {
7214        /* Extend string object */
7215        const Py_ssize_t n = PyBytes_Size(*outbytes);
7216        if (outsize > PY_SSIZE_T_MAX - n) {
7217            PyErr_NoMemory();
7218            Py_DECREF(substring);
7219            return -1;
7220        }
7221        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7222            Py_DECREF(substring);
7223            return -1;
7224        }
7225        out = PyBytes_AS_STRING(*outbytes) + n;
7226    }
7227
7228    /* Do the conversion */
7229    outsize = WideCharToMultiByte(code_page, flags,
7230                                  p, (int)size,
7231                                  out, outsize,
7232                                  NULL, pusedDefaultChar);
7233    Py_CLEAR(substring);
7234    if (outsize <= 0)
7235        goto error;
7236    if (pusedDefaultChar && *pusedDefaultChar)
7237        return -2;
7238    return 0;
7239
7240error:
7241    Py_XDECREF(substring);
7242    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7243        return -2;
7244    PyErr_SetFromWindowsErr(0);
7245    return -1;
7246}
7247
7248/*
7249 * Encode a Unicode string to a Windows code page into a byte string using a
7250 * error handler.
7251 *
7252 * Returns consumed characters if succeed, or raise an OSError and returns
7253 * -1 on other error.
7254 */
7255static int
7256encode_code_page_errors(UINT code_page, PyObject **outbytes,
7257                        PyObject *unicode, Py_ssize_t unicode_offset,
7258                        Py_ssize_t insize, const char* errors)
7259{
7260    const DWORD flags = encode_code_page_flags(code_page, errors);
7261    Py_ssize_t pos = unicode_offset;
7262    Py_ssize_t endin = unicode_offset + insize;
7263    /* Ideally, we should get reason from FormatMessage. This is the Windows
7264       2000 English version of the message. */
7265    const char *reason = "invalid character";
7266    /* 4=maximum length of a UTF-8 sequence */
7267    char buffer[4];
7268    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7269    Py_ssize_t outsize;
7270    char *out;
7271    PyObject *errorHandler = NULL;
7272    PyObject *exc = NULL;
7273    PyObject *encoding_obj = NULL;
7274    char *encoding;
7275    Py_ssize_t newpos, newoutsize;
7276    PyObject *rep;
7277    int ret = -1;
7278
7279    assert(insize > 0);
7280
7281    encoding = code_page_name(code_page, &encoding_obj);
7282    if (encoding == NULL)
7283        return -1;
7284
7285    if (errors == NULL || strcmp(errors, "strict") == 0) {
7286        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7287           then we raise a UnicodeEncodeError. */
7288        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7289        if (exc != NULL) {
7290            PyCodec_StrictErrors(exc);
7291            Py_DECREF(exc);
7292        }
7293        Py_XDECREF(encoding_obj);
7294        return -1;
7295    }
7296
7297    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7298        pusedDefaultChar = &usedDefaultChar;
7299    else
7300        pusedDefaultChar = NULL;
7301
7302    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7303        PyErr_NoMemory();
7304        goto error;
7305    }
7306    outsize = insize * Py_ARRAY_LENGTH(buffer);
7307
7308    if (*outbytes == NULL) {
7309        /* Create string object */
7310        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7311        if (*outbytes == NULL)
7312            goto error;
7313        out = PyBytes_AS_STRING(*outbytes);
7314    }
7315    else {
7316        /* Extend string object */
7317        Py_ssize_t n = PyBytes_Size(*outbytes);
7318        if (n > PY_SSIZE_T_MAX - outsize) {
7319            PyErr_NoMemory();
7320            goto error;
7321        }
7322        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7323            goto error;
7324        out = PyBytes_AS_STRING(*outbytes) + n;
7325    }
7326
7327    /* Encode the string character per character */
7328    while (pos < endin)
7329    {
7330        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7331        wchar_t chars[2];
7332        int charsize;
7333        if (ch < 0x10000) {
7334            chars[0] = (wchar_t)ch;
7335            charsize = 1;
7336        }
7337        else {
7338            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7339            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7340            charsize = 2;
7341        }
7342
7343        outsize = WideCharToMultiByte(code_page, flags,
7344                                      chars, charsize,
7345                                      buffer, Py_ARRAY_LENGTH(buffer),
7346                                      NULL, pusedDefaultChar);
7347        if (outsize > 0) {
7348            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7349            {
7350                pos++;
7351                memcpy(out, buffer, outsize);
7352                out += outsize;
7353                continue;
7354            }
7355        }
7356        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7357            PyErr_SetFromWindowsErr(0);
7358            goto error;
7359        }
7360
7361        rep = unicode_encode_call_errorhandler(
7362                  errors, &errorHandler, encoding, reason,
7363                  unicode, &exc,
7364                  pos, pos + 1, &newpos);
7365        if (rep == NULL)
7366            goto error;
7367        pos = newpos;
7368
7369        if (PyBytes_Check(rep)) {
7370            outsize = PyBytes_GET_SIZE(rep);
7371            if (outsize != 1) {
7372                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7373                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7374                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7375                    Py_DECREF(rep);
7376                    goto error;
7377                }
7378                out = PyBytes_AS_STRING(*outbytes) + offset;
7379            }
7380            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7381            out += outsize;
7382        }
7383        else {
7384            Py_ssize_t i;
7385            enum PyUnicode_Kind kind;
7386            void *data;
7387
7388            if (PyUnicode_READY(rep) == -1) {
7389                Py_DECREF(rep);
7390                goto error;
7391            }
7392
7393            outsize = PyUnicode_GET_LENGTH(rep);
7394            if (outsize != 1) {
7395                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7396                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7397                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7398                    Py_DECREF(rep);
7399                    goto error;
7400                }
7401                out = PyBytes_AS_STRING(*outbytes) + offset;
7402            }
7403            kind = PyUnicode_KIND(rep);
7404            data = PyUnicode_DATA(rep);
7405            for (i=0; i < outsize; i++) {
7406                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7407                if (ch > 127) {
7408                    raise_encode_exception(&exc,
7409                        encoding, unicode,
7410                        pos, pos + 1,
7411                        "unable to encode error handler result to ASCII");
7412                    Py_DECREF(rep);
7413                    goto error;
7414                }
7415                *out = (unsigned char)ch;
7416                out++;
7417            }
7418        }
7419        Py_DECREF(rep);
7420    }
7421    /* write a NUL byte */
7422    *out = 0;
7423    outsize = out - PyBytes_AS_STRING(*outbytes);
7424    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7425    if (_PyBytes_Resize(outbytes, outsize) < 0)
7426        goto error;
7427    ret = 0;
7428
7429error:
7430    Py_XDECREF(encoding_obj);
7431    Py_XDECREF(errorHandler);
7432    Py_XDECREF(exc);
7433    return ret;
7434}
7435
7436static PyObject *
7437encode_code_page(int code_page,
7438                 PyObject *unicode,
7439                 const char *errors)
7440{
7441    Py_ssize_t len;
7442    PyObject *outbytes = NULL;
7443    Py_ssize_t offset;
7444    int chunk_len, ret, done;
7445
7446    if (!PyUnicode_Check(unicode)) {
7447        PyErr_BadArgument();
7448        return NULL;
7449    }
7450
7451    if (PyUnicode_READY(unicode) == -1)
7452        return NULL;
7453    len = PyUnicode_GET_LENGTH(unicode);
7454
7455    if (code_page < 0) {
7456        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7457        return NULL;
7458    }
7459
7460    if (len == 0)
7461        return PyBytes_FromStringAndSize(NULL, 0);
7462
7463    offset = 0;
7464    do
7465    {
7466#ifdef NEED_RETRY
7467        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7468           chunks. */
7469        if (len > INT_MAX/2) {
7470            chunk_len = INT_MAX/2;
7471            done = 0;
7472        }
7473        else
7474#endif
7475        {
7476            chunk_len = (int)len;
7477            done = 1;
7478        }
7479
7480        ret = encode_code_page_strict(code_page, &outbytes,
7481                                      unicode, offset, chunk_len,
7482                                      errors);
7483        if (ret == -2)
7484            ret = encode_code_page_errors(code_page, &outbytes,
7485                                          unicode, offset,
7486                                          chunk_len, errors);
7487        if (ret < 0) {
7488            Py_XDECREF(outbytes);
7489            return NULL;
7490        }
7491
7492        offset += chunk_len;
7493        len -= chunk_len;
7494    } while (!done);
7495
7496    return outbytes;
7497}
7498
7499PyObject *
7500PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7501                     Py_ssize_t size,
7502                     const char *errors)
7503{
7504    PyObject *unicode, *res;
7505    unicode = PyUnicode_FromUnicode(p, size);
7506    if (unicode == NULL)
7507        return NULL;
7508    res = encode_code_page(CP_ACP, unicode, errors);
7509    Py_DECREF(unicode);
7510    return res;
7511}
7512
7513PyObject *
7514PyUnicode_EncodeCodePage(int code_page,
7515                         PyObject *unicode,
7516                         const char *errors)
7517{
7518    return encode_code_page(code_page, unicode, errors);
7519}
7520
7521PyObject *
7522PyUnicode_AsMBCSString(PyObject *unicode)
7523{
7524    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7525}
7526
7527#undef NEED_RETRY
7528
7529#endif /* HAVE_MBCS */
7530
7531/* --- Character Mapping Codec -------------------------------------------- */
7532
7533static int
7534charmap_decode_string(const char *s,
7535                      Py_ssize_t size,
7536                      PyObject *mapping,
7537                      const char *errors,
7538                      _PyUnicodeWriter *writer)
7539{
7540    const char *starts = s;
7541    const char *e;
7542    Py_ssize_t startinpos, endinpos;
7543    PyObject *errorHandler = NULL, *exc = NULL;
7544    Py_ssize_t maplen;
7545    enum PyUnicode_Kind mapkind;
7546    void *mapdata;
7547    Py_UCS4 x;
7548    unsigned char ch;
7549
7550    if (PyUnicode_READY(mapping) == -1)
7551        return -1;
7552
7553    maplen = PyUnicode_GET_LENGTH(mapping);
7554    mapdata = PyUnicode_DATA(mapping);
7555    mapkind = PyUnicode_KIND(mapping);
7556
7557    e = s + size;
7558
7559    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7560        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7561         * is disabled in encoding aliases, latin1 is preferred because
7562         * its implementation is faster. */
7563        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7564        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7565        Py_UCS4 maxchar = writer->maxchar;
7566
7567        assert (writer->kind == PyUnicode_1BYTE_KIND);
7568        while (s < e) {
7569            ch = *s;
7570            x = mapdata_ucs1[ch];
7571            if (x > maxchar) {
7572                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7573                    goto onError;
7574                maxchar = writer->maxchar;
7575                outdata = (Py_UCS1 *)writer->data;
7576            }
7577            outdata[writer->pos] = x;
7578            writer->pos++;
7579            ++s;
7580        }
7581        return 0;
7582    }
7583
7584    while (s < e) {
7585        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7586            enum PyUnicode_Kind outkind = writer->kind;
7587            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7588            if (outkind == PyUnicode_1BYTE_KIND) {
7589                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7590                Py_UCS4 maxchar = writer->maxchar;
7591                while (s < e) {
7592                    ch = *s;
7593                    x = mapdata_ucs2[ch];
7594                    if (x > maxchar)
7595                        goto Error;
7596                    outdata[writer->pos] = x;
7597                    writer->pos++;
7598                    ++s;
7599                }
7600                break;
7601            }
7602            else if (outkind == PyUnicode_2BYTE_KIND) {
7603                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7604                while (s < e) {
7605                    ch = *s;
7606                    x = mapdata_ucs2[ch];
7607                    if (x == 0xFFFE)
7608                        goto Error;
7609                    outdata[writer->pos] = x;
7610                    writer->pos++;
7611                    ++s;
7612                }
7613                break;
7614            }
7615        }
7616        ch = *s;
7617
7618        if (ch < maplen)
7619            x = PyUnicode_READ(mapkind, mapdata, ch);
7620        else
7621            x = 0xfffe; /* invalid value */
7622Error:
7623        if (x == 0xfffe)
7624        {
7625            /* undefined mapping */
7626            startinpos = s-starts;
7627            endinpos = startinpos+1;
7628            if (unicode_decode_call_errorhandler_writer(
7629                    errors, &errorHandler,
7630                    "charmap", "character maps to <undefined>",
7631                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7632                    writer)) {
7633                goto onError;
7634            }
7635            continue;
7636        }
7637
7638        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7639            goto onError;
7640        ++s;
7641    }
7642    Py_XDECREF(errorHandler);
7643    Py_XDECREF(exc);
7644    return 0;
7645
7646onError:
7647    Py_XDECREF(errorHandler);
7648    Py_XDECREF(exc);
7649    return -1;
7650}
7651
7652static int
7653charmap_decode_mapping(const char *s,
7654                       Py_ssize_t size,
7655                       PyObject *mapping,
7656                       const char *errors,
7657                       _PyUnicodeWriter *writer)
7658{
7659    const char *starts = s;
7660    const char *e;
7661    Py_ssize_t startinpos, endinpos;
7662    PyObject *errorHandler = NULL, *exc = NULL;
7663    unsigned char ch;
7664    PyObject *key, *item = NULL;
7665
7666    e = s + size;
7667
7668    while (s < e) {
7669        ch = *s;
7670
7671        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7672        key = PyLong_FromLong((long)ch);
7673        if (key == NULL)
7674            goto onError;
7675
7676        item = PyObject_GetItem(mapping, key);
7677        Py_DECREF(key);
7678        if (item == NULL) {
7679            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7680                /* No mapping found means: mapping is undefined. */
7681                PyErr_Clear();
7682                goto Undefined;
7683            } else
7684                goto onError;
7685        }
7686
7687        /* Apply mapping */
7688        if (item == Py_None)
7689            goto Undefined;
7690        if (PyLong_Check(item)) {
7691            long value = PyLong_AS_LONG(item);
7692            if (value == 0xFFFE)
7693                goto Undefined;
7694            if (value < 0 || value > MAX_UNICODE) {
7695                PyErr_Format(PyExc_TypeError,
7696                             "character mapping must be in range(0x%lx)",
7697                             (unsigned long)MAX_UNICODE + 1);
7698                goto onError;
7699            }
7700
7701            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7702                goto onError;
7703        }
7704        else if (PyUnicode_Check(item)) {
7705            if (PyUnicode_READY(item) == -1)
7706                goto onError;
7707            if (PyUnicode_GET_LENGTH(item) == 1) {
7708                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7709                if (value == 0xFFFE)
7710                    goto Undefined;
7711                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7712                    goto onError;
7713            }
7714            else {
7715                writer->overallocate = 1;
7716                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7717                    goto onError;
7718            }
7719        }
7720        else {
7721            /* wrong return value */
7722            PyErr_SetString(PyExc_TypeError,
7723                            "character mapping must return integer, None or str");
7724            goto onError;
7725        }
7726        Py_CLEAR(item);
7727        ++s;
7728        continue;
7729
7730Undefined:
7731        /* undefined mapping */
7732        Py_CLEAR(item);
7733        startinpos = s-starts;
7734        endinpos = startinpos+1;
7735        if (unicode_decode_call_errorhandler_writer(
7736                errors, &errorHandler,
7737                "charmap", "character maps to <undefined>",
7738                &starts, &e, &startinpos, &endinpos, &exc, &s,
7739                writer)) {
7740            goto onError;
7741        }
7742    }
7743    Py_XDECREF(errorHandler);
7744    Py_XDECREF(exc);
7745    return 0;
7746
7747onError:
7748    Py_XDECREF(item);
7749    Py_XDECREF(errorHandler);
7750    Py_XDECREF(exc);
7751    return -1;
7752}
7753
7754PyObject *
7755PyUnicode_DecodeCharmap(const char *s,
7756                        Py_ssize_t size,
7757                        PyObject *mapping,
7758                        const char *errors)
7759{
7760    _PyUnicodeWriter writer;
7761
7762    /* Default to Latin-1 */
7763    if (mapping == NULL)
7764        return PyUnicode_DecodeLatin1(s, size, errors);
7765
7766    if (size == 0)
7767        _Py_RETURN_UNICODE_EMPTY();
7768    _PyUnicodeWriter_Init(&writer);
7769    writer.min_length = size;
7770    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7771        goto onError;
7772
7773    if (PyUnicode_CheckExact(mapping)) {
7774        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7775            goto onError;
7776    }
7777    else {
7778        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7779            goto onError;
7780    }
7781    return _PyUnicodeWriter_Finish(&writer);
7782
7783  onError:
7784    _PyUnicodeWriter_Dealloc(&writer);
7785    return NULL;
7786}
7787
7788/* Charmap encoding: the lookup table */
7789
7790struct encoding_map {
7791    PyObject_HEAD
7792    unsigned char level1[32];
7793    int count2, count3;
7794    unsigned char level23[1];
7795};
7796
7797static PyObject*
7798encoding_map_size(PyObject *obj, PyObject* args)
7799{
7800    struct encoding_map *map = (struct encoding_map*)obj;
7801    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7802                           128*map->count3);
7803}
7804
7805static PyMethodDef encoding_map_methods[] = {
7806    {"size", encoding_map_size, METH_NOARGS,
7807     PyDoc_STR("Return the size (in bytes) of this object") },
7808    { 0 }
7809};
7810
7811static void
7812encoding_map_dealloc(PyObject* o)
7813{
7814    PyObject_FREE(o);
7815}
7816
7817static PyTypeObject EncodingMapType = {
7818    PyVarObject_HEAD_INIT(NULL, 0)
7819    "EncodingMap",          /*tp_name*/
7820    sizeof(struct encoding_map),   /*tp_basicsize*/
7821    0,                      /*tp_itemsize*/
7822    /* methods */
7823    encoding_map_dealloc,   /*tp_dealloc*/
7824    0,                      /*tp_print*/
7825    0,                      /*tp_getattr*/
7826    0,                      /*tp_setattr*/
7827    0,                      /*tp_reserved*/
7828    0,                      /*tp_repr*/
7829    0,                      /*tp_as_number*/
7830    0,                      /*tp_as_sequence*/
7831    0,                      /*tp_as_mapping*/
7832    0,                      /*tp_hash*/
7833    0,                      /*tp_call*/
7834    0,                      /*tp_str*/
7835    0,                      /*tp_getattro*/
7836    0,                      /*tp_setattro*/
7837    0,                      /*tp_as_buffer*/
7838    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7839    0,                      /*tp_doc*/
7840    0,                      /*tp_traverse*/
7841    0,                      /*tp_clear*/
7842    0,                      /*tp_richcompare*/
7843    0,                      /*tp_weaklistoffset*/
7844    0,                      /*tp_iter*/
7845    0,                      /*tp_iternext*/
7846    encoding_map_methods,   /*tp_methods*/
7847    0,                      /*tp_members*/
7848    0,                      /*tp_getset*/
7849    0,                      /*tp_base*/
7850    0,                      /*tp_dict*/
7851    0,                      /*tp_descr_get*/
7852    0,                      /*tp_descr_set*/
7853    0,                      /*tp_dictoffset*/
7854    0,                      /*tp_init*/
7855    0,                      /*tp_alloc*/
7856    0,                      /*tp_new*/
7857    0,                      /*tp_free*/
7858    0,                      /*tp_is_gc*/
7859};
7860
7861PyObject*
7862PyUnicode_BuildEncodingMap(PyObject* string)
7863{
7864    PyObject *result;
7865    struct encoding_map *mresult;
7866    int i;
7867    int need_dict = 0;
7868    unsigned char level1[32];
7869    unsigned char level2[512];
7870    unsigned char *mlevel1, *mlevel2, *mlevel3;
7871    int count2 = 0, count3 = 0;
7872    int kind;
7873    void *data;
7874    Py_ssize_t length;
7875    Py_UCS4 ch;
7876
7877    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7878        PyErr_BadArgument();
7879        return NULL;
7880    }
7881    kind = PyUnicode_KIND(string);
7882    data = PyUnicode_DATA(string);
7883    length = PyUnicode_GET_LENGTH(string);
7884    length = Py_MIN(length, 256);
7885    memset(level1, 0xFF, sizeof level1);
7886    memset(level2, 0xFF, sizeof level2);
7887
7888    /* If there isn't a one-to-one mapping of NULL to \0,
7889       or if there are non-BMP characters, we need to use
7890       a mapping dictionary. */
7891    if (PyUnicode_READ(kind, data, 0) != 0)
7892        need_dict = 1;
7893    for (i = 1; i < length; i++) {
7894        int l1, l2;
7895        ch = PyUnicode_READ(kind, data, i);
7896        if (ch == 0 || ch > 0xFFFF) {
7897            need_dict = 1;
7898            break;
7899        }
7900        if (ch == 0xFFFE)
7901            /* unmapped character */
7902            continue;
7903        l1 = ch >> 11;
7904        l2 = ch >> 7;
7905        if (level1[l1] == 0xFF)
7906            level1[l1] = count2++;
7907        if (level2[l2] == 0xFF)
7908            level2[l2] = count3++;
7909    }
7910
7911    if (count2 >= 0xFF || count3 >= 0xFF)
7912        need_dict = 1;
7913
7914    if (need_dict) {
7915        PyObject *result = PyDict_New();
7916        PyObject *key, *value;
7917        if (!result)
7918            return NULL;
7919        for (i = 0; i < length; i++) {
7920            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7921            value = PyLong_FromLong(i);
7922            if (!key || !value)
7923                goto failed1;
7924            if (PyDict_SetItem(result, key, value) == -1)
7925                goto failed1;
7926            Py_DECREF(key);
7927            Py_DECREF(value);
7928        }
7929        return result;
7930      failed1:
7931        Py_XDECREF(key);
7932        Py_XDECREF(value);
7933        Py_DECREF(result);
7934        return NULL;
7935    }
7936
7937    /* Create a three-level trie */
7938    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7939                             16*count2 + 128*count3 - 1);
7940    if (!result)
7941        return PyErr_NoMemory();
7942    PyObject_Init(result, &EncodingMapType);
7943    mresult = (struct encoding_map*)result;
7944    mresult->count2 = count2;
7945    mresult->count3 = count3;
7946    mlevel1 = mresult->level1;
7947    mlevel2 = mresult->level23;
7948    mlevel3 = mresult->level23 + 16*count2;
7949    memcpy(mlevel1, level1, 32);
7950    memset(mlevel2, 0xFF, 16*count2);
7951    memset(mlevel3, 0, 128*count3);
7952    count3 = 0;
7953    for (i = 1; i < length; i++) {
7954        int o1, o2, o3, i2, i3;
7955        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7956        if (ch == 0xFFFE)
7957            /* unmapped character */
7958            continue;
7959        o1 = ch>>11;
7960        o2 = (ch>>7) & 0xF;
7961        i2 = 16*mlevel1[o1] + o2;
7962        if (mlevel2[i2] == 0xFF)
7963            mlevel2[i2] = count3++;
7964        o3 = ch & 0x7F;
7965        i3 = 128*mlevel2[i2] + o3;
7966        mlevel3[i3] = i;
7967    }
7968    return result;
7969}
7970
7971static int
7972encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7973{
7974    struct encoding_map *map = (struct encoding_map*)mapping;
7975    int l1 = c>>11;
7976    int l2 = (c>>7) & 0xF;
7977    int l3 = c & 0x7F;
7978    int i;
7979
7980    if (c > 0xFFFF)
7981        return -1;
7982    if (c == 0)
7983        return 0;
7984    /* level 1*/
7985    i = map->level1[l1];
7986    if (i == 0xFF) {
7987        return -1;
7988    }
7989    /* level 2*/
7990    i = map->level23[16*i+l2];
7991    if (i == 0xFF) {
7992        return -1;
7993    }
7994    /* level 3 */
7995    i = map->level23[16*map->count2 + 128*i + l3];
7996    if (i == 0) {
7997        return -1;
7998    }
7999    return i;
8000}
8001
8002/* Lookup the character ch in the mapping. If the character
8003   can't be found, Py_None is returned (or NULL, if another
8004   error occurred). */
8005static PyObject *
8006charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8007{
8008    PyObject *w = PyLong_FromLong((long)c);
8009    PyObject *x;
8010
8011    if (w == NULL)
8012        return NULL;
8013    x = PyObject_GetItem(mapping, w);
8014    Py_DECREF(w);
8015    if (x == NULL) {
8016        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8017            /* No mapping found means: mapping is undefined. */
8018            PyErr_Clear();
8019            x = Py_None;
8020            Py_INCREF(x);
8021            return x;
8022        } else
8023            return NULL;
8024    }
8025    else if (x == Py_None)
8026        return x;
8027    else if (PyLong_Check(x)) {
8028        long value = PyLong_AS_LONG(x);
8029        if (value < 0 || value > 255) {
8030            PyErr_SetString(PyExc_TypeError,
8031                            "character mapping must be in range(256)");
8032            Py_DECREF(x);
8033            return NULL;
8034        }
8035        return x;
8036    }
8037    else if (PyBytes_Check(x))
8038        return x;
8039    else {
8040        /* wrong return value */
8041        PyErr_Format(PyExc_TypeError,
8042                     "character mapping must return integer, bytes or None, not %.400s",
8043                     x->ob_type->tp_name);
8044        Py_DECREF(x);
8045        return NULL;
8046    }
8047}
8048
8049static int
8050charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8051{
8052    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8053    /* exponentially overallocate to minimize reallocations */
8054    if (requiredsize < 2*outsize)
8055        requiredsize = 2*outsize;
8056    if (_PyBytes_Resize(outobj, requiredsize))
8057        return -1;
8058    return 0;
8059}
8060
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping is undefined for the character */
    enc_EXCEPTION   /* the lookup or the output resize failed */
} charmapencode_result;
8064/* lookup the character, put the result in the output string and adjust
8065   various state variables. Resize the output bytes object if not enough
8066   space is available. Return a new reference to the object that
8067   was put in the output buffer, or Py_None, if the mapping was undefined
8068   (in which case no character was written) or NULL, if a
8069   reallocation error occurred. The caller must decref the result */
8070static charmapencode_result
8071charmapencode_output(Py_UCS4 c, PyObject *mapping,
8072                     PyObject **outobj, Py_ssize_t *outpos)
8073{
8074    PyObject *rep;
8075    char *outstart;
8076    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8077
8078    if (Py_TYPE(mapping) == &EncodingMapType) {
8079        int res = encoding_map_lookup(c, mapping);
8080        Py_ssize_t requiredsize = *outpos+1;
8081        if (res == -1)
8082            return enc_FAILED;
8083        if (outsize<requiredsize)
8084            if (charmapencode_resize(outobj, outpos, requiredsize))
8085                return enc_EXCEPTION;
8086        outstart = PyBytes_AS_STRING(*outobj);
8087        outstart[(*outpos)++] = (char)res;
8088        return enc_SUCCESS;
8089    }
8090
8091    rep = charmapencode_lookup(c, mapping);
8092    if (rep==NULL)
8093        return enc_EXCEPTION;
8094    else if (rep==Py_None) {
8095        Py_DECREF(rep);
8096        return enc_FAILED;
8097    } else {
8098        if (PyLong_Check(rep)) {
8099            Py_ssize_t requiredsize = *outpos+1;
8100            if (outsize<requiredsize)
8101                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8102                    Py_DECREF(rep);
8103                    return enc_EXCEPTION;
8104                }
8105            outstart = PyBytes_AS_STRING(*outobj);
8106            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8107        }
8108        else {
8109            const char *repchars = PyBytes_AS_STRING(rep);
8110            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8111            Py_ssize_t requiredsize = *outpos+repsize;
8112            if (outsize<requiredsize)
8113                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8114                    Py_DECREF(rep);
8115                    return enc_EXCEPTION;
8116                }
8117            outstart = PyBytes_AS_STRING(*outobj);
8118            memcpy(outstart + *outpos, repchars, repsize);
8119            *outpos += repsize;
8120        }
8121    }
8122    Py_DECREF(rep);
8123    return enc_SUCCESS;
8124}
8125
8126/* handle an error in PyUnicode_EncodeCharmap
8127   Return 0 on success, -1 on error */
8128static int
8129charmap_encoding_error(
8130    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8131    PyObject **exceptionObject,
8132    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8133    PyObject **res, Py_ssize_t *respos)
8134{
8135    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8136    Py_ssize_t size, repsize;
8137    Py_ssize_t newpos;
8138    enum PyUnicode_Kind kind;
8139    void *data;
8140    Py_ssize_t index;
8141    /* startpos for collecting unencodable chars */
8142    Py_ssize_t collstartpos = *inpos;
8143    Py_ssize_t collendpos = *inpos+1;
8144    Py_ssize_t collpos;
8145    char *encoding = "charmap";
8146    char *reason = "character maps to <undefined>";
8147    charmapencode_result x;
8148    Py_UCS4 ch;
8149    int val;
8150
8151    if (PyUnicode_READY(unicode) == -1)
8152        return -1;
8153    size = PyUnicode_GET_LENGTH(unicode);
8154    /* find all unencodable characters */
8155    while (collendpos < size) {
8156        PyObject *rep;
8157        if (Py_TYPE(mapping) == &EncodingMapType) {
8158            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8159            val = encoding_map_lookup(ch, mapping);
8160            if (val != -1)
8161                break;
8162            ++collendpos;
8163            continue;
8164        }
8165
8166        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8167        rep = charmapencode_lookup(ch, mapping);
8168        if (rep==NULL)
8169            return -1;
8170        else if (rep!=Py_None) {
8171            Py_DECREF(rep);
8172            break;
8173        }
8174        Py_DECREF(rep);
8175        ++collendpos;
8176    }
8177    /* cache callback name lookup
8178     * (if not done yet, i.e. it's the first error) */
8179    if (*error_handler == _Py_ERROR_UNKNOWN)
8180        *error_handler = get_error_handler(errors);
8181
8182    switch (*error_handler) {
8183    case _Py_ERROR_STRICT:
8184        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8185        return -1;
8186
8187    case _Py_ERROR_REPLACE:
8188        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8189            x = charmapencode_output('?', mapping, res, respos);
8190            if (x==enc_EXCEPTION) {
8191                return -1;
8192            }
8193            else if (x==enc_FAILED) {
8194                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8195                return -1;
8196            }
8197        }
8198        /* fall through */
8199    case _Py_ERROR_IGNORE:
8200        *inpos = collendpos;
8201        break;
8202
8203    case _Py_ERROR_XMLCHARREFREPLACE:
8204        /* generate replacement (temporarily (mis)uses p) */
8205        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8206            char buffer[2+29+1+1];
8207            char *cp;
8208            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8209            for (cp = buffer; *cp; ++cp) {
8210                x = charmapencode_output(*cp, mapping, res, respos);
8211                if (x==enc_EXCEPTION)
8212                    return -1;
8213                else if (x==enc_FAILED) {
8214                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8215                    return -1;
8216                }
8217            }
8218        }
8219        *inpos = collendpos;
8220        break;
8221
8222    default:
8223        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8224                                                      encoding, reason, unicode, exceptionObject,
8225                                                      collstartpos, collendpos, &newpos);
8226        if (repunicode == NULL)
8227            return -1;
8228        if (PyBytes_Check(repunicode)) {
8229            /* Directly copy bytes result to output. */
8230            Py_ssize_t outsize = PyBytes_Size(*res);
8231            Py_ssize_t requiredsize;
8232            repsize = PyBytes_Size(repunicode);
8233            requiredsize = *respos + repsize;
8234            if (requiredsize > outsize)
8235                /* Make room for all additional bytes. */
8236                if (charmapencode_resize(res, respos, requiredsize)) {
8237                    Py_DECREF(repunicode);
8238                    return -1;
8239                }
8240            memcpy(PyBytes_AsString(*res) + *respos,
8241                   PyBytes_AsString(repunicode),  repsize);
8242            *respos += repsize;
8243            *inpos = newpos;
8244            Py_DECREF(repunicode);
8245            break;
8246        }
8247        /* generate replacement  */
8248        if (PyUnicode_READY(repunicode) == -1) {
8249            Py_DECREF(repunicode);
8250            return -1;
8251        }
8252        repsize = PyUnicode_GET_LENGTH(repunicode);
8253        data = PyUnicode_DATA(repunicode);
8254        kind = PyUnicode_KIND(repunicode);
8255        for (index = 0; index < repsize; index++) {
8256            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8257            x = charmapencode_output(repch, mapping, res, respos);
8258            if (x==enc_EXCEPTION) {
8259                Py_DECREF(repunicode);
8260                return -1;
8261            }
8262            else if (x==enc_FAILED) {
8263                Py_DECREF(repunicode);
8264                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8265                return -1;
8266            }
8267        }
8268        *inpos = newpos;
8269        Py_DECREF(repunicode);
8270    }
8271    return 0;
8272}
8273
8274PyObject *
8275_PyUnicode_EncodeCharmap(PyObject *unicode,
8276                         PyObject *mapping,
8277                         const char *errors)
8278{
8279    /* output object */
8280    PyObject *res = NULL;
8281    /* current input position */
8282    Py_ssize_t inpos = 0;
8283    Py_ssize_t size;
8284    /* current output position */
8285    Py_ssize_t respos = 0;
8286    PyObject *error_handler_obj = NULL;
8287    PyObject *exc = NULL;
8288    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8289    void *data;
8290    int kind;
8291
8292    if (PyUnicode_READY(unicode) == -1)
8293        return NULL;
8294    size = PyUnicode_GET_LENGTH(unicode);
8295    data = PyUnicode_DATA(unicode);
8296    kind = PyUnicode_KIND(unicode);
8297
8298    /* Default to Latin-1 */
8299    if (mapping == NULL)
8300        return unicode_encode_ucs1(unicode, errors, 256);
8301
8302    /* allocate enough for a simple encoding without
8303       replacements, if we need more, we'll resize */
8304    res = PyBytes_FromStringAndSize(NULL, size);
8305    if (res == NULL)
8306        goto onError;
8307    if (size == 0)
8308        return res;
8309
8310    while (inpos<size) {
8311        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8312        /* try to encode it */
8313        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8314        if (x==enc_EXCEPTION) /* error */
8315            goto onError;
8316        if (x==enc_FAILED) { /* unencodable character */
8317            if (charmap_encoding_error(unicode, &inpos, mapping,
8318                                       &exc,
8319                                       &error_handler, &error_handler_obj, errors,
8320                                       &res, &respos)) {
8321                goto onError;
8322            }
8323        }
8324        else
8325            /* done with this character => adjust input position */
8326            ++inpos;
8327    }
8328
8329    /* Resize if we allocated to much */
8330    if (respos<PyBytes_GET_SIZE(res))
8331        if (_PyBytes_Resize(&res, respos) < 0)
8332            goto onError;
8333
8334    Py_XDECREF(exc);
8335    Py_XDECREF(error_handler_obj);
8336    return res;
8337
8338  onError:
8339    Py_XDECREF(res);
8340    Py_XDECREF(exc);
8341    Py_XDECREF(error_handler_obj);
8342    return NULL;
8343}
8344
8345/* Deprecated */
8346PyObject *
8347PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8348                        Py_ssize_t size,
8349                        PyObject *mapping,
8350                        const char *errors)
8351{
8352    PyObject *result;
8353    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8354    if (unicode == NULL)
8355        return NULL;
8356    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8357    Py_DECREF(unicode);
8358    return result;
8359}
8360
8361PyObject *
8362PyUnicode_AsCharmapString(PyObject *unicode,
8363                          PyObject *mapping)
8364{
8365    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8366        PyErr_BadArgument();
8367        return NULL;
8368    }
8369    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8370}
8371
8372/* create or adjust a UnicodeTranslateError */
8373static void
8374make_translate_exception(PyObject **exceptionObject,
8375                         PyObject *unicode,
8376                         Py_ssize_t startpos, Py_ssize_t endpos,
8377                         const char *reason)
8378{
8379    if (*exceptionObject == NULL) {
8380        *exceptionObject = _PyUnicodeTranslateError_Create(
8381            unicode, startpos, endpos, reason);
8382    }
8383    else {
8384        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8385            goto onError;
8386        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8387            goto onError;
8388        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8389            goto onError;
8390        return;
8391      onError:
8392        Py_CLEAR(*exceptionObject);
8393    }
8394}
8395
8396/* error handling callback helper:
8397   build arguments, call the callback and check the arguments,
8398   put the result into newpos and return the replacement string, which
8399   has to be freed by the caller */
8400static PyObject *
8401unicode_translate_call_errorhandler(const char *errors,
8402                                    PyObject **errorHandler,
8403                                    const char *reason,
8404                                    PyObject *unicode, PyObject **exceptionObject,
8405                                    Py_ssize_t startpos, Py_ssize_t endpos,
8406                                    Py_ssize_t *newpos)
8407{
8408    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8409
8410    Py_ssize_t i_newpos;
8411    PyObject *restuple;
8412    PyObject *resunicode;
8413
8414    if (*errorHandler == NULL) {
8415        *errorHandler = PyCodec_LookupError(errors);
8416        if (*errorHandler == NULL)
8417            return NULL;
8418    }
8419
8420    make_translate_exception(exceptionObject,
8421                             unicode, startpos, endpos, reason);
8422    if (*exceptionObject == NULL)
8423        return NULL;
8424
8425    restuple = PyObject_CallFunctionObjArgs(
8426        *errorHandler, *exceptionObject, NULL);
8427    if (restuple == NULL)
8428        return NULL;
8429    if (!PyTuple_Check(restuple)) {
8430        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8431        Py_DECREF(restuple);
8432        return NULL;
8433    }
8434    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8435                          &resunicode, &i_newpos)) {
8436        Py_DECREF(restuple);
8437        return NULL;
8438    }
8439    if (i_newpos<0)
8440        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8441    else
8442        *newpos = i_newpos;
8443    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8444        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8445        Py_DECREF(restuple);
8446        return NULL;
8447    }
8448    Py_INCREF(resunicode);
8449    Py_DECREF(restuple);
8450    return resunicode;
8451}
8452
8453/* Lookup the character ch in the mapping and put the result in result,
8454   which must be decrefed by the caller.
8455   Return 0 on success, -1 on error */
8456static int
8457charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8458{
8459    PyObject *w = PyLong_FromLong((long)c);
8460    PyObject *x;
8461
8462    if (w == NULL)
8463        return -1;
8464    x = PyObject_GetItem(mapping, w);
8465    Py_DECREF(w);
8466    if (x == NULL) {
8467        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8468            /* No mapping found means: use 1:1 mapping. */
8469            PyErr_Clear();
8470            *result = NULL;
8471            return 0;
8472        } else
8473            return -1;
8474    }
8475    else if (x == Py_None) {
8476        *result = x;
8477        return 0;
8478    }
8479    else if (PyLong_Check(x)) {
8480        long value = PyLong_AS_LONG(x);
8481        if (value < 0 || value > MAX_UNICODE) {
8482            PyErr_Format(PyExc_ValueError,
8483                         "character mapping must be in range(0x%x)",
8484                         MAX_UNICODE+1);
8485            Py_DECREF(x);
8486            return -1;
8487        }
8488        *result = x;
8489        return 0;
8490    }
8491    else if (PyUnicode_Check(x)) {
8492        *result = x;
8493        return 0;
8494    }
8495    else {
8496        /* wrong return value */
8497        PyErr_SetString(PyExc_TypeError,
8498                        "character mapping must return integer, None or str");
8499        Py_DECREF(x);
8500        return -1;
8501    }
8502}
8503
8504/* lookup the character, write the result into the writer.
8505   Return 1 if the result was written into the writer, return 0 if the mapping
8506   was undefined, raise an exception return -1 on error. */
8507static int
8508charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8509                        _PyUnicodeWriter *writer)
8510{
8511    PyObject *item;
8512
8513    if (charmaptranslate_lookup(ch, mapping, &item))
8514        return -1;
8515
8516    if (item == NULL) {
8517        /* not found => default to 1:1 mapping */
8518        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8519            return -1;
8520        }
8521        return 1;
8522    }
8523
8524    if (item == Py_None) {
8525        Py_DECREF(item);
8526        return 0;
8527    }
8528
8529    if (PyLong_Check(item)) {
8530        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8531        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8532           used it */
8533        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8534            Py_DECREF(item);
8535            return -1;
8536        }
8537        Py_DECREF(item);
8538        return 1;
8539    }
8540
8541    if (!PyUnicode_Check(item)) {
8542        Py_DECREF(item);
8543        return -1;
8544    }
8545
8546    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8547        Py_DECREF(item);
8548        return -1;
8549    }
8550
8551    Py_DECREF(item);
8552    return 1;
8553}
8554
8555static int
8556unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8557                              Py_UCS1 *translate)
8558{
8559    PyObject *item = NULL;
8560    int ret = 0;
8561
8562    if (charmaptranslate_lookup(ch, mapping, &item)) {
8563        return -1;
8564    }
8565
8566    if (item == Py_None) {
8567        /* deletion */
8568        translate[ch] = 0xfe;
8569    }
8570    else if (item == NULL) {
8571        /* not found => default to 1:1 mapping */
8572        translate[ch] = ch;
8573        return 1;
8574    }
8575    else if (PyLong_Check(item)) {
8576        long replace = PyLong_AS_LONG(item);
8577        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8578           used it */
8579        if (127 < replace) {
8580            /* invalid character or character outside ASCII:
8581               skip the fast translate */
8582            goto exit;
8583        }
8584        translate[ch] = (Py_UCS1)replace;
8585    }
8586    else if (PyUnicode_Check(item)) {
8587        Py_UCS4 replace;
8588
8589        if (PyUnicode_READY(item) == -1) {
8590            Py_DECREF(item);
8591            return -1;
8592        }
8593        if (PyUnicode_GET_LENGTH(item) != 1)
8594            goto exit;
8595
8596        replace = PyUnicode_READ_CHAR(item, 0);
8597        if (replace > 127)
8598            goto exit;
8599        translate[ch] = (Py_UCS1)replace;
8600    }
8601    else {
8602        /* not None, NULL, long or unicode */
8603        goto exit;
8604    }
8605    ret = 1;
8606
8607  exit:
8608    Py_DECREF(item);
8609    return ret;
8610}
8611
8612/* Fast path for ascii => ascii translation. Return 1 if the whole string
8613   was translated into writer, return 0 if the input string was partially
8614   translated into writer, raise an exception and return -1 on error. */
8615static int
8616unicode_fast_translate(PyObject *input, PyObject *mapping,
8617                       _PyUnicodeWriter *writer, int ignore)
8618{
8619    Py_UCS1 ascii_table[128], ch, ch2;
8620    Py_ssize_t len;
8621    Py_UCS1 *in, *end, *out;
8622    int res = 0;
8623
8624    if (PyUnicode_READY(input) == -1)
8625        return -1;
8626    if (!PyUnicode_IS_ASCII(input))
8627        return 0;
8628    len = PyUnicode_GET_LENGTH(input);
8629
8630    memset(ascii_table, 0xff, 128);
8631
8632    in = PyUnicode_1BYTE_DATA(input);
8633    end = in + len;
8634
8635    assert(PyUnicode_IS_ASCII(writer->buffer));
8636    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8637    out = PyUnicode_1BYTE_DATA(writer->buffer);
8638
8639    for (; in < end; in++) {
8640        ch = *in;
8641        ch2 = ascii_table[ch];
8642        if (ch2 == 0xff) {
8643            int translate = unicode_fast_translate_lookup(mapping, ch,
8644                                                          ascii_table);
8645            if (translate < 0)
8646                return -1;
8647            if (translate == 0)
8648                goto exit;
8649            ch2 = ascii_table[ch];
8650        }
8651        if (ch2 == 0xfe) {
8652            if (ignore)
8653                continue;
8654            goto exit;
8655        }
8656        assert(ch2 < 128);
8657        *out = ch2;
8658        out++;
8659    }
8660    res = 1;
8661
8662exit:
8663    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8664    return res;
8665}
8666
8667PyObject *
8668_PyUnicode_TranslateCharmap(PyObject *input,
8669                            PyObject *mapping,
8670                            const char *errors)
8671{
8672    /* input object */
8673    char *data;
8674    Py_ssize_t size, i;
8675    int kind;
8676    /* output buffer */
8677    _PyUnicodeWriter writer;
8678    /* error handler */
8679    char *reason = "character maps to <undefined>";
8680    PyObject *errorHandler = NULL;
8681    PyObject *exc = NULL;
8682    int ignore;
8683    int res;
8684
8685    if (mapping == NULL) {
8686        PyErr_BadArgument();
8687        return NULL;
8688    }
8689
8690    if (PyUnicode_READY(input) == -1)
8691        return NULL;
8692    data = (char*)PyUnicode_DATA(input);
8693    kind = PyUnicode_KIND(input);
8694    size = PyUnicode_GET_LENGTH(input);
8695
8696    if (size == 0) {
8697        Py_INCREF(input);
8698        return input;
8699    }
8700
8701    /* allocate enough for a simple 1:1 translation without
8702       replacements, if we need more, we'll resize */
8703    _PyUnicodeWriter_Init(&writer);
8704    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8705        goto onError;
8706
8707    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8708
8709    res = unicode_fast_translate(input, mapping, &writer, ignore);
8710    if (res < 0) {
8711        _PyUnicodeWriter_Dealloc(&writer);
8712        return NULL;
8713    }
8714    if (res == 1)
8715        return _PyUnicodeWriter_Finish(&writer);
8716
8717    i = writer.pos;
8718    while (i<size) {
8719        /* try to encode it */
8720        int translate;
8721        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8722        Py_ssize_t newpos;
8723        /* startpos for collecting untranslatable chars */
8724        Py_ssize_t collstart;
8725        Py_ssize_t collend;
8726        Py_UCS4 ch;
8727
8728        ch = PyUnicode_READ(kind, data, i);
8729        translate = charmaptranslate_output(ch, mapping, &writer);
8730        if (translate < 0)
8731            goto onError;
8732
8733        if (translate != 0) {
8734            /* it worked => adjust input pointer */
8735            ++i;
8736            continue;
8737        }
8738
8739        /* untranslatable character */
8740        collstart = i;
8741        collend = i+1;
8742
8743        /* find all untranslatable characters */
8744        while (collend < size) {
8745            PyObject *x;
8746            ch = PyUnicode_READ(kind, data, collend);
8747            if (charmaptranslate_lookup(ch, mapping, &x))
8748                goto onError;
8749            Py_XDECREF(x);
8750            if (x != Py_None)
8751                break;
8752            ++collend;
8753        }
8754
8755        if (ignore) {
8756            i = collend;
8757        }
8758        else {
8759            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8760                                                             reason, input, &exc,
8761                                                             collstart, collend, &newpos);
8762            if (repunicode == NULL)
8763                goto onError;
8764            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8765                Py_DECREF(repunicode);
8766                goto onError;
8767            }
8768            Py_DECREF(repunicode);
8769            i = newpos;
8770        }
8771    }
8772    Py_XDECREF(exc);
8773    Py_XDECREF(errorHandler);
8774    return _PyUnicodeWriter_Finish(&writer);
8775
8776  onError:
8777    _PyUnicodeWriter_Dealloc(&writer);
8778    Py_XDECREF(exc);
8779    Py_XDECREF(errorHandler);
8780    return NULL;
8781}
8782
8783/* Deprecated. Use PyUnicode_Translate instead. */
8784PyObject *
8785PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8786                           Py_ssize_t size,
8787                           PyObject *mapping,
8788                           const char *errors)
8789{
8790    PyObject *result;
8791    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8792    if (!unicode)
8793        return NULL;
8794    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8795    Py_DECREF(unicode);
8796    return result;
8797}
8798
8799PyObject *
8800PyUnicode_Translate(PyObject *str,
8801                    PyObject *mapping,
8802                    const char *errors)
8803{
8804    PyObject *result;
8805
8806    str = PyUnicode_FromObject(str);
8807    if (str == NULL)
8808        return NULL;
8809    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8810    Py_DECREF(str);
8811    return result;
8812}
8813
8814static Py_UCS4
8815fix_decimal_and_space_to_ascii(PyObject *self)
8816{
8817    /* No need to call PyUnicode_READY(self) because this function is only
8818       called as a callback from fixup() which does it already. */
8819    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8820    const int kind = PyUnicode_KIND(self);
8821    void *data = PyUnicode_DATA(self);
8822    Py_UCS4 maxchar = 127, ch, fixed;
8823    int modified = 0;
8824    Py_ssize_t i;
8825
8826    for (i = 0; i < len; ++i) {
8827        ch = PyUnicode_READ(kind, data, i);
8828        fixed = 0;
8829        if (ch > 127) {
8830            if (Py_UNICODE_ISSPACE(ch))
8831                fixed = ' ';
8832            else {
8833                const int decimal = Py_UNICODE_TODECIMAL(ch);
8834                if (decimal >= 0)
8835                    fixed = '0' + decimal;
8836            }
8837            if (fixed != 0) {
8838                modified = 1;
8839                maxchar = Py_MAX(maxchar, fixed);
8840                PyUnicode_WRITE(kind, data, i, fixed);
8841            }
8842            else
8843                maxchar = Py_MAX(maxchar, ch);
8844        }
8845    }
8846
8847    return (modified) ? maxchar : 0;
8848}
8849
8850PyObject *
8851_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8852{
8853    if (!PyUnicode_Check(unicode)) {
8854        PyErr_BadInternalCall();
8855        return NULL;
8856    }
8857    if (PyUnicode_READY(unicode) == -1)
8858        return NULL;
8859    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8860        /* If the string is already ASCII, just return the same string */
8861        Py_INCREF(unicode);
8862        return unicode;
8863    }
8864    return fixup(unicode, fix_decimal_and_space_to_ascii);
8865}
8866
8867PyObject *
8868PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8869                                  Py_ssize_t length)
8870{
8871    PyObject *decimal;
8872    Py_ssize_t i;
8873    Py_UCS4 maxchar;
8874    enum PyUnicode_Kind kind;
8875    void *data;
8876
8877    maxchar = 127;
8878    for (i = 0; i < length; i++) {
8879        Py_UCS4 ch = s[i];
8880        if (ch > 127) {
8881            int decimal = Py_UNICODE_TODECIMAL(ch);
8882            if (decimal >= 0)
8883                ch = '0' + decimal;
8884            maxchar = Py_MAX(maxchar, ch);
8885        }
8886    }
8887
8888    /* Copy to a new string */
8889    decimal = PyUnicode_New(length, maxchar);
8890    if (decimal == NULL)
8891        return decimal;
8892    kind = PyUnicode_KIND(decimal);
8893    data = PyUnicode_DATA(decimal);
8894    /* Iterate over code points */
8895    for (i = 0; i < length; i++) {
8896        Py_UCS4 ch = s[i];
8897        if (ch > 127) {
8898            int decimal = Py_UNICODE_TODECIMAL(ch);
8899            if (decimal >= 0)
8900                ch = '0' + decimal;
8901        }
8902        PyUnicode_WRITE(kind, data, i, ch);
8903    }
8904    return unicode_result(decimal);
8905}
8906/* --- Decimal Encoder ---------------------------------------------------- */
8907
8908int
8909PyUnicode_EncodeDecimal(Py_UNICODE *s,
8910                        Py_ssize_t length,
8911                        char *output,
8912                        const char *errors)
8913{
8914    PyObject *unicode;
8915    Py_ssize_t i;
8916    enum PyUnicode_Kind kind;
8917    void *data;
8918
8919    if (output == NULL) {
8920        PyErr_BadArgument();
8921        return -1;
8922    }
8923
8924    unicode = PyUnicode_FromUnicode(s, length);
8925    if (unicode == NULL)
8926        return -1;
8927
8928    if (PyUnicode_READY(unicode) == -1) {
8929        Py_DECREF(unicode);
8930        return -1;
8931    }
8932    kind = PyUnicode_KIND(unicode);
8933    data = PyUnicode_DATA(unicode);
8934
8935    for (i=0; i < length; ) {
8936        PyObject *exc;
8937        Py_UCS4 ch;
8938        int decimal;
8939        Py_ssize_t startpos;
8940
8941        ch = PyUnicode_READ(kind, data, i);
8942
8943        if (Py_UNICODE_ISSPACE(ch)) {
8944            *output++ = ' ';
8945            i++;
8946            continue;
8947        }
8948        decimal = Py_UNICODE_TODECIMAL(ch);
8949        if (decimal >= 0) {
8950            *output++ = '0' + decimal;
8951            i++;
8952            continue;
8953        }
8954        if (0 < ch && ch < 256) {
8955            *output++ = (char)ch;
8956            i++;
8957            continue;
8958        }
8959
8960        startpos = i;
8961        exc = NULL;
8962        raise_encode_exception(&exc, "decimal", unicode,
8963                               startpos, startpos+1,
8964                               "invalid decimal Unicode string");
8965        Py_XDECREF(exc);
8966        Py_DECREF(unicode);
8967        return -1;
8968    }
8969    /* 0-terminate the output string */
8970    *output++ = '\0';
8971    Py_DECREF(unicode);
8972    return 0;
8973}
8974
8975/* --- Helpers ------------------------------------------------------------ */
8976
/* Helper macro to fix up start/end slice values against a length:
     - `end` greater than `len` is clipped to `len`; a negative `end` is
       taken relative to `len` and clipped to 0;
     - a negative `start` is taken relative to `len` and clipped to 0.
   `start` and `end` must be modifiable lvalues; `len` is evaluated several
   times, so it must not have side effects.

   The body is wrapped in do { ... } while (0) so the macro behaves as a
   single statement (safe under an unbraced if/else), and every argument is
   parenthesized.  Use it as a statement: ADJUST_INDICES(start, end, len); */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8991
8992static Py_ssize_t
8993any_find_slice(int direction, PyObject* s1, PyObject* s2,
8994               Py_ssize_t start,
8995               Py_ssize_t end)
8996{
8997    int kind1, kind2;
8998    void *buf1, *buf2;
8999    Py_ssize_t len1, len2, result;
9000
9001    kind1 = PyUnicode_KIND(s1);
9002    kind2 = PyUnicode_KIND(s2);
9003    if (kind1 < kind2)
9004        return -1;
9005
9006    len1 = PyUnicode_GET_LENGTH(s1);
9007    len2 = PyUnicode_GET_LENGTH(s2);
9008    ADJUST_INDICES(start, end, len1);
9009    if (end - start < len2)
9010        return -1;
9011
9012    buf1 = PyUnicode_DATA(s1);
9013    buf2 = PyUnicode_DATA(s2);
9014    if (len2 == 1) {
9015        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9016        result = findchar((const char *)buf1 + kind1*start,
9017                          kind1, end - start, ch, direction);
9018        if (result == -1)
9019            return -1;
9020        else
9021            return start + result;
9022    }
9023
9024    if (kind2 != kind1) {
9025        buf2 = _PyUnicode_AsKind(s2, kind1);
9026        if (!buf2)
9027            return -2;
9028    }
9029
9030    if (direction > 0) {
9031        switch (kind1) {
9032        case PyUnicode_1BYTE_KIND:
9033            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9034                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9035            else
9036                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9037            break;
9038        case PyUnicode_2BYTE_KIND:
9039            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9040            break;
9041        case PyUnicode_4BYTE_KIND:
9042            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9043            break;
9044        default:
9045            assert(0); result = -2;
9046        }
9047    }
9048    else {
9049        switch (kind1) {
9050        case PyUnicode_1BYTE_KIND:
9051            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9052                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9053            else
9054                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055            break;
9056        case PyUnicode_2BYTE_KIND:
9057            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9058            break;
9059        case PyUnicode_4BYTE_KIND:
9060            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9061            break;
9062        default:
9063            assert(0); result = -2;
9064        }
9065    }
9066
9067    if (kind2 != kind1)
9068        PyMem_Free(buf2);
9069
9070    return result;
9071}
9072
/* Dispatch to the stringlib *_InsertThousandsGrouping() implementation that
   matches the kind of `unicode`, and return its result.

   unicode        target string, or NULL.  With NULL the 1-byte
                  implementation is called with a NULL output buffer and
                  *maxchar is filled in (presumably a "measure only" mode of
                  the stringlib helper -- confirm against stringlib).
   index          offset, in characters, of the output area inside `unicode`.
   n_buffer, digits, n_digits, min_width, grouping
                  passed through unchanged to the stringlib helper.
   thousands_sep  separator string; converted to the kind of `unicode` when
                  it is narrower.
   maxchar        only written when `unicode` is NULL: 127, raised to the
                  maximum character of `thousands_sep` if the helper's result
                  differs from n_digits.

   Returns the helper's result, or -1 if a kind conversion fails or the kind
   is unknown. */
Py_ssize_t
_PyUnicode_InsertThousandsGrouping(
    PyObject *unicode, Py_ssize_t index,
    Py_ssize_t n_buffer,
    void *digits, Py_ssize_t n_digits,
    Py_ssize_t min_width,
    const char *grouping, PyObject *thousands_sep,
    Py_UCS4 *maxchar)
{
    unsigned int kind, thousands_sep_kind;
    char *data, *thousands_sep_data;
    Py_ssize_t thousands_sep_len;
    Py_ssize_t len;

    if (unicode != NULL) {
        kind = PyUnicode_KIND(unicode);
        /* kind doubles as the character size in bytes */
        data = (char *) PyUnicode_DATA(unicode) + index * kind;
    }
    else {
        kind = PyUnicode_1BYTE_KIND;
        data = NULL;
    }
    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
    thousands_sep_data = PyUnicode_DATA(thousands_sep);
    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind) {
            /* widen the separator to the kind of the target; the temporary
               buffer is released after the switch below */
            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
            if (!thousands_sep_data)
                return -1;
        }
        else {
            /* NOTE(review): here `data` is replaced by a buffer obtained from
               _PyUnicode_AsKind() (without the `index` offset), the switch
               below still dispatches on the original `kind`, and the buffer
               is freed afterwards -- so the result does not appear to reach
               `unicode`.  Presumably callers always allocate `unicode` wide
               enough for the separator so this branch is never taken;
               confirm before relying on it. */
            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
            if (!data)
                return -1;
        }
    }

    switch (kind) {
    case PyUnicode_1BYTE_KIND:
        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
            len = asciilib_InsertThousandsGrouping(
                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        else
            len = ucs1lib_InsertThousandsGrouping(
                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
                min_width, grouping,
                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_2BYTE_KIND:
        len = ucs2lib_InsertThousandsGrouping(
            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
        break;
    case PyUnicode_4BYTE_KIND:
        len = ucs4lib_InsertThousandsGrouping(
            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
            min_width, grouping,
            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
        break;
    default:
        /* NOTE(review): returns without releasing a temporary buffer that
           may have been allocated above; only reachable with an invalid
           kind. */
        assert(0);
        return -1;
    }
    /* release whichever temporary buffer was created for the kind mismatch */
    if (unicode != NULL && thousands_sep_kind != kind) {
        if (thousands_sep_kind < kind)
            PyMem_Free(thousands_sep_data);
        else
            PyMem_Free(data);
    }
    if (unicode == NULL) {
        *maxchar = 127;
        /* a result different from n_digits is taken to mean that at least
           one separator is part of the output */
        if (len != n_digits) {
            *maxchar = Py_MAX(*maxchar,
                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
        }
    }
    return len;
}
9155
9156
9157Py_ssize_t
9158PyUnicode_Count(PyObject *str,
9159                PyObject *substr,
9160                Py_ssize_t start,
9161                Py_ssize_t end)
9162{
9163    Py_ssize_t result;
9164    PyObject* str_obj;
9165    PyObject* sub_obj;
9166    int kind1, kind2;
9167    void *buf1 = NULL, *buf2 = NULL;
9168    Py_ssize_t len1, len2;
9169
9170    str_obj = PyUnicode_FromObject(str);
9171    if (!str_obj)
9172        return -1;
9173    sub_obj = PyUnicode_FromObject(substr);
9174    if (!sub_obj) {
9175        Py_DECREF(str_obj);
9176        return -1;
9177    }
9178    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9179        Py_DECREF(sub_obj);
9180        Py_DECREF(str_obj);
9181        return -1;
9182    }
9183
9184    kind1 = PyUnicode_KIND(str_obj);
9185    kind2 = PyUnicode_KIND(sub_obj);
9186    if (kind1 < kind2) {
9187        Py_DECREF(sub_obj);
9188        Py_DECREF(str_obj);
9189        return 0;
9190    }
9191
9192    len1 = PyUnicode_GET_LENGTH(str_obj);
9193    len2 = PyUnicode_GET_LENGTH(sub_obj);
9194    ADJUST_INDICES(start, end, len1);
9195    if (end - start < len2) {
9196        Py_DECREF(sub_obj);
9197        Py_DECREF(str_obj);
9198        return 0;
9199    }
9200
9201    buf1 = PyUnicode_DATA(str_obj);
9202    buf2 = PyUnicode_DATA(sub_obj);
9203    if (kind2 != kind1) {
9204        buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9205        if (!buf2)
9206            goto onError;
9207    }
9208
9209    switch (kind1) {
9210    case PyUnicode_1BYTE_KIND:
9211        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9212            result = asciilib_count(
9213                ((Py_UCS1*)buf1) + start, end - start,
9214                buf2, len2, PY_SSIZE_T_MAX
9215                );
9216        else
9217            result = ucs1lib_count(
9218                ((Py_UCS1*)buf1) + start, end - start,
9219                buf2, len2, PY_SSIZE_T_MAX
9220                );
9221        break;
9222    case PyUnicode_2BYTE_KIND:
9223        result = ucs2lib_count(
9224            ((Py_UCS2*)buf1) + start, end - start,
9225            buf2, len2, PY_SSIZE_T_MAX
9226            );
9227        break;
9228    case PyUnicode_4BYTE_KIND:
9229        result = ucs4lib_count(
9230            ((Py_UCS4*)buf1) + start, end - start,
9231            buf2, len2, PY_SSIZE_T_MAX
9232            );
9233        break;
9234    default:
9235        assert(0); result = 0;
9236    }
9237
9238    Py_DECREF(sub_obj);
9239    Py_DECREF(str_obj);
9240
9241    if (kind2 != kind1)
9242        PyMem_Free(buf2);
9243
9244    return result;
9245  onError:
9246    Py_DECREF(sub_obj);
9247    Py_DECREF(str_obj);
9248    if (kind2 != kind1 && buf2)
9249        PyMem_Free(buf2);
9250    return -1;
9251}
9252
9253Py_ssize_t
9254PyUnicode_Find(PyObject *str,
9255               PyObject *sub,
9256               Py_ssize_t start,
9257               Py_ssize_t end,
9258               int direction)
9259{
9260    Py_ssize_t result;
9261
9262    str = PyUnicode_FromObject(str);
9263    if (!str)
9264        return -2;
9265    sub = PyUnicode_FromObject(sub);
9266    if (!sub) {
9267        Py_DECREF(str);
9268        return -2;
9269    }
9270    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9271        Py_DECREF(sub);
9272        Py_DECREF(str);
9273        return -2;
9274    }
9275
9276    result = any_find_slice(direction,
9277        str, sub, start, end
9278        );
9279
9280    Py_DECREF(str);
9281    Py_DECREF(sub);
9282
9283    return result;
9284}
9285
9286Py_ssize_t
9287PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9288                   Py_ssize_t start, Py_ssize_t end,
9289                   int direction)
9290{
9291    int kind;
9292    Py_ssize_t result;
9293    if (PyUnicode_READY(str) == -1)
9294        return -2;
9295    if (start < 0 || end < 0) {
9296        PyErr_SetString(PyExc_IndexError, "string index out of range");
9297        return -2;
9298    }
9299    if (end > PyUnicode_GET_LENGTH(str))
9300        end = PyUnicode_GET_LENGTH(str);
9301    if (start >= end)
9302        return -1;
9303    kind = PyUnicode_KIND(str);
9304    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9305                      kind, end-start, ch, direction);
9306    if (result == -1)
9307        return -1;
9308    else
9309        return start + result;
9310}
9311
9312static int
9313tailmatch(PyObject *self,
9314          PyObject *substring,
9315          Py_ssize_t start,
9316          Py_ssize_t end,
9317          int direction)
9318{
9319    int kind_self;
9320    int kind_sub;
9321    void *data_self;
9322    void *data_sub;
9323    Py_ssize_t offset;
9324    Py_ssize_t i;
9325    Py_ssize_t end_sub;
9326
9327    if (PyUnicode_READY(self) == -1 ||
9328        PyUnicode_READY(substring) == -1)
9329        return -1;
9330
9331    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9332    end -= PyUnicode_GET_LENGTH(substring);
9333    if (end < start)
9334        return 0;
9335
9336    if (PyUnicode_GET_LENGTH(substring) == 0)
9337        return 1;
9338
9339    kind_self = PyUnicode_KIND(self);
9340    data_self = PyUnicode_DATA(self);
9341    kind_sub = PyUnicode_KIND(substring);
9342    data_sub = PyUnicode_DATA(substring);
9343    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9344
9345    if (direction > 0)
9346        offset = end;
9347    else
9348        offset = start;
9349
9350    if (PyUnicode_READ(kind_self, data_self, offset) ==
9351        PyUnicode_READ(kind_sub, data_sub, 0) &&
9352        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9353        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9354        /* If both are of the same kind, memcmp is sufficient */
9355        if (kind_self == kind_sub) {
9356            return ! memcmp((char *)data_self +
9357                                (offset * PyUnicode_KIND(substring)),
9358                            data_sub,
9359                            PyUnicode_GET_LENGTH(substring) *
9360                                PyUnicode_KIND(substring));
9361        }
9362        /* otherwise we have to compare each character by first accesing it */
9363        else {
9364            /* We do not need to compare 0 and len(substring)-1 because
9365               the if statement above ensured already that they are equal
9366               when we end up here. */
9367            for (i = 1; i < end_sub; ++i) {
9368                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9369                    PyUnicode_READ(kind_sub, data_sub, i))
9370                    return 0;
9371            }
9372            return 1;
9373        }
9374    }
9375
9376    return 0;
9377}
9378
9379Py_ssize_t
9380PyUnicode_Tailmatch(PyObject *str,
9381                    PyObject *substr,
9382                    Py_ssize_t start,
9383                    Py_ssize_t end,
9384                    int direction)
9385{
9386    Py_ssize_t result;
9387
9388    str = PyUnicode_FromObject(str);
9389    if (str == NULL)
9390        return -1;
9391    substr = PyUnicode_FromObject(substr);
9392    if (substr == NULL) {
9393        Py_DECREF(str);
9394        return -1;
9395    }
9396
9397    result = tailmatch(str, substr,
9398                       start, end, direction);
9399    Py_DECREF(str);
9400    Py_DECREF(substr);
9401    return result;
9402}
9403
9404/* Apply fixfct filter to the Unicode object self and return a
9405   reference to the modified object */
9406
9407static PyObject *
9408fixup(PyObject *self,
9409      Py_UCS4 (*fixfct)(PyObject *s))
9410{
9411    PyObject *u;
9412    Py_UCS4 maxchar_old, maxchar_new = 0;
9413    PyObject *v;
9414
9415    u = _PyUnicode_Copy(self);
9416    if (u == NULL)
9417        return NULL;
9418    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9419
9420    /* fix functions return the new maximum character in a string,
9421       if the kind of the resulting unicode object does not change,
9422       everything is fine.  Otherwise we need to change the string kind
9423       and re-run the fix function. */
9424    maxchar_new = fixfct(u);
9425
9426    if (maxchar_new == 0) {
9427        /* no changes */;
9428        if (PyUnicode_CheckExact(self)) {
9429            Py_DECREF(u);
9430            Py_INCREF(self);
9431            return self;
9432        }
9433        else
9434            return u;
9435    }
9436
9437    maxchar_new = align_maxchar(maxchar_new);
9438
9439    if (maxchar_new == maxchar_old)
9440        return u;
9441
9442    /* In case the maximum character changed, we need to
9443       convert the string to the new category. */
9444    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9445    if (v == NULL) {
9446        Py_DECREF(u);
9447        return NULL;
9448    }
9449    if (maxchar_new > maxchar_old) {
9450        /* If the maxchar increased so that the kind changed, not all
9451           characters are representable anymore and we need to fix the
9452           string again. This only happens in very few cases. */
9453        _PyUnicode_FastCopyCharacters(v, 0,
9454                                      self, 0, PyUnicode_GET_LENGTH(self));
9455        maxchar_old = fixfct(v);
9456        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9457    }
9458    else {
9459        _PyUnicode_FastCopyCharacters(v, 0,
9460                                      u, 0, PyUnicode_GET_LENGTH(self));
9461    }
9462    Py_DECREF(u);
9463    assert(_PyUnicode_CheckConsistency(v, 1));
9464    return v;
9465}
9466
9467static PyObject *
9468ascii_upper_or_lower(PyObject *self, int lower)
9469{
9470    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9471    char *resdata, *data = PyUnicode_DATA(self);
9472    PyObject *res;
9473
9474    res = PyUnicode_New(len, 127);
9475    if (res == NULL)
9476        return NULL;
9477    resdata = PyUnicode_DATA(res);
9478    if (lower)
9479        _Py_bytes_lower(resdata, data, len);
9480    else
9481        _Py_bytes_upper(resdata, data, len);
9482    return res;
9483}
9484
9485static Py_UCS4
9486handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9487{
9488    Py_ssize_t j;
9489    int final_sigma;
9490    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9491    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9492
9493     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9494
9495    where ! is a negation and \p{xxx} is a character with property xxx.
9496    */
9497    for (j = i - 1; j >= 0; j--) {
9498        c = PyUnicode_READ(kind, data, j);
9499        if (!_PyUnicode_IsCaseIgnorable(c))
9500            break;
9501    }
9502    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9503    if (final_sigma) {
9504        for (j = i + 1; j < length; j++) {
9505            c = PyUnicode_READ(kind, data, j);
9506            if (!_PyUnicode_IsCaseIgnorable(c))
9507                break;
9508        }
9509        final_sigma = j == length || !_PyUnicode_IsCased(c);
9510    }
9511    return (final_sigma) ? 0x3C2 : 0x3C3;
9512}
9513
9514static int
9515lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9516           Py_UCS4 c, Py_UCS4 *mapped)
9517{
9518    /* Obscure special case. */
9519    if (c == 0x3A3) {
9520        mapped[0] = handle_capital_sigma(kind, data, length, i);
9521        return 1;
9522    }
9523    return _PyUnicode_ToLowerFull(c, mapped);
9524}
9525
9526static Py_ssize_t
9527do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9528{
9529    Py_ssize_t i, k = 0;
9530    int n_res, j;
9531    Py_UCS4 c, mapped[3];
9532
9533    c = PyUnicode_READ(kind, data, 0);
9534    n_res = _PyUnicode_ToUpperFull(c, mapped);
9535    for (j = 0; j < n_res; j++) {
9536        *maxchar = Py_MAX(*maxchar, mapped[j]);
9537        res[k++] = mapped[j];
9538    }
9539    for (i = 1; i < length; i++) {
9540        c = PyUnicode_READ(kind, data, i);
9541        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9542        for (j = 0; j < n_res; j++) {
9543            *maxchar = Py_MAX(*maxchar, mapped[j]);
9544            res[k++] = mapped[j];
9545        }
9546    }
9547    return k;
9548}
9549
9550static Py_ssize_t
9551do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9552    Py_ssize_t i, k = 0;
9553
9554    for (i = 0; i < length; i++) {
9555        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9556        int n_res, j;
9557        if (Py_UNICODE_ISUPPER(c)) {
9558            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9559        }
9560        else if (Py_UNICODE_ISLOWER(c)) {
9561            n_res = _PyUnicode_ToUpperFull(c, mapped);
9562        }
9563        else {
9564            n_res = 1;
9565            mapped[0] = c;
9566        }
9567        for (j = 0; j < n_res; j++) {
9568            *maxchar = Py_MAX(*maxchar, mapped[j]);
9569            res[k++] = mapped[j];
9570        }
9571    }
9572    return k;
9573}
9574
9575static Py_ssize_t
9576do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9577                  Py_UCS4 *maxchar, int lower)
9578{
9579    Py_ssize_t i, k = 0;
9580
9581    for (i = 0; i < length; i++) {
9582        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9583        int n_res, j;
9584        if (lower)
9585            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9586        else
9587            n_res = _PyUnicode_ToUpperFull(c, mapped);
9588        for (j = 0; j < n_res; j++) {
9589            *maxchar = Py_MAX(*maxchar, mapped[j]);
9590            res[k++] = mapped[j];
9591        }
9592    }
9593    return k;
9594}
9595
9596static Py_ssize_t
9597do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9598{
9599    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9600}
9601
9602static Py_ssize_t
9603do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9604{
9605    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9606}
9607
9608static Py_ssize_t
9609do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9610{
9611    Py_ssize_t i, k = 0;
9612
9613    for (i = 0; i < length; i++) {
9614        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9615        Py_UCS4 mapped[3];
9616        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9617        for (j = 0; j < n_res; j++) {
9618            *maxchar = Py_MAX(*maxchar, mapped[j]);
9619            res[k++] = mapped[j];
9620        }
9621    }
9622    return k;
9623}
9624
9625static Py_ssize_t
9626do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9627{
9628    Py_ssize_t i, k = 0;
9629    int previous_is_cased;
9630
9631    previous_is_cased = 0;
9632    for (i = 0; i < length; i++) {
9633        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9634        Py_UCS4 mapped[3];
9635        int n_res, j;
9636
9637        if (previous_is_cased)
9638            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9639        else
9640            n_res = _PyUnicode_ToTitleFull(c, mapped);
9641
9642        for (j = 0; j < n_res; j++) {
9643            *maxchar = Py_MAX(*maxchar, mapped[j]);
9644            res[k++] = mapped[j];
9645        }
9646
9647        previous_is_cased = _PyUnicode_IsCased(c);
9648    }
9649    return k;
9650}
9651
9652static PyObject *
9653case_operation(PyObject *self,
9654               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9655{
9656    PyObject *res = NULL;
9657    Py_ssize_t length, newlength = 0;
9658    int kind, outkind;
9659    void *data, *outdata;
9660    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9661
9662    assert(PyUnicode_IS_READY(self));
9663
9664    kind = PyUnicode_KIND(self);
9665    data = PyUnicode_DATA(self);
9666    length = PyUnicode_GET_LENGTH(self);
9667    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9668        PyErr_SetString(PyExc_OverflowError, "string is too long");
9669        return NULL;
9670    }
9671    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9672    if (tmp == NULL)
9673        return PyErr_NoMemory();
9674    newlength = perform(kind, data, length, tmp, &maxchar);
9675    res = PyUnicode_New(newlength, maxchar);
9676    if (res == NULL)
9677        goto leave;
9678    tmpend = tmp + newlength;
9679    outdata = PyUnicode_DATA(res);
9680    outkind = PyUnicode_KIND(res);
9681    switch (outkind) {
9682    case PyUnicode_1BYTE_KIND:
9683        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9684        break;
9685    case PyUnicode_2BYTE_KIND:
9686        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9687        break;
9688    case PyUnicode_4BYTE_KIND:
9689        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9690        break;
9691    default:
9692        assert(0);
9693        break;
9694    }
9695  leave:
9696    PyMem_FREE(tmp);
9697    return res;
9698}
9699
9700PyObject *
9701PyUnicode_Join(PyObject *separator, PyObject *seq)
9702{
9703    PyObject *sep = NULL;
9704    Py_ssize_t seplen;
9705    PyObject *res = NULL; /* the result */
9706    PyObject *fseq;          /* PySequence_Fast(seq) */
9707    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9708    PyObject **items;
9709    PyObject *item;
9710    Py_ssize_t sz, i, res_offset;
9711    Py_UCS4 maxchar;
9712    Py_UCS4 item_maxchar;
9713    int use_memcpy;
9714    unsigned char *res_data = NULL, *sep_data = NULL;
9715    PyObject *last_obj;
9716    unsigned int kind = 0;
9717
9718    fseq = PySequence_Fast(seq, "can only join an iterable");
9719    if (fseq == NULL) {
9720        return NULL;
9721    }
9722
9723    /* NOTE: the following code can't call back into Python code,
9724     * so we are sure that fseq won't be mutated.
9725     */
9726
9727    seqlen = PySequence_Fast_GET_SIZE(fseq);
9728    /* If empty sequence, return u"". */
9729    if (seqlen == 0) {
9730        Py_DECREF(fseq);
9731        _Py_RETURN_UNICODE_EMPTY();
9732    }
9733
9734    /* If singleton sequence with an exact Unicode, return that. */
9735    last_obj = NULL;
9736    items = PySequence_Fast_ITEMS(fseq);
9737    if (seqlen == 1) {
9738        if (PyUnicode_CheckExact(items[0])) {
9739            res = items[0];
9740            Py_INCREF(res);
9741            Py_DECREF(fseq);
9742            return res;
9743        }
9744        seplen = 0;
9745        maxchar = 0;
9746    }
9747    else {
9748        /* Set up sep and seplen */
9749        if (separator == NULL) {
9750            /* fall back to a blank space separator */
9751            sep = PyUnicode_FromOrdinal(' ');
9752            if (!sep)
9753                goto onError;
9754            seplen = 1;
9755            maxchar = 32;
9756        }
9757        else {
9758            if (!PyUnicode_Check(separator)) {
9759                PyErr_Format(PyExc_TypeError,
9760                             "separator: expected str instance,"
9761                             " %.80s found",
9762                             Py_TYPE(separator)->tp_name);
9763                goto onError;
9764            }
9765            if (PyUnicode_READY(separator))
9766                goto onError;
9767            sep = separator;
9768            seplen = PyUnicode_GET_LENGTH(separator);
9769            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9770            /* inc refcount to keep this code path symmetric with the
9771               above case of a blank separator */
9772            Py_INCREF(sep);
9773        }
9774        last_obj = sep;
9775    }
9776
9777    /* There are at least two things to join, or else we have a subclass
9778     * of str in the sequence.
9779     * Do a pre-pass to figure out the total amount of space we'll
9780     * need (sz), and see whether all argument are strings.
9781     */
9782    sz = 0;
9783#ifdef Py_DEBUG
9784    use_memcpy = 0;
9785#else
9786    use_memcpy = 1;
9787#endif
9788    for (i = 0; i < seqlen; i++) {
9789        const Py_ssize_t old_sz = sz;
9790        item = items[i];
9791        if (!PyUnicode_Check(item)) {
9792            PyErr_Format(PyExc_TypeError,
9793                         "sequence item %zd: expected str instance,"
9794                         " %.80s found",
9795                         i, Py_TYPE(item)->tp_name);
9796            goto onError;
9797        }
9798        if (PyUnicode_READY(item) == -1)
9799            goto onError;
9800        sz += PyUnicode_GET_LENGTH(item);
9801        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9802        maxchar = Py_MAX(maxchar, item_maxchar);
9803        if (i != 0)
9804            sz += seplen;
9805        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9806            PyErr_SetString(PyExc_OverflowError,
9807                            "join() result is too long for a Python string");
9808            goto onError;
9809        }
9810        if (use_memcpy && last_obj != NULL) {
9811            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9812                use_memcpy = 0;
9813        }
9814        last_obj = item;
9815    }
9816
9817    res = PyUnicode_New(sz, maxchar);
9818    if (res == NULL)
9819        goto onError;
9820
9821    /* Catenate everything. */
9822#ifdef Py_DEBUG
9823    use_memcpy = 0;
9824#else
9825    if (use_memcpy) {
9826        res_data = PyUnicode_1BYTE_DATA(res);
9827        kind = PyUnicode_KIND(res);
9828        if (seplen != 0)
9829            sep_data = PyUnicode_1BYTE_DATA(sep);
9830    }
9831#endif
9832    if (use_memcpy) {
9833        for (i = 0; i < seqlen; ++i) {
9834            Py_ssize_t itemlen;
9835            item = items[i];
9836
9837            /* Copy item, and maybe the separator. */
9838            if (i && seplen != 0) {
9839                Py_MEMCPY(res_data,
9840                          sep_data,
9841                          kind * seplen);
9842                res_data += kind * seplen;
9843            }
9844
9845            itemlen = PyUnicode_GET_LENGTH(item);
9846            if (itemlen != 0) {
9847                Py_MEMCPY(res_data,
9848                          PyUnicode_DATA(item),
9849                          kind * itemlen);
9850                res_data += kind * itemlen;
9851            }
9852        }
9853        assert(res_data == PyUnicode_1BYTE_DATA(res)
9854                           + kind * PyUnicode_GET_LENGTH(res));
9855    }
9856    else {
9857        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9858            Py_ssize_t itemlen;
9859            item = items[i];
9860
9861            /* Copy item, and maybe the separator. */
9862            if (i && seplen != 0) {
9863                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9864                res_offset += seplen;
9865            }
9866
9867            itemlen = PyUnicode_GET_LENGTH(item);
9868            if (itemlen != 0) {
9869                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9870                res_offset += itemlen;
9871            }
9872        }
9873        assert(res_offset == PyUnicode_GET_LENGTH(res));
9874    }
9875
9876    Py_DECREF(fseq);
9877    Py_XDECREF(sep);
9878    assert(_PyUnicode_CheckConsistency(res, 1));
9879    return res;
9880
9881  onError:
9882    Py_DECREF(fseq);
9883    Py_XDECREF(sep);
9884    Py_XDECREF(res);
9885    return NULL;
9886}
9887
/* FILL(kind, data, value, start, length): store `length` copies of
   `value` into the character buffer `data`, whose element width is given
   by `kind`, beginning at character index `start`.

   `kind` must be one of the 1-, 2- or 4-byte kinds (never
   PyUnicode_WCHAR_KIND).  `value` is truncated to the element width of
   the buffer.  `start`, `length` and `value` are each evaluated exactly
   once, so arguments with side effects are safe; `data` is evaluated once
   in the selected branch. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_ssize_t fill_start_ = (start); \
        const Py_ssize_t fill_length_ = (length); \
        const Py_UCS4 fill_value_ = (value); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + fill_start_; \
            memset(to_, (unsigned char)fill_value_, fill_length_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + fill_start_; \
            for (; i_ < fill_length_; ++i_, ++to_) *to_ = fill_value_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9911
9912void
9913_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9914                    Py_UCS4 fill_char)
9915{
9916    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9917    const void *data = PyUnicode_DATA(unicode);
9918    assert(PyUnicode_IS_READY(unicode));
9919    assert(unicode_modifiable(unicode));
9920    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9921    assert(start >= 0);
9922    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9923    FILL(kind, data, fill_char, start, length);
9924}
9925
9926Py_ssize_t
9927PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9928               Py_UCS4 fill_char)
9929{
9930    Py_ssize_t maxlen;
9931
9932    if (!PyUnicode_Check(unicode)) {
9933        PyErr_BadInternalCall();
9934        return -1;
9935    }
9936    if (PyUnicode_READY(unicode) == -1)
9937        return -1;
9938    if (unicode_check_modifiable(unicode))
9939        return -1;
9940
9941    if (start < 0) {
9942        PyErr_SetString(PyExc_IndexError, "string index out of range");
9943        return -1;
9944    }
9945    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9946        PyErr_SetString(PyExc_ValueError,
9947                         "fill character is bigger than "
9948                         "the string maximum character");
9949        return -1;
9950    }
9951
9952    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9953    length = Py_MIN(maxlen, length);
9954    if (length <= 0)
9955        return 0;
9956
9957    _PyUnicode_FastFill(unicode, start, length, fill_char);
9958    return length;
9959}
9960
9961static PyObject *
9962pad(PyObject *self,
9963    Py_ssize_t left,
9964    Py_ssize_t right,
9965    Py_UCS4 fill)
9966{
9967    PyObject *u;
9968    Py_UCS4 maxchar;
9969    int kind;
9970    void *data;
9971
9972    if (left < 0)
9973        left = 0;
9974    if (right < 0)
9975        right = 0;
9976
9977    if (left == 0 && right == 0)
9978        return unicode_result_unchanged(self);
9979
9980    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9981        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9982        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9983        return NULL;
9984    }
9985    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9986    maxchar = Py_MAX(maxchar, fill);
9987    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9988    if (!u)
9989        return NULL;
9990
9991    kind = PyUnicode_KIND(u);
9992    data = PyUnicode_DATA(u);
9993    if (left)
9994        FILL(kind, data, fill, 0, left);
9995    if (right)
9996        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9997    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9998    assert(_PyUnicode_CheckConsistency(u, 1));
9999    return u;
10000}
10001
10002PyObject *
10003PyUnicode_Splitlines(PyObject *string, int keepends)
10004{
10005    PyObject *list;
10006
10007    string = PyUnicode_FromObject(string);
10008    if (string == NULL)
10009        return NULL;
10010    if (PyUnicode_READY(string) == -1) {
10011        Py_DECREF(string);
10012        return NULL;
10013    }
10014
10015    switch (PyUnicode_KIND(string)) {
10016    case PyUnicode_1BYTE_KIND:
10017        if (PyUnicode_IS_ASCII(string))
10018            list = asciilib_splitlines(
10019                string, PyUnicode_1BYTE_DATA(string),
10020                PyUnicode_GET_LENGTH(string), keepends);
10021        else
10022            list = ucs1lib_splitlines(
10023                string, PyUnicode_1BYTE_DATA(string),
10024                PyUnicode_GET_LENGTH(string), keepends);
10025        break;
10026    case PyUnicode_2BYTE_KIND:
10027        list = ucs2lib_splitlines(
10028            string, PyUnicode_2BYTE_DATA(string),
10029            PyUnicode_GET_LENGTH(string), keepends);
10030        break;
10031    case PyUnicode_4BYTE_KIND:
10032        list = ucs4lib_splitlines(
10033            string, PyUnicode_4BYTE_DATA(string),
10034            PyUnicode_GET_LENGTH(string), keepends);
10035        break;
10036    default:
10037        assert(0);
10038        list = 0;
10039    }
10040    Py_DECREF(string);
10041    return list;
10042}
10043
10044static PyObject *
10045split(PyObject *self,
10046      PyObject *substring,
10047      Py_ssize_t maxcount)
10048{
10049    int kind1, kind2;
10050    void *buf1, *buf2;
10051    Py_ssize_t len1, len2;
10052    PyObject* out;
10053
10054    if (maxcount < 0)
10055        maxcount = PY_SSIZE_T_MAX;
10056
10057    if (PyUnicode_READY(self) == -1)
10058        return NULL;
10059
10060    if (substring == NULL)
10061        switch (PyUnicode_KIND(self)) {
10062        case PyUnicode_1BYTE_KIND:
10063            if (PyUnicode_IS_ASCII(self))
10064                return asciilib_split_whitespace(
10065                    self,  PyUnicode_1BYTE_DATA(self),
10066                    PyUnicode_GET_LENGTH(self), maxcount
10067                    );
10068            else
10069                return ucs1lib_split_whitespace(
10070                    self,  PyUnicode_1BYTE_DATA(self),
10071                    PyUnicode_GET_LENGTH(self), maxcount
10072                    );
10073        case PyUnicode_2BYTE_KIND:
10074            return ucs2lib_split_whitespace(
10075                self,  PyUnicode_2BYTE_DATA(self),
10076                PyUnicode_GET_LENGTH(self), maxcount
10077                );
10078        case PyUnicode_4BYTE_KIND:
10079            return ucs4lib_split_whitespace(
10080                self,  PyUnicode_4BYTE_DATA(self),
10081                PyUnicode_GET_LENGTH(self), maxcount
10082                );
10083        default:
10084            assert(0);
10085            return NULL;
10086        }
10087
10088    if (PyUnicode_READY(substring) == -1)
10089        return NULL;
10090
10091    kind1 = PyUnicode_KIND(self);
10092    kind2 = PyUnicode_KIND(substring);
10093    len1 = PyUnicode_GET_LENGTH(self);
10094    len2 = PyUnicode_GET_LENGTH(substring);
10095    if (kind1 < kind2 || len1 < len2) {
10096        out = PyList_New(1);
10097        if (out == NULL)
10098            return NULL;
10099        Py_INCREF(self);
10100        PyList_SET_ITEM(out, 0, self);
10101        return out;
10102    }
10103    buf1 = PyUnicode_DATA(self);
10104    buf2 = PyUnicode_DATA(substring);
10105    if (kind2 != kind1) {
10106        buf2 = _PyUnicode_AsKind(substring, kind1);
10107        if (!buf2)
10108            return NULL;
10109    }
10110
10111    switch (kind1) {
10112    case PyUnicode_1BYTE_KIND:
10113        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10114            out = asciilib_split(
10115                self,  buf1, len1, buf2, len2, maxcount);
10116        else
10117            out = ucs1lib_split(
10118                self,  buf1, len1, buf2, len2, maxcount);
10119        break;
10120    case PyUnicode_2BYTE_KIND:
10121        out = ucs2lib_split(
10122            self,  buf1, len1, buf2, len2, maxcount);
10123        break;
10124    case PyUnicode_4BYTE_KIND:
10125        out = ucs4lib_split(
10126            self,  buf1, len1, buf2, len2, maxcount);
10127        break;
10128    default:
10129        out = NULL;
10130    }
10131    if (kind2 != kind1)
10132        PyMem_Free(buf2);
10133    return out;
10134}
10135
10136static PyObject *
10137rsplit(PyObject *self,
10138       PyObject *substring,
10139       Py_ssize_t maxcount)
10140{
10141    int kind1, kind2;
10142    void *buf1, *buf2;
10143    Py_ssize_t len1, len2;
10144    PyObject* out;
10145
10146    if (maxcount < 0)
10147        maxcount = PY_SSIZE_T_MAX;
10148
10149    if (PyUnicode_READY(self) == -1)
10150        return NULL;
10151
10152    if (substring == NULL)
10153        switch (PyUnicode_KIND(self)) {
10154        case PyUnicode_1BYTE_KIND:
10155            if (PyUnicode_IS_ASCII(self))
10156                return asciilib_rsplit_whitespace(
10157                    self,  PyUnicode_1BYTE_DATA(self),
10158                    PyUnicode_GET_LENGTH(self), maxcount
10159                    );
10160            else
10161                return ucs1lib_rsplit_whitespace(
10162                    self,  PyUnicode_1BYTE_DATA(self),
10163                    PyUnicode_GET_LENGTH(self), maxcount
10164                    );
10165        case PyUnicode_2BYTE_KIND:
10166            return ucs2lib_rsplit_whitespace(
10167                self,  PyUnicode_2BYTE_DATA(self),
10168                PyUnicode_GET_LENGTH(self), maxcount
10169                );
10170        case PyUnicode_4BYTE_KIND:
10171            return ucs4lib_rsplit_whitespace(
10172                self,  PyUnicode_4BYTE_DATA(self),
10173                PyUnicode_GET_LENGTH(self), maxcount
10174                );
10175        default:
10176            assert(0);
10177            return NULL;
10178        }
10179
10180    if (PyUnicode_READY(substring) == -1)
10181        return NULL;
10182
10183    kind1 = PyUnicode_KIND(self);
10184    kind2 = PyUnicode_KIND(substring);
10185    len1 = PyUnicode_GET_LENGTH(self);
10186    len2 = PyUnicode_GET_LENGTH(substring);
10187    if (kind1 < kind2 || len1 < len2) {
10188        out = PyList_New(1);
10189        if (out == NULL)
10190            return NULL;
10191        Py_INCREF(self);
10192        PyList_SET_ITEM(out, 0, self);
10193        return out;
10194    }
10195    buf1 = PyUnicode_DATA(self);
10196    buf2 = PyUnicode_DATA(substring);
10197    if (kind2 != kind1) {
10198        buf2 = _PyUnicode_AsKind(substring, kind1);
10199        if (!buf2)
10200            return NULL;
10201    }
10202
10203    switch (kind1) {
10204    case PyUnicode_1BYTE_KIND:
10205        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10206            out = asciilib_rsplit(
10207                self,  buf1, len1, buf2, len2, maxcount);
10208        else
10209            out = ucs1lib_rsplit(
10210                self,  buf1, len1, buf2, len2, maxcount);
10211        break;
10212    case PyUnicode_2BYTE_KIND:
10213        out = ucs2lib_rsplit(
10214            self,  buf1, len1, buf2, len2, maxcount);
10215        break;
10216    case PyUnicode_4BYTE_KIND:
10217        out = ucs4lib_rsplit(
10218            self,  buf1, len1, buf2, len2, maxcount);
10219        break;
10220    default:
10221        out = NULL;
10222    }
10223    if (kind2 != kind1)
10224        PyMem_Free(buf2);
10225    return out;
10226}
10227
10228static Py_ssize_t
10229anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10230            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10231{
10232    switch (kind) {
10233    case PyUnicode_1BYTE_KIND:
10234        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10235            return asciilib_find(buf1, len1, buf2, len2, offset);
10236        else
10237            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10238    case PyUnicode_2BYTE_KIND:
10239        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10240    case PyUnicode_4BYTE_KIND:
10241        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10242    }
10243    assert(0);
10244    return -1;
10245}
10246
10247static Py_ssize_t
10248anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10249             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10250{
10251    switch (kind) {
10252    case PyUnicode_1BYTE_KIND:
10253        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10254            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10255        else
10256            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10257    case PyUnicode_2BYTE_KIND:
10258        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10259    case PyUnicode_4BYTE_KIND:
10260        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10261    }
10262    assert(0);
10263    return 0;
10264}
10265
10266static void
10267replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10268                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10269{
10270    int kind = PyUnicode_KIND(u);
10271    void *data = PyUnicode_DATA(u);
10272    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10273    if (kind == PyUnicode_1BYTE_KIND) {
10274        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10275                                      (Py_UCS1 *)data + len,
10276                                      u1, u2, maxcount);
10277    }
10278    else if (kind == PyUnicode_2BYTE_KIND) {
10279        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10280                                      (Py_UCS2 *)data + len,
10281                                      u1, u2, maxcount);
10282    }
10283    else {
10284        assert(kind == PyUnicode_4BYTE_KIND);
10285        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10286                                      (Py_UCS4 *)data + len,
10287                                      u1, u2, maxcount);
10288    }
10289}
10290
10291static PyObject *
10292replace(PyObject *self, PyObject *str1,
10293        PyObject *str2, Py_ssize_t maxcount)
10294{
10295    PyObject *u;
10296    char *sbuf = PyUnicode_DATA(self);
10297    char *buf1 = PyUnicode_DATA(str1);
10298    char *buf2 = PyUnicode_DATA(str2);
10299    int srelease = 0, release1 = 0, release2 = 0;
10300    int skind = PyUnicode_KIND(self);
10301    int kind1 = PyUnicode_KIND(str1);
10302    int kind2 = PyUnicode_KIND(str2);
10303    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10304    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10305    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10306    int mayshrink;
10307    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10308
10309    if (maxcount < 0)
10310        maxcount = PY_SSIZE_T_MAX;
10311    else if (maxcount == 0 || slen == 0)
10312        goto nothing;
10313
10314    if (str1 == str2)
10315        goto nothing;
10316
10317    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10318    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10319    if (maxchar < maxchar_str1)
10320        /* substring too wide to be present */
10321        goto nothing;
10322    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10323    /* Replacing str1 with str2 may cause a maxchar reduction in the
10324       result string. */
10325    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10326    maxchar = Py_MAX(maxchar, maxchar_str2);
10327
10328    if (len1 == len2) {
10329        /* same length */
10330        if (len1 == 0)
10331            goto nothing;
10332        if (len1 == 1) {
10333            /* replace characters */
10334            Py_UCS4 u1, u2;
10335            Py_ssize_t pos;
10336
10337            u1 = PyUnicode_READ(kind1, buf1, 0);
10338            pos = findchar(sbuf, skind, slen, u1, 1);
10339            if (pos < 0)
10340                goto nothing;
10341            u2 = PyUnicode_READ(kind2, buf2, 0);
10342            u = PyUnicode_New(slen, maxchar);
10343            if (!u)
10344                goto error;
10345
10346            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10347            replace_1char_inplace(u, pos, u1, u2, maxcount);
10348        }
10349        else {
10350            int rkind = skind;
10351            char *res;
10352            Py_ssize_t i;
10353
10354            if (kind1 < rkind) {
10355                /* widen substring */
10356                buf1 = _PyUnicode_AsKind(str1, rkind);
10357                if (!buf1) goto error;
10358                release1 = 1;
10359            }
10360            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10361            if (i < 0)
10362                goto nothing;
10363            if (rkind > kind2) {
10364                /* widen replacement */
10365                buf2 = _PyUnicode_AsKind(str2, rkind);
10366                if (!buf2) goto error;
10367                release2 = 1;
10368            }
10369            else if (rkind < kind2) {
10370                /* widen self and buf1 */
10371                rkind = kind2;
10372                if (release1) PyMem_Free(buf1);
10373                release1 = 0;
10374                sbuf = _PyUnicode_AsKind(self, rkind);
10375                if (!sbuf) goto error;
10376                srelease = 1;
10377                buf1 = _PyUnicode_AsKind(str1, rkind);
10378                if (!buf1) goto error;
10379                release1 = 1;
10380            }
10381            u = PyUnicode_New(slen, maxchar);
10382            if (!u)
10383                goto error;
10384            assert(PyUnicode_KIND(u) == rkind);
10385            res = PyUnicode_DATA(u);
10386
10387            memcpy(res, sbuf, rkind * slen);
10388            /* change everything in-place, starting with this one */
10389            memcpy(res + rkind * i,
10390                   buf2,
10391                   rkind * len2);
10392            i += len1;
10393
10394            while ( --maxcount > 0) {
10395                i = anylib_find(rkind, self,
10396                                sbuf+rkind*i, slen-i,
10397                                str1, buf1, len1, i);
10398                if (i == -1)
10399                    break;
10400                memcpy(res + rkind * i,
10401                       buf2,
10402                       rkind * len2);
10403                i += len1;
10404            }
10405        }
10406    }
10407    else {
10408        Py_ssize_t n, i, j, ires;
10409        Py_ssize_t new_size;
10410        int rkind = skind;
10411        char *res;
10412
10413        if (kind1 < rkind) {
10414            /* widen substring */
10415            buf1 = _PyUnicode_AsKind(str1, rkind);
10416            if (!buf1) goto error;
10417            release1 = 1;
10418        }
10419        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10420        if (n == 0)
10421            goto nothing;
10422        if (kind2 < rkind) {
10423            /* widen replacement */
10424            buf2 = _PyUnicode_AsKind(str2, rkind);
10425            if (!buf2) goto error;
10426            release2 = 1;
10427        }
10428        else if (kind2 > rkind) {
10429            /* widen self and buf1 */
10430            rkind = kind2;
10431            sbuf = _PyUnicode_AsKind(self, rkind);
10432            if (!sbuf) goto error;
10433            srelease = 1;
10434            if (release1) PyMem_Free(buf1);
10435            release1 = 0;
10436            buf1 = _PyUnicode_AsKind(str1, rkind);
10437            if (!buf1) goto error;
10438            release1 = 1;
10439        }
10440        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10441           PyUnicode_GET_LENGTH(str1))); */
10442        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10443                PyErr_SetString(PyExc_OverflowError,
10444                                "replace string is too long");
10445                goto error;
10446        }
10447        new_size = slen + n * (len2 - len1);
10448        if (new_size == 0) {
10449            _Py_INCREF_UNICODE_EMPTY();
10450            if (!unicode_empty)
10451                goto error;
10452            u = unicode_empty;
10453            goto done;
10454        }
10455        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10456            PyErr_SetString(PyExc_OverflowError,
10457                            "replace string is too long");
10458            goto error;
10459        }
10460        u = PyUnicode_New(new_size, maxchar);
10461        if (!u)
10462            goto error;
10463        assert(PyUnicode_KIND(u) == rkind);
10464        res = PyUnicode_DATA(u);
10465        ires = i = 0;
10466        if (len1 > 0) {
10467            while (n-- > 0) {
10468                /* look for next match */
10469                j = anylib_find(rkind, self,
10470                                sbuf + rkind * i, slen-i,
10471                                str1, buf1, len1, i);
10472                if (j == -1)
10473                    break;
10474                else if (j > i) {
10475                    /* copy unchanged part [i:j] */
10476                    memcpy(res + rkind * ires,
10477                           sbuf + rkind * i,
10478                           rkind * (j-i));
10479                    ires += j - i;
10480                }
10481                /* copy substitution string */
10482                if (len2 > 0) {
10483                    memcpy(res + rkind * ires,
10484                           buf2,
10485                           rkind * len2);
10486                    ires += len2;
10487                }
10488                i = j + len1;
10489            }
10490            if (i < slen)
10491                /* copy tail [i:] */
10492                memcpy(res + rkind * ires,
10493                       sbuf + rkind * i,
10494                       rkind * (slen-i));
10495        }
10496        else {
10497            /* interleave */
10498            while (n > 0) {
10499                memcpy(res + rkind * ires,
10500                       buf2,
10501                       rkind * len2);
10502                ires += len2;
10503                if (--n <= 0)
10504                    break;
10505                memcpy(res + rkind * ires,
10506                       sbuf + rkind * i,
10507                       rkind);
10508                ires++;
10509                i++;
10510            }
10511            memcpy(res + rkind * ires,
10512                   sbuf + rkind * i,
10513                   rkind * (slen-i));
10514        }
10515    }
10516
10517    if (mayshrink) {
10518        unicode_adjust_maxchar(&u);
10519        if (u == NULL)
10520            goto error;
10521    }
10522
10523  done:
10524    if (srelease)
10525        PyMem_FREE(sbuf);
10526    if (release1)
10527        PyMem_FREE(buf1);
10528    if (release2)
10529        PyMem_FREE(buf2);
10530    assert(_PyUnicode_CheckConsistency(u, 1));
10531    return u;
10532
10533  nothing:
10534    /* nothing to replace; return original string (when possible) */
10535    if (srelease)
10536        PyMem_FREE(sbuf);
10537    if (release1)
10538        PyMem_FREE(buf1);
10539    if (release2)
10540        PyMem_FREE(buf2);
10541    return unicode_result_unchanged(self);
10542
10543  error:
10544    if (srelease && sbuf)
10545        PyMem_FREE(sbuf);
10546    if (release1 && buf1)
10547        PyMem_FREE(buf1);
10548    if (release2 && buf2)
10549        PyMem_FREE(buf2);
10550    return NULL;
10551}
10552
10553/* --- Unicode Object Methods --------------------------------------------- */
10554
10555PyDoc_STRVAR(title__doc__,
10556             "S.title() -> str\n\
10557\n\
10558Return a titlecased version of S, i.e. words start with title case\n\
10559characters, all remaining cased characters have lower case.");
10560
10561static PyObject*
10562unicode_title(PyObject *self)
10563{
10564    if (PyUnicode_READY(self) == -1)
10565        return NULL;
10566    return case_operation(self, do_title);
10567}
10568
10569PyDoc_STRVAR(capitalize__doc__,
10570             "S.capitalize() -> str\n\
10571\n\
10572Return a capitalized version of S, i.e. make the first character\n\
10573have upper case and the rest lower case.");
10574
10575static PyObject*
10576unicode_capitalize(PyObject *self)
10577{
10578    if (PyUnicode_READY(self) == -1)
10579        return NULL;
10580    if (PyUnicode_GET_LENGTH(self) == 0)
10581        return unicode_result_unchanged(self);
10582    return case_operation(self, do_capitalize);
10583}
10584
10585PyDoc_STRVAR(casefold__doc__,
10586             "S.casefold() -> str\n\
10587\n\
10588Return a version of S suitable for caseless comparisons.");
10589
10590static PyObject *
10591unicode_casefold(PyObject *self)
10592{
10593    if (PyUnicode_READY(self) == -1)
10594        return NULL;
10595    if (PyUnicode_IS_ASCII(self))
10596        return ascii_upper_or_lower(self, 1);
10597    return case_operation(self, do_casefold);
10598}
10599
10600
10601/* Argument converter.  Coerces to a single unicode character */
10602
10603static int
10604convert_uc(PyObject *obj, void *addr)
10605{
10606    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10607    PyObject *uniobj;
10608
10609    uniobj = PyUnicode_FromObject(obj);
10610    if (uniobj == NULL) {
10611        PyErr_SetString(PyExc_TypeError,
10612                        "The fill character cannot be converted to Unicode");
10613        return 0;
10614    }
10615    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10616        PyErr_SetString(PyExc_TypeError,
10617                        "The fill character must be exactly one character long");
10618        Py_DECREF(uniobj);
10619        return 0;
10620    }
10621    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10622    Py_DECREF(uniobj);
10623    return 1;
10624}
10625
10626PyDoc_STRVAR(center__doc__,
10627             "S.center(width[, fillchar]) -> str\n\
10628\n\
10629Return S centered in a string of length width. Padding is\n\
10630done using the specified fill character (default is a space)");
10631
10632static PyObject *
10633unicode_center(PyObject *self, PyObject *args)
10634{
10635    Py_ssize_t marg, left;
10636    Py_ssize_t width;
10637    Py_UCS4 fillchar = ' ';
10638
10639    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10640        return NULL;
10641
10642    if (PyUnicode_READY(self) == -1)
10643        return NULL;
10644
10645    if (PyUnicode_GET_LENGTH(self) >= width)
10646        return unicode_result_unchanged(self);
10647
10648    marg = width - PyUnicode_GET_LENGTH(self);
10649    left = marg / 2 + (marg & width & 1);
10650
10651    return pad(self, left, marg - left, fillchar);
10652}
10653
10654/* This function assumes that str1 and str2 are readied by the caller. */
10655
10656static int
10657unicode_compare(PyObject *str1, PyObject *str2)
10658{
10659#define COMPARE(TYPE1, TYPE2) \
10660    do { \
10661        TYPE1* p1 = (TYPE1 *)data1; \
10662        TYPE2* p2 = (TYPE2 *)data2; \
10663        TYPE1* end = p1 + len; \
10664        Py_UCS4 c1, c2; \
10665        for (; p1 != end; p1++, p2++) { \
10666            c1 = *p1; \
10667            c2 = *p2; \
10668            if (c1 != c2) \
10669                return (c1 < c2) ? -1 : 1; \
10670        } \
10671    } \
10672    while (0)
10673
10674    int kind1, kind2;
10675    void *data1, *data2;
10676    Py_ssize_t len1, len2, len;
10677
10678    kind1 = PyUnicode_KIND(str1);
10679    kind2 = PyUnicode_KIND(str2);
10680    data1 = PyUnicode_DATA(str1);
10681    data2 = PyUnicode_DATA(str2);
10682    len1 = PyUnicode_GET_LENGTH(str1);
10683    len2 = PyUnicode_GET_LENGTH(str2);
10684    len = Py_MIN(len1, len2);
10685
10686    switch(kind1) {
10687    case PyUnicode_1BYTE_KIND:
10688    {
10689        switch(kind2) {
10690        case PyUnicode_1BYTE_KIND:
10691        {
10692            int cmp = memcmp(data1, data2, len);
10693            /* normalize result of memcmp() into the range [-1; 1] */
10694            if (cmp < 0)
10695                return -1;
10696            if (cmp > 0)
10697                return 1;
10698            break;
10699        }
10700        case PyUnicode_2BYTE_KIND:
10701            COMPARE(Py_UCS1, Py_UCS2);
10702            break;
10703        case PyUnicode_4BYTE_KIND:
10704            COMPARE(Py_UCS1, Py_UCS4);
10705            break;
10706        default:
10707            assert(0);
10708        }
10709        break;
10710    }
10711    case PyUnicode_2BYTE_KIND:
10712    {
10713        switch(kind2) {
10714        case PyUnicode_1BYTE_KIND:
10715            COMPARE(Py_UCS2, Py_UCS1);
10716            break;
10717        case PyUnicode_2BYTE_KIND:
10718        {
10719            COMPARE(Py_UCS2, Py_UCS2);
10720            break;
10721        }
10722        case PyUnicode_4BYTE_KIND:
10723            COMPARE(Py_UCS2, Py_UCS4);
10724            break;
10725        default:
10726            assert(0);
10727        }
10728        break;
10729    }
10730    case PyUnicode_4BYTE_KIND:
10731    {
10732        switch(kind2) {
10733        case PyUnicode_1BYTE_KIND:
10734            COMPARE(Py_UCS4, Py_UCS1);
10735            break;
10736        case PyUnicode_2BYTE_KIND:
10737            COMPARE(Py_UCS4, Py_UCS2);
10738            break;
10739        case PyUnicode_4BYTE_KIND:
10740        {
10741#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10742            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10743            /* normalize result of wmemcmp() into the range [-1; 1] */
10744            if (cmp < 0)
10745                return -1;
10746            if (cmp > 0)
10747                return 1;
10748#else
10749            COMPARE(Py_UCS4, Py_UCS4);
10750#endif
10751            break;
10752        }
10753        default:
10754            assert(0);
10755        }
10756        break;
10757    }
10758    default:
10759        assert(0);
10760    }
10761
10762    if (len1 == len2)
10763        return 0;
10764    if (len1 < len2)
10765        return -1;
10766    else
10767        return 1;
10768
10769#undef COMPARE
10770}
10771
10772Py_LOCAL(int)
10773unicode_compare_eq(PyObject *str1, PyObject *str2)
10774{
10775    int kind;
10776    void *data1, *data2;
10777    Py_ssize_t len;
10778    int cmp;
10779
10780    len = PyUnicode_GET_LENGTH(str1);
10781    if (PyUnicode_GET_LENGTH(str2) != len)
10782        return 0;
10783    kind = PyUnicode_KIND(str1);
10784    if (PyUnicode_KIND(str2) != kind)
10785        return 0;
10786    data1 = PyUnicode_DATA(str1);
10787    data2 = PyUnicode_DATA(str2);
10788
10789    cmp = memcmp(data1, data2, len * kind);
10790    return (cmp == 0);
10791}
10792
10793
10794int
10795PyUnicode_Compare(PyObject *left, PyObject *right)
10796{
10797    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10798        if (PyUnicode_READY(left) == -1 ||
10799            PyUnicode_READY(right) == -1)
10800            return -1;
10801
10802        /* a string is equal to itself */
10803        if (left == right)
10804            return 0;
10805
10806        return unicode_compare(left, right);
10807    }
10808    PyErr_Format(PyExc_TypeError,
10809                 "Can't compare %.100s and %.100s",
10810                 left->ob_type->tp_name,
10811                 right->ob_type->tp_name);
10812    return -1;
10813}
10814
10815int
10816_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10817{
10818    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10819    if (right_str == NULL)
10820        return -1;
10821    return PyUnicode_Compare(left, right_str);
10822}
10823
10824int
10825PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10826{
10827    Py_ssize_t i;
10828    int kind;
10829    Py_UCS4 chr;
10830
10831    assert(_PyUnicode_CHECK(uni));
10832    if (PyUnicode_READY(uni) == -1)
10833        return -1;
10834    kind = PyUnicode_KIND(uni);
10835    if (kind == PyUnicode_1BYTE_KIND) {
10836        const void *data = PyUnicode_1BYTE_DATA(uni);
10837        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10838        size_t len, len2 = strlen(str);
10839        int cmp;
10840
10841        len = Py_MIN(len1, len2);
10842        cmp = memcmp(data, str, len);
10843        if (cmp != 0) {
10844            if (cmp < 0)
10845                return -1;
10846            else
10847                return 1;
10848        }
10849        if (len1 > len2)
10850            return 1; /* uni is longer */
10851        if (len1 < len2)
10852            return -1; /* str is longer */
10853        return 0;
10854    }
10855    else {
10856        void *data = PyUnicode_DATA(uni);
10857        /* Compare Unicode string and source character set string */
10858        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10859            if (chr != (unsigned char)str[i])
10860                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10861        /* This check keeps Python strings that end in '\0' from comparing equal
10862         to C strings identical up to that point. */
10863        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10864            return 1; /* uni is longer */
10865        if (str[i])
10866            return -1; /* str is longer */
10867        return 0;
10868    }
10869}
10870
10871
10872#define TEST_COND(cond)                         \
10873    ((cond) ? Py_True : Py_False)
10874
10875PyObject *
10876PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10877{
10878    int result;
10879    PyObject *v;
10880
10881    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10882        Py_RETURN_NOTIMPLEMENTED;
10883
10884    if (PyUnicode_READY(left) == -1 ||
10885        PyUnicode_READY(right) == -1)
10886        return NULL;
10887
10888    if (left == right) {
10889        switch (op) {
10890        case Py_EQ:
10891        case Py_LE:
10892        case Py_GE:
10893            /* a string is equal to itself */
10894            v = Py_True;
10895            break;
10896        case Py_NE:
10897        case Py_LT:
10898        case Py_GT:
10899            v = Py_False;
10900            break;
10901        default:
10902            PyErr_BadArgument();
10903            return NULL;
10904        }
10905    }
10906    else if (op == Py_EQ || op == Py_NE) {
10907        result = unicode_compare_eq(left, right);
10908        result ^= (op == Py_NE);
10909        v = TEST_COND(result);
10910    }
10911    else {
10912        result = unicode_compare(left, right);
10913
10914        /* Convert the return value to a Boolean */
10915        switch (op) {
10916        case Py_LE:
10917            v = TEST_COND(result <= 0);
10918            break;
10919        case Py_GE:
10920            v = TEST_COND(result >= 0);
10921            break;
10922        case Py_LT:
10923            v = TEST_COND(result == -1);
10924            break;
10925        case Py_GT:
10926            v = TEST_COND(result == 1);
10927            break;
10928        default:
10929            PyErr_BadArgument();
10930            return NULL;
10931        }
10932    }
10933    Py_INCREF(v);
10934    return v;
10935}
10936
10937int
10938_PyUnicode_EQ(PyObject *aa, PyObject *bb)
10939{
10940    return unicode_eq(aa, bb);
10941}
10942
10943int
10944PyUnicode_Contains(PyObject *container, PyObject *element)
10945{
10946    PyObject *str, *sub;
10947    int kind1, kind2;
10948    void *buf1, *buf2;
10949    Py_ssize_t len1, len2;
10950    int result;
10951
10952    /* Coerce the two arguments */
10953    sub = PyUnicode_FromObject(element);
10954    if (!sub) {
10955        PyErr_Format(PyExc_TypeError,
10956                     "'in <string>' requires string as left operand, not %s",
10957                     element->ob_type->tp_name);
10958        return -1;
10959    }
10960
10961    str = PyUnicode_FromObject(container);
10962    if (!str) {
10963        Py_DECREF(sub);
10964        return -1;
10965    }
10966
10967    kind1 = PyUnicode_KIND(str);
10968    kind2 = PyUnicode_KIND(sub);
10969    if (kind1 < kind2) {
10970        Py_DECREF(sub);
10971        Py_DECREF(str);
10972        return 0;
10973    }
10974    len1 = PyUnicode_GET_LENGTH(str);
10975    len2 = PyUnicode_GET_LENGTH(sub);
10976    if (len1 < len2) {
10977        Py_DECREF(sub);
10978        Py_DECREF(str);
10979        return 0;
10980    }
10981    buf1 = PyUnicode_DATA(str);
10982    buf2 = PyUnicode_DATA(sub);
10983    if (len2 == 1) {
10984        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
10985        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
10986        Py_DECREF(sub);
10987        Py_DECREF(str);
10988        return result;
10989    }
10990    if (kind2 != kind1) {
10991        buf2 = _PyUnicode_AsKind(sub, kind1);
10992        if (!buf2) {
10993            Py_DECREF(sub);
10994            Py_DECREF(str);
10995            return -1;
10996        }
10997    }
10998
10999    switch (kind1) {
11000    case PyUnicode_1BYTE_KIND:
11001        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11002        break;
11003    case PyUnicode_2BYTE_KIND:
11004        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11005        break;
11006    case PyUnicode_4BYTE_KIND:
11007        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11008        break;
11009    default:
11010        result = -1;
11011        assert(0);
11012    }
11013
11014    Py_DECREF(str);
11015    Py_DECREF(sub);
11016
11017    if (kind2 != kind1)
11018        PyMem_Free(buf2);
11019
11020    return result;
11021}
11022
11023/* Concat to string or Unicode object giving a new Unicode object. */
11024
11025PyObject *
11026PyUnicode_Concat(PyObject *left, PyObject *right)
11027{
11028    PyObject *u = NULL, *v = NULL, *w;
11029    Py_UCS4 maxchar, maxchar2;
11030    Py_ssize_t u_len, v_len, new_len;
11031
11032    /* Coerce the two arguments */
11033    u = PyUnicode_FromObject(left);
11034    if (u == NULL)
11035        goto onError;
11036    v = PyUnicode_FromObject(right);
11037    if (v == NULL)
11038        goto onError;
11039
11040    /* Shortcuts */
11041    if (v == unicode_empty) {
11042        Py_DECREF(v);
11043        return u;
11044    }
11045    if (u == unicode_empty) {
11046        Py_DECREF(u);
11047        return v;
11048    }
11049
11050    u_len = PyUnicode_GET_LENGTH(u);
11051    v_len = PyUnicode_GET_LENGTH(v);
11052    if (u_len > PY_SSIZE_T_MAX - v_len) {
11053        PyErr_SetString(PyExc_OverflowError,
11054                        "strings are too large to concat");
11055        goto onError;
11056    }
11057    new_len = u_len + v_len;
11058
11059    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11060    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11061    maxchar = Py_MAX(maxchar, maxchar2);
11062
11063    /* Concat the two Unicode strings */
11064    w = PyUnicode_New(new_len, maxchar);
11065    if (w == NULL)
11066        goto onError;
11067    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11068    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11069    Py_DECREF(u);
11070    Py_DECREF(v);
11071    assert(_PyUnicode_CheckConsistency(w, 1));
11072    return w;
11073
11074  onError:
11075    Py_XDECREF(u);
11076    Py_XDECREF(v);
11077    return NULL;
11078}
11079
11080void
11081PyUnicode_Append(PyObject **p_left, PyObject *right)
11082{
11083    PyObject *left, *res;
11084    Py_UCS4 maxchar, maxchar2;
11085    Py_ssize_t left_len, right_len, new_len;
11086
11087    if (p_left == NULL) {
11088        if (!PyErr_Occurred())
11089            PyErr_BadInternalCall();
11090        return;
11091    }
11092    left = *p_left;
11093    if (right == NULL || left == NULL
11094        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11095        if (!PyErr_Occurred())
11096            PyErr_BadInternalCall();
11097        goto error;
11098    }
11099
11100    if (PyUnicode_READY(left) == -1)
11101        goto error;
11102    if (PyUnicode_READY(right) == -1)
11103        goto error;
11104
11105    /* Shortcuts */
11106    if (left == unicode_empty) {
11107        Py_DECREF(left);
11108        Py_INCREF(right);
11109        *p_left = right;
11110        return;
11111    }
11112    if (right == unicode_empty)
11113        return;
11114
11115    left_len = PyUnicode_GET_LENGTH(left);
11116    right_len = PyUnicode_GET_LENGTH(right);
11117    if (left_len > PY_SSIZE_T_MAX - right_len) {
11118        PyErr_SetString(PyExc_OverflowError,
11119                        "strings are too large to concat");
11120        goto error;
11121    }
11122    new_len = left_len + right_len;
11123
11124    if (unicode_modifiable(left)
11125        && PyUnicode_CheckExact(right)
11126        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11127        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11128           to change the structure size, but characters are stored just after
11129           the structure, and so it requires to move all characters which is
11130           not so different than duplicating the string. */
11131        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11132    {
11133        /* append inplace */
11134        if (unicode_resize(p_left, new_len) != 0)
11135            goto error;
11136
11137        /* copy 'right' into the newly allocated area of 'left' */
11138        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11139    }
11140    else {
11141        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11142        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11143        maxchar = Py_MAX(maxchar, maxchar2);
11144
11145        /* Concat the two Unicode strings */
11146        res = PyUnicode_New(new_len, maxchar);
11147        if (res == NULL)
11148            goto error;
11149        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11150        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11151        Py_DECREF(left);
11152        *p_left = res;
11153    }
11154    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11155    return;
11156
11157error:
11158    Py_CLEAR(*p_left);
11159}
11160
11161void
11162PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11163{
11164    PyUnicode_Append(pleft, right);
11165    Py_XDECREF(right);
11166}
11167
11168PyDoc_STRVAR(count__doc__,
11169             "S.count(sub[, start[, end]]) -> int\n\
11170\n\
11171Return the number of non-overlapping occurrences of substring sub in\n\
11172string S[start:end].  Optional arguments start and end are\n\
11173interpreted as in slice notation.");
11174
11175static PyObject *
11176unicode_count(PyObject *self, PyObject *args)
11177{
11178    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11179    Py_ssize_t start = 0;
11180    Py_ssize_t end = PY_SSIZE_T_MAX;
11181    PyObject *result;
11182    int kind1, kind2;
11183    void *buf1, *buf2;
11184    Py_ssize_t len1, len2, iresult;
11185
11186    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11187                                            &start, &end))
11188        return NULL;
11189
11190    kind1 = PyUnicode_KIND(self);
11191    kind2 = PyUnicode_KIND(substring);
11192    if (kind1 < kind2) {
11193        Py_DECREF(substring);
11194        return PyLong_FromLong(0);
11195    }
11196    len1 = PyUnicode_GET_LENGTH(self);
11197    len2 = PyUnicode_GET_LENGTH(substring);
11198    ADJUST_INDICES(start, end, len1);
11199    if (end - start < len2) {
11200        Py_DECREF(substring);
11201        return PyLong_FromLong(0);
11202    }
11203    buf1 = PyUnicode_DATA(self);
11204    buf2 = PyUnicode_DATA(substring);
11205    if (kind2 != kind1) {
11206        buf2 = _PyUnicode_AsKind(substring, kind1);
11207        if (!buf2) {
11208            Py_DECREF(substring);
11209            return NULL;
11210        }
11211    }
11212    switch (kind1) {
11213    case PyUnicode_1BYTE_KIND:
11214        iresult = ucs1lib_count(
11215            ((Py_UCS1*)buf1) + start, end - start,
11216            buf2, len2, PY_SSIZE_T_MAX
11217            );
11218        break;
11219    case PyUnicode_2BYTE_KIND:
11220        iresult = ucs2lib_count(
11221            ((Py_UCS2*)buf1) + start, end - start,
11222            buf2, len2, PY_SSIZE_T_MAX
11223            );
11224        break;
11225    case PyUnicode_4BYTE_KIND:
11226        iresult = ucs4lib_count(
11227            ((Py_UCS4*)buf1) + start, end - start,
11228            buf2, len2, PY_SSIZE_T_MAX
11229            );
11230        break;
11231    default:
11232        assert(0); iresult = 0;
11233    }
11234
11235    result = PyLong_FromSsize_t(iresult);
11236
11237    if (kind2 != kind1)
11238        PyMem_Free(buf2);
11239
11240    Py_DECREF(substring);
11241
11242    return result;
11243}
11244
11245PyDoc_STRVAR(encode__doc__,
11246             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11247\n\
11248Encode S using the codec registered for encoding. Default encoding\n\
11249is 'utf-8'. errors may be given to set a different error\n\
11250handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11251a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11252'xmlcharrefreplace' as well as any other name registered with\n\
11253codecs.register_error that can handle UnicodeEncodeErrors.");
11254
11255static PyObject *
11256unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11257{
11258    static char *kwlist[] = {"encoding", "errors", 0};
11259    char *encoding = NULL;
11260    char *errors = NULL;
11261
11262    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11263                                     kwlist, &encoding, &errors))
11264        return NULL;
11265    return PyUnicode_AsEncodedString(self, encoding, errors);
11266}
11267
11268PyDoc_STRVAR(expandtabs__doc__,
11269             "S.expandtabs(tabsize=8) -> str\n\
11270\n\
11271Return a copy of S where all tab characters are expanded using spaces.\n\
11272If tabsize is not given, a tab size of 8 characters is assumed.");
11273
11274static PyObject*
11275unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11276{
11277    Py_ssize_t i, j, line_pos, src_len, incr;
11278    Py_UCS4 ch;
11279    PyObject *u;
11280    void *src_data, *dest_data;
11281    static char *kwlist[] = {"tabsize", 0};
11282    int tabsize = 8;
11283    int kind;
11284    int found;
11285
11286    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11287                                     kwlist, &tabsize))
11288        return NULL;
11289
11290    if (PyUnicode_READY(self) == -1)
11291        return NULL;
11292
11293    /* First pass: determine size of output string */
11294    src_len = PyUnicode_GET_LENGTH(self);
11295    i = j = line_pos = 0;
11296    kind = PyUnicode_KIND(self);
11297    src_data = PyUnicode_DATA(self);
11298    found = 0;
11299    for (; i < src_len; i++) {
11300        ch = PyUnicode_READ(kind, src_data, i);
11301        if (ch == '\t') {
11302            found = 1;
11303            if (tabsize > 0) {
11304                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11305                if (j > PY_SSIZE_T_MAX - incr)
11306                    goto overflow;
11307                line_pos += incr;
11308                j += incr;
11309            }
11310        }
11311        else {
11312            if (j > PY_SSIZE_T_MAX - 1)
11313                goto overflow;
11314            line_pos++;
11315            j++;
11316            if (ch == '\n' || ch == '\r')
11317                line_pos = 0;
11318        }
11319    }
11320    if (!found)
11321        return unicode_result_unchanged(self);
11322
11323    /* Second pass: create output string and fill it */
11324    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11325    if (!u)
11326        return NULL;
11327    dest_data = PyUnicode_DATA(u);
11328
11329    i = j = line_pos = 0;
11330
11331    for (; i < src_len; i++) {
11332        ch = PyUnicode_READ(kind, src_data, i);
11333        if (ch == '\t') {
11334            if (tabsize > 0) {
11335                incr = tabsize - (line_pos % tabsize);
11336                line_pos += incr;
11337                FILL(kind, dest_data, ' ', j, incr);
11338                j += incr;
11339            }
11340        }
11341        else {
11342            line_pos++;
11343            PyUnicode_WRITE(kind, dest_data, j, ch);
11344            j++;
11345            if (ch == '\n' || ch == '\r')
11346                line_pos = 0;
11347        }
11348    }
11349    assert (j == PyUnicode_GET_LENGTH(u));
11350    return unicode_result(u);
11351
11352  overflow:
11353    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11354    return NULL;
11355}
11356
11357PyDoc_STRVAR(find__doc__,
11358             "S.find(sub[, start[, end]]) -> int\n\
11359\n\
11360Return the lowest index in S where substring sub is found,\n\
11361such that sub is contained within S[start:end].  Optional\n\
11362arguments start and end are interpreted as in slice notation.\n\
11363\n\
11364Return -1 on failure.");
11365
11366static PyObject *
11367unicode_find(PyObject *self, PyObject *args)
11368{
11369    /* initialize variables to prevent gcc warning */
11370    PyObject *substring = NULL;
11371    Py_ssize_t start = 0;
11372    Py_ssize_t end = 0;
11373    Py_ssize_t result;
11374
11375    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11376                                            &start, &end))
11377        return NULL;
11378
11379    if (PyUnicode_READY(self) == -1) {
11380        Py_DECREF(substring);
11381        return NULL;
11382    }
11383    if (PyUnicode_READY(substring) == -1) {
11384        Py_DECREF(substring);
11385        return NULL;
11386    }
11387
11388    result = any_find_slice(1, self, substring, start, end);
11389
11390    Py_DECREF(substring);
11391
11392    if (result == -2)
11393        return NULL;
11394
11395    return PyLong_FromSsize_t(result);
11396}
11397
11398static PyObject *
11399unicode_getitem(PyObject *self, Py_ssize_t index)
11400{
11401    void *data;
11402    enum PyUnicode_Kind kind;
11403    Py_UCS4 ch;
11404
11405    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11406        PyErr_BadArgument();
11407        return NULL;
11408    }
11409    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11410        PyErr_SetString(PyExc_IndexError, "string index out of range");
11411        return NULL;
11412    }
11413    kind = PyUnicode_KIND(self);
11414    data = PyUnicode_DATA(self);
11415    ch = PyUnicode_READ(kind, data, index);
11416    return unicode_char(ch);
11417}
11418
11419/* Believe it or not, this produces the same value for ASCII strings
11420   as bytes_hash(). */
11421static Py_hash_t
11422unicode_hash(PyObject *self)
11423{
11424    Py_ssize_t len;
11425    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11426
11427#ifdef Py_DEBUG
11428    assert(_Py_HashSecret_Initialized);
11429#endif
11430    if (_PyUnicode_HASH(self) != -1)
11431        return _PyUnicode_HASH(self);
11432    if (PyUnicode_READY(self) == -1)
11433        return -1;
11434    len = PyUnicode_GET_LENGTH(self);
11435    /*
11436      We make the hash of the empty string be 0, rather than using
11437      (prefix ^ suffix), since this slightly obfuscates the hash secret
11438    */
11439    if (len == 0) {
11440        _PyUnicode_HASH(self) = 0;
11441        return 0;
11442    }
11443    x = _Py_HashBytes(PyUnicode_DATA(self),
11444                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11445    _PyUnicode_HASH(self) = x;
11446    return x;
11447}
11448
11449PyDoc_STRVAR(index__doc__,
11450             "S.index(sub[, start[, end]]) -> int\n\
11451\n\
11452Like S.find() but raise ValueError when the substring is not found.");
11453
11454static PyObject *
11455unicode_index(PyObject *self, PyObject *args)
11456{
11457    /* initialize variables to prevent gcc warning */
11458    Py_ssize_t result;
11459    PyObject *substring = NULL;
11460    Py_ssize_t start = 0;
11461    Py_ssize_t end = 0;
11462
11463    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11464                                            &start, &end))
11465        return NULL;
11466
11467    if (PyUnicode_READY(self) == -1) {
11468        Py_DECREF(substring);
11469        return NULL;
11470    }
11471    if (PyUnicode_READY(substring) == -1) {
11472        Py_DECREF(substring);
11473        return NULL;
11474    }
11475
11476    result = any_find_slice(1, self, substring, start, end);
11477
11478    Py_DECREF(substring);
11479
11480    if (result == -2)
11481        return NULL;
11482
11483    if (result < 0) {
11484        PyErr_SetString(PyExc_ValueError, "substring not found");
11485        return NULL;
11486    }
11487
11488    return PyLong_FromSsize_t(result);
11489}
11490
11491PyDoc_STRVAR(islower__doc__,
11492             "S.islower() -> bool\n\
11493\n\
11494Return True if all cased characters in S are lowercase and there is\n\
11495at least one cased character in S, False otherwise.");
11496
11497static PyObject*
11498unicode_islower(PyObject *self)
11499{
11500    Py_ssize_t i, length;
11501    int kind;
11502    void *data;
11503    int cased;
11504
11505    if (PyUnicode_READY(self) == -1)
11506        return NULL;
11507    length = PyUnicode_GET_LENGTH(self);
11508    kind = PyUnicode_KIND(self);
11509    data = PyUnicode_DATA(self);
11510
11511    /* Shortcut for single character strings */
11512    if (length == 1)
11513        return PyBool_FromLong(
11514            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11515
11516    /* Special case for empty strings */
11517    if (length == 0)
11518        return PyBool_FromLong(0);
11519
11520    cased = 0;
11521    for (i = 0; i < length; i++) {
11522        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11523
11524        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11525            return PyBool_FromLong(0);
11526        else if (!cased && Py_UNICODE_ISLOWER(ch))
11527            cased = 1;
11528    }
11529    return PyBool_FromLong(cased);
11530}
11531
11532PyDoc_STRVAR(isupper__doc__,
11533             "S.isupper() -> bool\n\
11534\n\
11535Return True if all cased characters in S are uppercase and there is\n\
11536at least one cased character in S, False otherwise.");
11537
11538static PyObject*
11539unicode_isupper(PyObject *self)
11540{
11541    Py_ssize_t i, length;
11542    int kind;
11543    void *data;
11544    int cased;
11545
11546    if (PyUnicode_READY(self) == -1)
11547        return NULL;
11548    length = PyUnicode_GET_LENGTH(self);
11549    kind = PyUnicode_KIND(self);
11550    data = PyUnicode_DATA(self);
11551
11552    /* Shortcut for single character strings */
11553    if (length == 1)
11554        return PyBool_FromLong(
11555            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11556
11557    /* Special case for empty strings */
11558    if (length == 0)
11559        return PyBool_FromLong(0);
11560
11561    cased = 0;
11562    for (i = 0; i < length; i++) {
11563        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11564
11565        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11566            return PyBool_FromLong(0);
11567        else if (!cased && Py_UNICODE_ISUPPER(ch))
11568            cased = 1;
11569    }
11570    return PyBool_FromLong(cased);
11571}
11572
11573PyDoc_STRVAR(istitle__doc__,
11574             "S.istitle() -> bool\n\
11575\n\
11576Return True if S is a titlecased string and there is at least one\n\
11577character in S, i.e. upper- and titlecase characters may only\n\
11578follow uncased characters and lowercase characters only cased ones.\n\
11579Return False otherwise.");
11580
11581static PyObject*
11582unicode_istitle(PyObject *self)
11583{
11584    Py_ssize_t i, length;
11585    int kind;
11586    void *data;
11587    int cased, previous_is_cased;
11588
11589    if (PyUnicode_READY(self) == -1)
11590        return NULL;
11591    length = PyUnicode_GET_LENGTH(self);
11592    kind = PyUnicode_KIND(self);
11593    data = PyUnicode_DATA(self);
11594
11595    /* Shortcut for single character strings */
11596    if (length == 1) {
11597        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11598        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11599                               (Py_UNICODE_ISUPPER(ch) != 0));
11600    }
11601
11602    /* Special case for empty strings */
11603    if (length == 0)
11604        return PyBool_FromLong(0);
11605
11606    cased = 0;
11607    previous_is_cased = 0;
11608    for (i = 0; i < length; i++) {
11609        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11610
11611        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11612            if (previous_is_cased)
11613                return PyBool_FromLong(0);
11614            previous_is_cased = 1;
11615            cased = 1;
11616        }
11617        else if (Py_UNICODE_ISLOWER(ch)) {
11618            if (!previous_is_cased)
11619                return PyBool_FromLong(0);
11620            previous_is_cased = 1;
11621            cased = 1;
11622        }
11623        else
11624            previous_is_cased = 0;
11625    }
11626    return PyBool_FromLong(cased);
11627}
11628
11629PyDoc_STRVAR(isspace__doc__,
11630             "S.isspace() -> bool\n\
11631\n\
11632Return True if all characters in S are whitespace\n\
11633and there is at least one character in S, False otherwise.");
11634
11635static PyObject*
11636unicode_isspace(PyObject *self)
11637{
11638    Py_ssize_t i, length;
11639    int kind;
11640    void *data;
11641
11642    if (PyUnicode_READY(self) == -1)
11643        return NULL;
11644    length = PyUnicode_GET_LENGTH(self);
11645    kind = PyUnicode_KIND(self);
11646    data = PyUnicode_DATA(self);
11647
11648    /* Shortcut for single character strings */
11649    if (length == 1)
11650        return PyBool_FromLong(
11651            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11652
11653    /* Special case for empty strings */
11654    if (length == 0)
11655        return PyBool_FromLong(0);
11656
11657    for (i = 0; i < length; i++) {
11658        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11659        if (!Py_UNICODE_ISSPACE(ch))
11660            return PyBool_FromLong(0);
11661    }
11662    return PyBool_FromLong(1);
11663}
11664
11665PyDoc_STRVAR(isalpha__doc__,
11666             "S.isalpha() -> bool\n\
11667\n\
11668Return True if all characters in S are alphabetic\n\
11669and there is at least one character in S, False otherwise.");
11670
11671static PyObject*
11672unicode_isalpha(PyObject *self)
11673{
11674    Py_ssize_t i, length;
11675    int kind;
11676    void *data;
11677
11678    if (PyUnicode_READY(self) == -1)
11679        return NULL;
11680    length = PyUnicode_GET_LENGTH(self);
11681    kind = PyUnicode_KIND(self);
11682    data = PyUnicode_DATA(self);
11683
11684    /* Shortcut for single character strings */
11685    if (length == 1)
11686        return PyBool_FromLong(
11687            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11688
11689    /* Special case for empty strings */
11690    if (length == 0)
11691        return PyBool_FromLong(0);
11692
11693    for (i = 0; i < length; i++) {
11694        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11695            return PyBool_FromLong(0);
11696    }
11697    return PyBool_FromLong(1);
11698}
11699
11700PyDoc_STRVAR(isalnum__doc__,
11701             "S.isalnum() -> bool\n\
11702\n\
11703Return True if all characters in S are alphanumeric\n\
11704and there is at least one character in S, False otherwise.");
11705
11706static PyObject*
11707unicode_isalnum(PyObject *self)
11708{
11709    int kind;
11710    void *data;
11711    Py_ssize_t len, i;
11712
11713    if (PyUnicode_READY(self) == -1)
11714        return NULL;
11715
11716    kind = PyUnicode_KIND(self);
11717    data = PyUnicode_DATA(self);
11718    len = PyUnicode_GET_LENGTH(self);
11719
11720    /* Shortcut for single character strings */
11721    if (len == 1) {
11722        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11723        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11724    }
11725
11726    /* Special case for empty strings */
11727    if (len == 0)
11728        return PyBool_FromLong(0);
11729
11730    for (i = 0; i < len; i++) {
11731        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11732        if (!Py_UNICODE_ISALNUM(ch))
11733            return PyBool_FromLong(0);
11734    }
11735    return PyBool_FromLong(1);
11736}
11737
11738PyDoc_STRVAR(isdecimal__doc__,
11739             "S.isdecimal() -> bool\n\
11740\n\
11741Return True if there are only decimal characters in S,\n\
11742False otherwise.");
11743
11744static PyObject*
11745unicode_isdecimal(PyObject *self)
11746{
11747    Py_ssize_t i, length;
11748    int kind;
11749    void *data;
11750
11751    if (PyUnicode_READY(self) == -1)
11752        return NULL;
11753    length = PyUnicode_GET_LENGTH(self);
11754    kind = PyUnicode_KIND(self);
11755    data = PyUnicode_DATA(self);
11756
11757    /* Shortcut for single character strings */
11758    if (length == 1)
11759        return PyBool_FromLong(
11760            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11761
11762    /* Special case for empty strings */
11763    if (length == 0)
11764        return PyBool_FromLong(0);
11765
11766    for (i = 0; i < length; i++) {
11767        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11768            return PyBool_FromLong(0);
11769    }
11770    return PyBool_FromLong(1);
11771}
11772
11773PyDoc_STRVAR(isdigit__doc__,
11774             "S.isdigit() -> bool\n\
11775\n\
11776Return True if all characters in S are digits\n\
11777and there is at least one character in S, False otherwise.");
11778
11779static PyObject*
11780unicode_isdigit(PyObject *self)
11781{
11782    Py_ssize_t i, length;
11783    int kind;
11784    void *data;
11785
11786    if (PyUnicode_READY(self) == -1)
11787        return NULL;
11788    length = PyUnicode_GET_LENGTH(self);
11789    kind = PyUnicode_KIND(self);
11790    data = PyUnicode_DATA(self);
11791
11792    /* Shortcut for single character strings */
11793    if (length == 1) {
11794        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11795        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11796    }
11797
11798    /* Special case for empty strings */
11799    if (length == 0)
11800        return PyBool_FromLong(0);
11801
11802    for (i = 0; i < length; i++) {
11803        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11804            return PyBool_FromLong(0);
11805    }
11806    return PyBool_FromLong(1);
11807}
11808
11809PyDoc_STRVAR(isnumeric__doc__,
11810             "S.isnumeric() -> bool\n\
11811\n\
11812Return True if there are only numeric characters in S,\n\
11813False otherwise.");
11814
11815static PyObject*
11816unicode_isnumeric(PyObject *self)
11817{
11818    Py_ssize_t i, length;
11819    int kind;
11820    void *data;
11821
11822    if (PyUnicode_READY(self) == -1)
11823        return NULL;
11824    length = PyUnicode_GET_LENGTH(self);
11825    kind = PyUnicode_KIND(self);
11826    data = PyUnicode_DATA(self);
11827
11828    /* Shortcut for single character strings */
11829    if (length == 1)
11830        return PyBool_FromLong(
11831            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11832
11833    /* Special case for empty strings */
11834    if (length == 0)
11835        return PyBool_FromLong(0);
11836
11837    for (i = 0; i < length; i++) {
11838        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11839            return PyBool_FromLong(0);
11840    }
11841    return PyBool_FromLong(1);
11842}
11843
11844int
11845PyUnicode_IsIdentifier(PyObject *self)
11846{
11847    int kind;
11848    void *data;
11849    Py_ssize_t i;
11850    Py_UCS4 first;
11851
11852    if (PyUnicode_READY(self) == -1) {
11853        Py_FatalError("identifier not ready");
11854        return 0;
11855    }
11856
11857    /* Special case for empty strings */
11858    if (PyUnicode_GET_LENGTH(self) == 0)
11859        return 0;
11860    kind = PyUnicode_KIND(self);
11861    data = PyUnicode_DATA(self);
11862
11863    /* PEP 3131 says that the first character must be in
11864       XID_Start and subsequent characters in XID_Continue,
11865       and for the ASCII range, the 2.x rules apply (i.e
11866       start with letters and underscore, continue with
11867       letters, digits, underscore). However, given the current
11868       definition of XID_Start and XID_Continue, it is sufficient
11869       to check just for these, except that _ must be allowed
11870       as starting an identifier.  */
11871    first = PyUnicode_READ(kind, data, 0);
11872    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11873        return 0;
11874
11875    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11876        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11877            return 0;
11878    return 1;
11879}
11880
11881PyDoc_STRVAR(isidentifier__doc__,
11882             "S.isidentifier() -> bool\n\
11883\n\
11884Return True if S is a valid identifier according\n\
11885to the language definition.\n\
11886\n\
11887Use keyword.iskeyword() to test for reserved identifiers\n\
11888such as \"def\" and \"class\".\n");
11889
11890static PyObject*
11891unicode_isidentifier(PyObject *self)
11892{
11893    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11894}
11895
11896PyDoc_STRVAR(isprintable__doc__,
11897             "S.isprintable() -> bool\n\
11898\n\
11899Return True if all characters in S are considered\n\
11900printable in repr() or S is empty, False otherwise.");
11901
11902static PyObject*
11903unicode_isprintable(PyObject *self)
11904{
11905    Py_ssize_t i, length;
11906    int kind;
11907    void *data;
11908
11909    if (PyUnicode_READY(self) == -1)
11910        return NULL;
11911    length = PyUnicode_GET_LENGTH(self);
11912    kind = PyUnicode_KIND(self);
11913    data = PyUnicode_DATA(self);
11914
11915    /* Shortcut for single character strings */
11916    if (length == 1)
11917        return PyBool_FromLong(
11918            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11919
11920    for (i = 0; i < length; i++) {
11921        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11922            Py_RETURN_FALSE;
11923        }
11924    }
11925    Py_RETURN_TRUE;
11926}
11927
11928PyDoc_STRVAR(join__doc__,
11929             "S.join(iterable) -> str\n\
11930\n\
11931Return a string which is the concatenation of the strings in the\n\
11932iterable.  The separator between elements is S.");
11933
11934static PyObject*
11935unicode_join(PyObject *self, PyObject *data)
11936{
11937    return PyUnicode_Join(self, data);
11938}
11939
11940static Py_ssize_t
11941unicode_length(PyObject *self)
11942{
11943    if (PyUnicode_READY(self) == -1)
11944        return -1;
11945    return PyUnicode_GET_LENGTH(self);
11946}
11947
11948PyDoc_STRVAR(ljust__doc__,
11949             "S.ljust(width[, fillchar]) -> str\n\
11950\n\
11951Return S left-justified in a Unicode string of length width. Padding is\n\
11952done using the specified fill character (default is a space).");
11953
11954static PyObject *
11955unicode_ljust(PyObject *self, PyObject *args)
11956{
11957    Py_ssize_t width;
11958    Py_UCS4 fillchar = ' ';
11959
11960    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11961        return NULL;
11962
11963    if (PyUnicode_READY(self) == -1)
11964        return NULL;
11965
11966    if (PyUnicode_GET_LENGTH(self) >= width)
11967        return unicode_result_unchanged(self);
11968
11969    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11970}
11971
11972PyDoc_STRVAR(lower__doc__,
11973             "S.lower() -> str\n\
11974\n\
11975Return a copy of the string S converted to lowercase.");
11976
11977static PyObject*
11978unicode_lower(PyObject *self)
11979{
11980    if (PyUnicode_READY(self) == -1)
11981        return NULL;
11982    if (PyUnicode_IS_ASCII(self))
11983        return ascii_upper_or_lower(self, 1);
11984    return case_operation(self, do_lower);
11985}
11986
/* Strip modes.  Each value doubles as the index of the matching entry
   in stripformat[] below. */
#define LEFTSTRIP  0
#define RIGHTSTRIP 1
#define BOTHSTRIP  2

/* PyArg_ParseTuple() format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i] + 3)
11996/* externally visible for str.strip(unicode) */
11997PyObject *
11998_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11999{
12000    void *data;
12001    int kind;
12002    Py_ssize_t i, j, len;
12003    BLOOM_MASK sepmask;
12004    Py_ssize_t seplen;
12005
12006    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12007        return NULL;
12008
12009    kind = PyUnicode_KIND(self);
12010    data = PyUnicode_DATA(self);
12011    len = PyUnicode_GET_LENGTH(self);
12012    seplen = PyUnicode_GET_LENGTH(sepobj);
12013    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12014                              PyUnicode_DATA(sepobj),
12015                              seplen);
12016
12017    i = 0;
12018    if (striptype != RIGHTSTRIP) {
12019        while (i < len) {
12020            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12021            if (!BLOOM(sepmask, ch))
12022                break;
12023            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12024                break;
12025            i++;
12026        }
12027    }
12028
12029    j = len;
12030    if (striptype != LEFTSTRIP) {
12031        j--;
12032        while (j >= i) {
12033            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12034            if (!BLOOM(sepmask, ch))
12035                break;
12036            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12037                break;
12038            j--;
12039        }
12040
12041        j++;
12042    }
12043
12044    return PyUnicode_Substring(self, i, j);
12045}
12046
12047PyObject*
12048PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12049{
12050    unsigned char *data;
12051    int kind;
12052    Py_ssize_t length;
12053
12054    if (PyUnicode_READY(self) == -1)
12055        return NULL;
12056
12057    length = PyUnicode_GET_LENGTH(self);
12058    end = Py_MIN(end, length);
12059
12060    if (start == 0 && end == length)
12061        return unicode_result_unchanged(self);
12062
12063    if (start < 0 || end < 0) {
12064        PyErr_SetString(PyExc_IndexError, "string index out of range");
12065        return NULL;
12066    }
12067    if (start >= length || end < start)
12068        _Py_RETURN_UNICODE_EMPTY();
12069
12070    length = end - start;
12071    if (PyUnicode_IS_ASCII(self)) {
12072        data = PyUnicode_1BYTE_DATA(self);
12073        return _PyUnicode_FromASCII((char*)(data + start), length);
12074    }
12075    else {
12076        kind = PyUnicode_KIND(self);
12077        data = PyUnicode_1BYTE_DATA(self);
12078        return PyUnicode_FromKindAndData(kind,
12079                                         data + kind * start,
12080                                         length);
12081    }
12082}
12083
12084static PyObject *
12085do_strip(PyObject *self, int striptype)
12086{
12087    Py_ssize_t len, i, j;
12088
12089    if (PyUnicode_READY(self) == -1)
12090        return NULL;
12091
12092    len = PyUnicode_GET_LENGTH(self);
12093
12094    if (PyUnicode_IS_ASCII(self)) {
12095        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12096
12097        i = 0;
12098        if (striptype != RIGHTSTRIP) {
12099            while (i < len) {
12100                Py_UCS1 ch = data[i];
12101                if (!_Py_ascii_whitespace[ch])
12102                    break;
12103                i++;
12104            }
12105        }
12106
12107        j = len;
12108        if (striptype != LEFTSTRIP) {
12109            j--;
12110            while (j >= i) {
12111                Py_UCS1 ch = data[j];
12112                if (!_Py_ascii_whitespace[ch])
12113                    break;
12114                j--;
12115            }
12116            j++;
12117        }
12118    }
12119    else {
12120        int kind = PyUnicode_KIND(self);
12121        void *data = PyUnicode_DATA(self);
12122
12123        i = 0;
12124        if (striptype != RIGHTSTRIP) {
12125            while (i < len) {
12126                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12127                if (!Py_UNICODE_ISSPACE(ch))
12128                    break;
12129                i++;
12130            }
12131        }
12132
12133        j = len;
12134        if (striptype != LEFTSTRIP) {
12135            j--;
12136            while (j >= i) {
12137                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12138                if (!Py_UNICODE_ISSPACE(ch))
12139                    break;
12140                j--;
12141            }
12142            j++;
12143        }
12144    }
12145
12146    return PyUnicode_Substring(self, i, j);
12147}
12148
12149
12150static PyObject *
12151do_argstrip(PyObject *self, int striptype, PyObject *args)
12152{
12153    PyObject *sep = NULL;
12154
12155    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12156        return NULL;
12157
12158    if (sep != NULL && sep != Py_None) {
12159        if (PyUnicode_Check(sep))
12160            return _PyUnicode_XStrip(self, striptype, sep);
12161        else {
12162            PyErr_Format(PyExc_TypeError,
12163                         "%s arg must be None or str",
12164                         STRIPNAME(striptype));
12165            return NULL;
12166        }
12167    }
12168
12169    return do_strip(self, striptype);
12170}
12171
12172
12173PyDoc_STRVAR(strip__doc__,
12174             "S.strip([chars]) -> str\n\
12175\n\
12176Return a copy of the string S with leading and trailing\n\
12177whitespace removed.\n\
12178If chars is given and not None, remove characters in chars instead.");
12179
12180static PyObject *
12181unicode_strip(PyObject *self, PyObject *args)
12182{
12183    if (PyTuple_GET_SIZE(args) == 0)
12184        return do_strip(self, BOTHSTRIP); /* Common case */
12185    else
12186        return do_argstrip(self, BOTHSTRIP, args);
12187}
12188
12189
12190PyDoc_STRVAR(lstrip__doc__,
12191             "S.lstrip([chars]) -> str\n\
12192\n\
12193Return a copy of the string S with leading whitespace removed.\n\
12194If chars is given and not None, remove characters in chars instead.");
12195
12196static PyObject *
12197unicode_lstrip(PyObject *self, PyObject *args)
12198{
12199    if (PyTuple_GET_SIZE(args) == 0)
12200        return do_strip(self, LEFTSTRIP); /* Common case */
12201    else
12202        return do_argstrip(self, LEFTSTRIP, args);
12203}
12204
12205
12206PyDoc_STRVAR(rstrip__doc__,
12207             "S.rstrip([chars]) -> str\n\
12208\n\
12209Return a copy of the string S with trailing whitespace removed.\n\
12210If chars is given and not None, remove characters in chars instead.");
12211
12212static PyObject *
12213unicode_rstrip(PyObject *self, PyObject *args)
12214{
12215    if (PyTuple_GET_SIZE(args) == 0)
12216        return do_strip(self, RIGHTSTRIP); /* Common case */
12217    else
12218        return do_argstrip(self, RIGHTSTRIP, args);
12219}
12220
12221
12222static PyObject*
12223unicode_repeat(PyObject *str, Py_ssize_t len)
12224{
12225    PyObject *u;
12226    Py_ssize_t nchars, n;
12227
12228    if (len < 1)
12229        _Py_RETURN_UNICODE_EMPTY();
12230
12231    /* no repeat, return original string */
12232    if (len == 1)
12233        return unicode_result_unchanged(str);
12234
12235    if (PyUnicode_READY(str) == -1)
12236        return NULL;
12237
12238    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12239        PyErr_SetString(PyExc_OverflowError,
12240                        "repeated string is too long");
12241        return NULL;
12242    }
12243    nchars = len * PyUnicode_GET_LENGTH(str);
12244
12245    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12246    if (!u)
12247        return NULL;
12248    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12249
12250    if (PyUnicode_GET_LENGTH(str) == 1) {
12251        const int kind = PyUnicode_KIND(str);
12252        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12253        if (kind == PyUnicode_1BYTE_KIND) {
12254            void *to = PyUnicode_DATA(u);
12255            memset(to, (unsigned char)fill_char, len);
12256        }
12257        else if (kind == PyUnicode_2BYTE_KIND) {
12258            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12259            for (n = 0; n < len; ++n)
12260                ucs2[n] = fill_char;
12261        } else {
12262            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12263            assert(kind == PyUnicode_4BYTE_KIND);
12264            for (n = 0; n < len; ++n)
12265                ucs4[n] = fill_char;
12266        }
12267    }
12268    else {
12269        /* number of characters copied this far */
12270        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12271        const Py_ssize_t char_size = PyUnicode_KIND(str);
12272        char *to = (char *) PyUnicode_DATA(u);
12273        Py_MEMCPY(to, PyUnicode_DATA(str),
12274                  PyUnicode_GET_LENGTH(str) * char_size);
12275        while (done < nchars) {
12276            n = (done <= nchars-done) ? done : nchars-done;
12277            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12278            done += n;
12279        }
12280    }
12281
12282    assert(_PyUnicode_CheckConsistency(u, 1));
12283    return u;
12284}
12285
12286PyObject *
12287PyUnicode_Replace(PyObject *obj,
12288                  PyObject *subobj,
12289                  PyObject *replobj,
12290                  Py_ssize_t maxcount)
12291{
12292    PyObject *self;
12293    PyObject *str1;
12294    PyObject *str2;
12295    PyObject *result;
12296
12297    self = PyUnicode_FromObject(obj);
12298    if (self == NULL)
12299        return NULL;
12300    str1 = PyUnicode_FromObject(subobj);
12301    if (str1 == NULL) {
12302        Py_DECREF(self);
12303        return NULL;
12304    }
12305    str2 = PyUnicode_FromObject(replobj);
12306    if (str2 == NULL) {
12307        Py_DECREF(self);
12308        Py_DECREF(str1);
12309        return NULL;
12310    }
12311    if (PyUnicode_READY(self) == -1 ||
12312        PyUnicode_READY(str1) == -1 ||
12313        PyUnicode_READY(str2) == -1)
12314        result = NULL;
12315    else
12316        result = replace(self, str1, str2, maxcount);
12317    Py_DECREF(self);
12318    Py_DECREF(str1);
12319    Py_DECREF(str2);
12320    return result;
12321}
12322
12323PyDoc_STRVAR(replace__doc__,
12324             "S.replace(old, new[, count]) -> str\n\
12325\n\
12326Return a copy of S with all occurrences of substring\n\
12327old replaced by new.  If the optional argument count is\n\
12328given, only the first count occurrences are replaced.");
12329
12330static PyObject*
12331unicode_replace(PyObject *self, PyObject *args)
12332{
12333    PyObject *str1;
12334    PyObject *str2;
12335    Py_ssize_t maxcount = -1;
12336    PyObject *result;
12337
12338    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12339        return NULL;
12340    if (PyUnicode_READY(self) == -1)
12341        return NULL;
12342    str1 = PyUnicode_FromObject(str1);
12343    if (str1 == NULL)
12344        return NULL;
12345    str2 = PyUnicode_FromObject(str2);
12346    if (str2 == NULL) {
12347        Py_DECREF(str1);
12348        return NULL;
12349    }
12350    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12351        result = NULL;
12352    else
12353        result = replace(self, str1, str2, maxcount);
12354
12355    Py_DECREF(str1);
12356    Py_DECREF(str2);
12357    return result;
12358}
12359
12360static PyObject *
12361unicode_repr(PyObject *unicode)
12362{
12363    PyObject *repr;
12364    Py_ssize_t isize;
12365    Py_ssize_t osize, squote, dquote, i, o;
12366    Py_UCS4 max, quote;
12367    int ikind, okind, unchanged;
12368    void *idata, *odata;
12369
12370    if (PyUnicode_READY(unicode) == -1)
12371        return NULL;
12372
12373    isize = PyUnicode_GET_LENGTH(unicode);
12374    idata = PyUnicode_DATA(unicode);
12375
12376    /* Compute length of output, quote characters, and
12377       maximum character */
12378    osize = 0;
12379    max = 127;
12380    squote = dquote = 0;
12381    ikind = PyUnicode_KIND(unicode);
12382    for (i = 0; i < isize; i++) {
12383        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12384        Py_ssize_t incr = 1;
12385        switch (ch) {
12386        case '\'': squote++; break;
12387        case '"':  dquote++; break;
12388        case '\\': case '\t': case '\r': case '\n':
12389            incr = 2;
12390            break;
12391        default:
12392            /* Fast-path ASCII */
12393            if (ch < ' ' || ch == 0x7f)
12394                incr = 4; /* \xHH */
12395            else if (ch < 0x7f)
12396                ;
12397            else if (Py_UNICODE_ISPRINTABLE(ch))
12398                max = ch > max ? ch : max;
12399            else if (ch < 0x100)
12400                incr = 4; /* \xHH */
12401            else if (ch < 0x10000)
12402                incr = 6; /* \uHHHH */
12403            else
12404                incr = 10; /* \uHHHHHHHH */
12405        }
12406        if (osize > PY_SSIZE_T_MAX - incr) {
12407            PyErr_SetString(PyExc_OverflowError,
12408                            "string is too long to generate repr");
12409            return NULL;
12410        }
12411        osize += incr;
12412    }
12413
12414    quote = '\'';
12415    unchanged = (osize == isize);
12416    if (squote) {
12417        unchanged = 0;
12418        if (dquote)
12419            /* Both squote and dquote present. Use squote,
12420               and escape them */
12421            osize += squote;
12422        else
12423            quote = '"';
12424    }
12425    osize += 2;   /* quotes */
12426
12427    repr = PyUnicode_New(osize, max);
12428    if (repr == NULL)
12429        return NULL;
12430    okind = PyUnicode_KIND(repr);
12431    odata = PyUnicode_DATA(repr);
12432
12433    PyUnicode_WRITE(okind, odata, 0, quote);
12434    PyUnicode_WRITE(okind, odata, osize-1, quote);
12435    if (unchanged) {
12436        _PyUnicode_FastCopyCharacters(repr, 1,
12437                                      unicode, 0,
12438                                      isize);
12439    }
12440    else {
12441        for (i = 0, o = 1; i < isize; i++) {
12442            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12443
12444            /* Escape quotes and backslashes */
12445            if ((ch == quote) || (ch == '\\')) {
12446                PyUnicode_WRITE(okind, odata, o++, '\\');
12447                PyUnicode_WRITE(okind, odata, o++, ch);
12448                continue;
12449            }
12450
12451            /* Map special whitespace to '\t', \n', '\r' */
12452            if (ch == '\t') {
12453                PyUnicode_WRITE(okind, odata, o++, '\\');
12454                PyUnicode_WRITE(okind, odata, o++, 't');
12455            }
12456            else if (ch == '\n') {
12457                PyUnicode_WRITE(okind, odata, o++, '\\');
12458                PyUnicode_WRITE(okind, odata, o++, 'n');
12459            }
12460            else if (ch == '\r') {
12461                PyUnicode_WRITE(okind, odata, o++, '\\');
12462                PyUnicode_WRITE(okind, odata, o++, 'r');
12463            }
12464
12465            /* Map non-printable US ASCII to '\xhh' */
12466            else if (ch < ' ' || ch == 0x7F) {
12467                PyUnicode_WRITE(okind, odata, o++, '\\');
12468                PyUnicode_WRITE(okind, odata, o++, 'x');
12469                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12470                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12471            }
12472
12473            /* Copy ASCII characters as-is */
12474            else if (ch < 0x7F) {
12475                PyUnicode_WRITE(okind, odata, o++, ch);
12476            }
12477
12478            /* Non-ASCII characters */
12479            else {
12480                /* Map Unicode whitespace and control characters
12481                   (categories Z* and C* except ASCII space)
12482                */
12483                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12484                    PyUnicode_WRITE(okind, odata, o++, '\\');
12485                    /* Map 8-bit characters to '\xhh' */
12486                    if (ch <= 0xff) {
12487                        PyUnicode_WRITE(okind, odata, o++, 'x');
12488                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12489                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12490                    }
12491                    /* Map 16-bit characters to '\uxxxx' */
12492                    else if (ch <= 0xffff) {
12493                        PyUnicode_WRITE(okind, odata, o++, 'u');
12494                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12495                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12496                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12497                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12498                    }
12499                    /* Map 21-bit characters to '\U00xxxxxx' */
12500                    else {
12501                        PyUnicode_WRITE(okind, odata, o++, 'U');
12502                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12503                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12504                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12505                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12506                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12507                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12508                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12509                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12510                    }
12511                }
12512                /* Copy characters as-is */
12513                else {
12514                    PyUnicode_WRITE(okind, odata, o++, ch);
12515                }
12516            }
12517        }
12518    }
12519    /* Closing quote already added at the beginning */
12520    assert(_PyUnicode_CheckConsistency(repr, 1));
12521    return repr;
12522}
12523
12524PyDoc_STRVAR(rfind__doc__,
12525             "S.rfind(sub[, start[, end]]) -> int\n\
12526\n\
12527Return the highest index in S where substring sub is found,\n\
12528such that sub is contained within S[start:end].  Optional\n\
12529arguments start and end are interpreted as in slice notation.\n\
12530\n\
12531Return -1 on failure.");
12532
12533static PyObject *
12534unicode_rfind(PyObject *self, PyObject *args)
12535{
12536    /* initialize variables to prevent gcc warning */
12537    PyObject *substring = NULL;
12538    Py_ssize_t start = 0;
12539    Py_ssize_t end = 0;
12540    Py_ssize_t result;
12541
12542    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12543                                            &start, &end))
12544        return NULL;
12545
12546    if (PyUnicode_READY(self) == -1) {
12547        Py_DECREF(substring);
12548        return NULL;
12549    }
12550    if (PyUnicode_READY(substring) == -1) {
12551        Py_DECREF(substring);
12552        return NULL;
12553    }
12554
12555    result = any_find_slice(-1, self, substring, start, end);
12556
12557    Py_DECREF(substring);
12558
12559    if (result == -2)
12560        return NULL;
12561
12562    return PyLong_FromSsize_t(result);
12563}
12564
12565PyDoc_STRVAR(rindex__doc__,
12566             "S.rindex(sub[, start[, end]]) -> int\n\
12567\n\
12568Like S.rfind() but raise ValueError when the substring is not found.");
12569
12570static PyObject *
12571unicode_rindex(PyObject *self, PyObject *args)
12572{
12573    /* initialize variables to prevent gcc warning */
12574    PyObject *substring = NULL;
12575    Py_ssize_t start = 0;
12576    Py_ssize_t end = 0;
12577    Py_ssize_t result;
12578
12579    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12580                                            &start, &end))
12581        return NULL;
12582
12583    if (PyUnicode_READY(self) == -1) {
12584        Py_DECREF(substring);
12585        return NULL;
12586    }
12587    if (PyUnicode_READY(substring) == -1) {
12588        Py_DECREF(substring);
12589        return NULL;
12590    }
12591
12592    result = any_find_slice(-1, self, substring, start, end);
12593
12594    Py_DECREF(substring);
12595
12596    if (result == -2)
12597        return NULL;
12598
12599    if (result < 0) {
12600        PyErr_SetString(PyExc_ValueError, "substring not found");
12601        return NULL;
12602    }
12603
12604    return PyLong_FromSsize_t(result);
12605}
12606
12607PyDoc_STRVAR(rjust__doc__,
12608             "S.rjust(width[, fillchar]) -> str\n\
12609\n\
12610Return S right-justified in a string of length width. Padding is\n\
12611done using the specified fill character (default is a space).");
12612
12613static PyObject *
12614unicode_rjust(PyObject *self, PyObject *args)
12615{
12616    Py_ssize_t width;
12617    Py_UCS4 fillchar = ' ';
12618
12619    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12620        return NULL;
12621
12622    if (PyUnicode_READY(self) == -1)
12623        return NULL;
12624
12625    if (PyUnicode_GET_LENGTH(self) >= width)
12626        return unicode_result_unchanged(self);
12627
12628    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12629}
12630
12631PyObject *
12632PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12633{
12634    PyObject *result;
12635
12636    s = PyUnicode_FromObject(s);
12637    if (s == NULL)
12638        return NULL;
12639    if (sep != NULL) {
12640        sep = PyUnicode_FromObject(sep);
12641        if (sep == NULL) {
12642            Py_DECREF(s);
12643            return NULL;
12644        }
12645    }
12646
12647    result = split(s, sep, maxsplit);
12648
12649    Py_DECREF(s);
12650    Py_XDECREF(sep);
12651    return result;
12652}
12653
12654PyDoc_STRVAR(split__doc__,
12655             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12656\n\
12657Return a list of the words in S, using sep as the\n\
12658delimiter string.  If maxsplit is given, at most maxsplit\n\
12659splits are done. If sep is not specified or is None, any\n\
12660whitespace string is a separator and empty strings are\n\
12661removed from the result.");
12662
12663static PyObject*
12664unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12665{
12666    static char *kwlist[] = {"sep", "maxsplit", 0};
12667    PyObject *substring = Py_None;
12668    Py_ssize_t maxcount = -1;
12669
12670    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12671                                     kwlist, &substring, &maxcount))
12672        return NULL;
12673
12674    if (substring == Py_None)
12675        return split(self, NULL, maxcount);
12676    else if (PyUnicode_Check(substring))
12677        return split(self, substring, maxcount);
12678    else
12679        return PyUnicode_Split(self, substring, maxcount);
12680}
12681
12682PyObject *
12683PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12684{
12685    PyObject* str_obj;
12686    PyObject* sep_obj;
12687    PyObject* out;
12688    int kind1, kind2;
12689    void *buf1, *buf2;
12690    Py_ssize_t len1, len2;
12691
12692    str_obj = PyUnicode_FromObject(str_in);
12693    if (!str_obj)
12694        return NULL;
12695    sep_obj = PyUnicode_FromObject(sep_in);
12696    if (!sep_obj) {
12697        Py_DECREF(str_obj);
12698        return NULL;
12699    }
12700    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12701        Py_DECREF(sep_obj);
12702        Py_DECREF(str_obj);
12703        return NULL;
12704    }
12705
12706    kind1 = PyUnicode_KIND(str_obj);
12707    kind2 = PyUnicode_KIND(sep_obj);
12708    len1 = PyUnicode_GET_LENGTH(str_obj);
12709    len2 = PyUnicode_GET_LENGTH(sep_obj);
12710    if (kind1 < kind2 || len1 < len2) {
12711        _Py_INCREF_UNICODE_EMPTY();
12712        if (!unicode_empty)
12713            out = NULL;
12714        else {
12715            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12716            Py_DECREF(unicode_empty);
12717        }
12718        Py_DECREF(sep_obj);
12719        Py_DECREF(str_obj);
12720        return out;
12721    }
12722    buf1 = PyUnicode_DATA(str_obj);
12723    buf2 = PyUnicode_DATA(sep_obj);
12724    if (kind2 != kind1) {
12725        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12726        if (!buf2)
12727            goto onError;
12728    }
12729
12730    switch (kind1) {
12731    case PyUnicode_1BYTE_KIND:
12732        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12733            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12734        else
12735            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12736        break;
12737    case PyUnicode_2BYTE_KIND:
12738        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12739        break;
12740    case PyUnicode_4BYTE_KIND:
12741        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12742        break;
12743    default:
12744        assert(0);
12745        out = 0;
12746    }
12747
12748    Py_DECREF(sep_obj);
12749    Py_DECREF(str_obj);
12750    if (kind2 != kind1)
12751        PyMem_Free(buf2);
12752
12753    return out;
12754  onError:
12755    Py_DECREF(sep_obj);
12756    Py_DECREF(str_obj);
12757    if (kind2 != kind1 && buf2)
12758        PyMem_Free(buf2);
12759    return NULL;
12760}
12761
12762
12763PyObject *
12764PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12765{
12766    PyObject* str_obj;
12767    PyObject* sep_obj;
12768    PyObject* out;
12769    int kind1, kind2;
12770    void *buf1, *buf2;
12771    Py_ssize_t len1, len2;
12772
12773    str_obj = PyUnicode_FromObject(str_in);
12774    if (!str_obj)
12775        return NULL;
12776    sep_obj = PyUnicode_FromObject(sep_in);
12777    if (!sep_obj) {
12778        Py_DECREF(str_obj);
12779        return NULL;
12780    }
12781
12782    kind1 = PyUnicode_KIND(str_obj);
12783    kind2 = PyUnicode_KIND(sep_obj);
12784    len1 = PyUnicode_GET_LENGTH(str_obj);
12785    len2 = PyUnicode_GET_LENGTH(sep_obj);
12786    if (kind1 < kind2 || len1 < len2) {
12787        _Py_INCREF_UNICODE_EMPTY();
12788        if (!unicode_empty)
12789            out = NULL;
12790        else {
12791            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12792            Py_DECREF(unicode_empty);
12793        }
12794        Py_DECREF(sep_obj);
12795        Py_DECREF(str_obj);
12796        return out;
12797    }
12798    buf1 = PyUnicode_DATA(str_obj);
12799    buf2 = PyUnicode_DATA(sep_obj);
12800    if (kind2 != kind1) {
12801        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12802        if (!buf2)
12803            goto onError;
12804    }
12805
12806    switch (kind1) {
12807    case PyUnicode_1BYTE_KIND:
12808        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12809            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12810        else
12811            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12812        break;
12813    case PyUnicode_2BYTE_KIND:
12814        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12815        break;
12816    case PyUnicode_4BYTE_KIND:
12817        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12818        break;
12819    default:
12820        assert(0);
12821        out = 0;
12822    }
12823
12824    Py_DECREF(sep_obj);
12825    Py_DECREF(str_obj);
12826    if (kind2 != kind1)
12827        PyMem_Free(buf2);
12828
12829    return out;
12830  onError:
12831    Py_DECREF(sep_obj);
12832    Py_DECREF(str_obj);
12833    if (kind2 != kind1 && buf2)
12834        PyMem_Free(buf2);
12835    return NULL;
12836}
12837
12838PyDoc_STRVAR(partition__doc__,
12839             "S.partition(sep) -> (head, sep, tail)\n\
12840\n\
12841Search for the separator sep in S, and return the part before it,\n\
12842the separator itself, and the part after it.  If the separator is not\n\
12843found, return S and two empty strings.");
12844
12845static PyObject*
12846unicode_partition(PyObject *self, PyObject *separator)
12847{
12848    return PyUnicode_Partition(self, separator);
12849}
12850
12851PyDoc_STRVAR(rpartition__doc__,
12852             "S.rpartition(sep) -> (head, sep, tail)\n\
12853\n\
12854Search for the separator sep in S, starting at the end of S, and return\n\
12855the part before it, the separator itself, and the part after it.  If the\n\
12856separator is not found, return two empty strings and S.");
12857
12858static PyObject*
12859unicode_rpartition(PyObject *self, PyObject *separator)
12860{
12861    return PyUnicode_RPartition(self, separator);
12862}
12863
12864PyObject *
12865PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12866{
12867    PyObject *result;
12868
12869    s = PyUnicode_FromObject(s);
12870    if (s == NULL)
12871        return NULL;
12872    if (sep != NULL) {
12873        sep = PyUnicode_FromObject(sep);
12874        if (sep == NULL) {
12875            Py_DECREF(s);
12876            return NULL;
12877        }
12878    }
12879
12880    result = rsplit(s, sep, maxsplit);
12881
12882    Py_DECREF(s);
12883    Py_XDECREF(sep);
12884    return result;
12885}
12886
12887PyDoc_STRVAR(rsplit__doc__,
12888             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12889\n\
12890Return a list of the words in S, using sep as the\n\
12891delimiter string, starting at the end of the string and\n\
12892working to the front.  If maxsplit is given, at most maxsplit\n\
12893splits are done. If sep is not specified, any whitespace string\n\
12894is a separator.");
12895
12896static PyObject*
12897unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12898{
12899    static char *kwlist[] = {"sep", "maxsplit", 0};
12900    PyObject *substring = Py_None;
12901    Py_ssize_t maxcount = -1;
12902
12903    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12904                                     kwlist, &substring, &maxcount))
12905        return NULL;
12906
12907    if (substring == Py_None)
12908        return rsplit(self, NULL, maxcount);
12909    else if (PyUnicode_Check(substring))
12910        return rsplit(self, substring, maxcount);
12911    else
12912        return PyUnicode_RSplit(self, substring, maxcount);
12913}
12914
12915PyDoc_STRVAR(splitlines__doc__,
12916             "S.splitlines([keepends]) -> list of strings\n\
12917\n\
12918Return a list of the lines in S, breaking at line boundaries.\n\
12919Line breaks are not included in the resulting list unless keepends\n\
12920is given and true.");
12921
12922static PyObject*
12923unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12924{
12925    static char *kwlist[] = {"keepends", 0};
12926    int keepends = 0;
12927
12928    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12929                                     kwlist, &keepends))
12930        return NULL;
12931
12932    return PyUnicode_Splitlines(self, keepends);
12933}
12934
12935static
12936PyObject *unicode_str(PyObject *self)
12937{
12938    return unicode_result_unchanged(self);
12939}
12940
12941PyDoc_STRVAR(swapcase__doc__,
12942             "S.swapcase() -> str\n\
12943\n\
12944Return a copy of S with uppercase characters converted to lowercase\n\
12945and vice versa.");
12946
12947static PyObject*
12948unicode_swapcase(PyObject *self)
12949{
12950    if (PyUnicode_READY(self) == -1)
12951        return NULL;
12952    return case_operation(self, do_swapcase);
12953}
12954
12955/*[clinic input]
12956
12957@staticmethod
12958str.maketrans as unicode_maketrans
12959
12960  x: object
12961
12962  y: unicode=NULL
12963
12964  z: unicode=NULL
12965
12966  /
12967
12968Return a translation table usable for str.translate().
12969
12970If there is only one argument, it must be a dictionary mapping Unicode
12971ordinals (integers) or characters to Unicode ordinals, strings or None.
12972Character keys will be then converted to ordinals.
12973If there are two arguments, they must be strings of equal length, and
12974in the resulting dictionary, each character in x will be mapped to the
12975character at the same position in y. If there is a third argument, it
12976must be a string, whose characters will be mapped to None in the result.
12977[clinic start generated code]*/
12978
12979static PyObject *
12980unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12981/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12982{
12983    PyObject *new = NULL, *key, *value;
12984    Py_ssize_t i = 0;
12985    int res;
12986
12987    new = PyDict_New();
12988    if (!new)
12989        return NULL;
12990    if (y != NULL) {
12991        int x_kind, y_kind, z_kind;
12992        void *x_data, *y_data, *z_data;
12993
12994        /* x must be a string too, of equal length */
12995        if (!PyUnicode_Check(x)) {
12996            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12997                            "be a string if there is a second argument");
12998            goto err;
12999        }
13000        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13001            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13002                            "arguments must have equal length");
13003            goto err;
13004        }
13005        /* create entries for translating chars in x to those in y */
13006        x_kind = PyUnicode_KIND(x);
13007        y_kind = PyUnicode_KIND(y);
13008        x_data = PyUnicode_DATA(x);
13009        y_data = PyUnicode_DATA(y);
13010        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13011            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13012            if (!key)
13013                goto err;
13014            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13015            if (!value) {
13016                Py_DECREF(key);
13017                goto err;
13018            }
13019            res = PyDict_SetItem(new, key, value);
13020            Py_DECREF(key);
13021            Py_DECREF(value);
13022            if (res < 0)
13023                goto err;
13024        }
13025        /* create entries for deleting chars in z */
13026        if (z != NULL) {
13027            z_kind = PyUnicode_KIND(z);
13028            z_data = PyUnicode_DATA(z);
13029            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13030                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13031                if (!key)
13032                    goto err;
13033                res = PyDict_SetItem(new, key, Py_None);
13034                Py_DECREF(key);
13035                if (res < 0)
13036                    goto err;
13037            }
13038        }
13039    } else {
13040        int kind;
13041        void *data;
13042
13043        /* x must be a dict */
13044        if (!PyDict_CheckExact(x)) {
13045            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13046                            "to maketrans it must be a dict");
13047            goto err;
13048        }
13049        /* copy entries into the new dict, converting string keys to int keys */
13050        while (PyDict_Next(x, &i, &key, &value)) {
13051            if (PyUnicode_Check(key)) {
13052                /* convert string keys to integer keys */
13053                PyObject *newkey;
13054                if (PyUnicode_GET_LENGTH(key) != 1) {
13055                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13056                                    "table must be of length 1");
13057                    goto err;
13058                }
13059                kind = PyUnicode_KIND(key);
13060                data = PyUnicode_DATA(key);
13061                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13062                if (!newkey)
13063                    goto err;
13064                res = PyDict_SetItem(new, newkey, value);
13065                Py_DECREF(newkey);
13066                if (res < 0)
13067                    goto err;
13068            } else if (PyLong_Check(key)) {
13069                /* just keep integer keys */
13070                if (PyDict_SetItem(new, key, value) < 0)
13071                    goto err;
13072            } else {
13073                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13074                                "be strings or integers");
13075                goto err;
13076            }
13077        }
13078    }
13079    return new;
13080  err:
13081    Py_DECREF(new);
13082    return NULL;
13083}
13084
13085PyDoc_STRVAR(translate__doc__,
13086             "S.translate(table) -> str\n\
13087\n\
13088Return a copy of the string S in which each character has been mapped\n\
13089through the given translation table. The table must implement\n\
13090lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13091mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13092this operation raises LookupError, the character is left untouched.\n\
13093Characters mapped to None are deleted.");
13094
13095static PyObject*
13096unicode_translate(PyObject *self, PyObject *table)
13097{
13098    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13099}
13100
13101PyDoc_STRVAR(upper__doc__,
13102             "S.upper() -> str\n\
13103\n\
13104Return a copy of S converted to uppercase.");
13105
13106static PyObject*
13107unicode_upper(PyObject *self)
13108{
13109    if (PyUnicode_READY(self) == -1)
13110        return NULL;
13111    if (PyUnicode_IS_ASCII(self))
13112        return ascii_upper_or_lower(self, 0);
13113    return case_operation(self, do_upper);
13114}
13115
13116PyDoc_STRVAR(zfill__doc__,
13117             "S.zfill(width) -> str\n\
13118\n\
13119Pad a numeric string S with zeros on the left, to fill a field\n\
13120of the specified width. The string S is never truncated.");
13121
13122static PyObject *
13123unicode_zfill(PyObject *self, PyObject *args)
13124{
13125    Py_ssize_t fill;
13126    PyObject *u;
13127    Py_ssize_t width;
13128    int kind;
13129    void *data;
13130    Py_UCS4 chr;
13131
13132    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13133        return NULL;
13134
13135    if (PyUnicode_READY(self) == -1)
13136        return NULL;
13137
13138    if (PyUnicode_GET_LENGTH(self) >= width)
13139        return unicode_result_unchanged(self);
13140
13141    fill = width - PyUnicode_GET_LENGTH(self);
13142
13143    u = pad(self, fill, 0, '0');
13144
13145    if (u == NULL)
13146        return NULL;
13147
13148    kind = PyUnicode_KIND(u);
13149    data = PyUnicode_DATA(u);
13150    chr = PyUnicode_READ(kind, data, fill);
13151
13152    if (chr == '+' || chr == '-') {
13153        /* move sign to beginning of string */
13154        PyUnicode_WRITE(kind, data, 0, chr);
13155        PyUnicode_WRITE(kind, data, fill, '0');
13156    }
13157
13158    assert(_PyUnicode_CheckConsistency(u, 1));
13159    return u;
13160}
13161
/* Disabled: everything between "#if 0" and "#endif" is compiled out.
   When enabled, this is a thin wrapper that forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13169
13170PyDoc_STRVAR(startswith__doc__,
13171             "S.startswith(prefix[, start[, end]]) -> bool\n\
13172\n\
13173Return True if S starts with the specified prefix, False otherwise.\n\
13174With optional start, test S beginning at that position.\n\
13175With optional end, stop comparing S at that position.\n\
13176prefix can also be a tuple of strings to try.");
13177
13178static PyObject *
13179unicode_startswith(PyObject *self,
13180                   PyObject *args)
13181{
13182    PyObject *subobj;
13183    PyObject *substring;
13184    Py_ssize_t start = 0;
13185    Py_ssize_t end = PY_SSIZE_T_MAX;
13186    int result;
13187
13188    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13189        return NULL;
13190    if (PyTuple_Check(subobj)) {
13191        Py_ssize_t i;
13192        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13193            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13194            if (substring == NULL)
13195                return NULL;
13196            result = tailmatch(self, substring, start, end, -1);
13197            Py_DECREF(substring);
13198            if (result == -1)
13199                return NULL;
13200            if (result) {
13201                Py_RETURN_TRUE;
13202            }
13203        }
13204        /* nothing matched */
13205        Py_RETURN_FALSE;
13206    }
13207    substring = PyUnicode_FromObject(subobj);
13208    if (substring == NULL) {
13209        if (PyErr_ExceptionMatches(PyExc_TypeError))
13210            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13211                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13212        return NULL;
13213    }
13214    result = tailmatch(self, substring, start, end, -1);
13215    Py_DECREF(substring);
13216    if (result == -1)
13217        return NULL;
13218    return PyBool_FromLong(result);
13219}
13220
13221
13222PyDoc_STRVAR(endswith__doc__,
13223             "S.endswith(suffix[, start[, end]]) -> bool\n\
13224\n\
13225Return True if S ends with the specified suffix, False otherwise.\n\
13226With optional start, test S beginning at that position.\n\
13227With optional end, stop comparing S at that position.\n\
13228suffix can also be a tuple of strings to try.");
13229
13230static PyObject *
13231unicode_endswith(PyObject *self,
13232                 PyObject *args)
13233{
13234    PyObject *subobj;
13235    PyObject *substring;
13236    Py_ssize_t start = 0;
13237    Py_ssize_t end = PY_SSIZE_T_MAX;
13238    int result;
13239
13240    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13241        return NULL;
13242    if (PyTuple_Check(subobj)) {
13243        Py_ssize_t i;
13244        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13245            substring = PyUnicode_FromObject(
13246                PyTuple_GET_ITEM(subobj, i));
13247            if (substring == NULL)
13248                return NULL;
13249            result = tailmatch(self, substring, start, end, +1);
13250            Py_DECREF(substring);
13251            if (result == -1)
13252                return NULL;
13253            if (result) {
13254                Py_RETURN_TRUE;
13255            }
13256        }
13257        Py_RETURN_FALSE;
13258    }
13259    substring = PyUnicode_FromObject(subobj);
13260    if (substring == NULL) {
13261        if (PyErr_ExceptionMatches(PyExc_TypeError))
13262            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13263                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13264        return NULL;
13265    }
13266    result = tailmatch(self, substring, start, end, +1);
13267    Py_DECREF(substring);
13268    if (result == -1)
13269        return NULL;
13270    return PyBool_FromLong(result);
13271}
13272
13273Py_LOCAL_INLINE(void)
13274_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13275{
13276    if (!writer->readonly)
13277        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13278    else {
13279        /* Copy-on-write mode: set buffer size to 0 so
13280         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13281         * next write. */
13282        writer->size = 0;
13283    }
13284    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13285    writer->data = PyUnicode_DATA(writer->buffer);
13286    writer->kind = PyUnicode_KIND(writer->buffer);
13287}
13288
13289void
13290_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13291{
13292    memset(writer, 0, sizeof(*writer));
13293#ifdef Py_DEBUG
13294    writer->kind = 5;    /* invalid kind */
13295#endif
13296    writer->min_char = 127;
13297}
13298
13299int
13300_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13301                                 Py_ssize_t length, Py_UCS4 maxchar)
13302{
13303#ifdef MS_WINDOWS
13304   /* On Windows, overallocate by 50% is the best factor */
13305#  define OVERALLOCATE_FACTOR 2
13306#else
13307   /* On Linux, overallocate by 25% is the best factor */
13308#  define OVERALLOCATE_FACTOR 4
13309#endif
13310    Py_ssize_t newlen;
13311    PyObject *newbuffer;
13312
13313    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13314    assert((maxchar > writer->maxchar && length >= 0)
13315           || length > 0);
13316
13317    if (length > PY_SSIZE_T_MAX - writer->pos) {
13318        PyErr_NoMemory();
13319        return -1;
13320    }
13321    newlen = writer->pos + length;
13322
13323    maxchar = Py_MAX(maxchar, writer->min_char);
13324
13325    if (writer->buffer == NULL) {
13326        assert(!writer->readonly);
13327        if (writer->overallocate
13328            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13329            /* overallocate to limit the number of realloc() */
13330            newlen += newlen / OVERALLOCATE_FACTOR;
13331        }
13332        if (newlen < writer->min_length)
13333            newlen = writer->min_length;
13334
13335        writer->buffer = PyUnicode_New(newlen, maxchar);
13336        if (writer->buffer == NULL)
13337            return -1;
13338    }
13339    else if (newlen > writer->size) {
13340        if (writer->overallocate
13341            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13342            /* overallocate to limit the number of realloc() */
13343            newlen += newlen / OVERALLOCATE_FACTOR;
13344        }
13345        if (newlen < writer->min_length)
13346            newlen = writer->min_length;
13347
13348        if (maxchar > writer->maxchar || writer->readonly) {
13349            /* resize + widen */
13350            newbuffer = PyUnicode_New(newlen, maxchar);
13351            if (newbuffer == NULL)
13352                return -1;
13353            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13354                                          writer->buffer, 0, writer->pos);
13355            Py_DECREF(writer->buffer);
13356            writer->readonly = 0;
13357        }
13358        else {
13359            newbuffer = resize_compact(writer->buffer, newlen);
13360            if (newbuffer == NULL)
13361                return -1;
13362        }
13363        writer->buffer = newbuffer;
13364    }
13365    else if (maxchar > writer->maxchar) {
13366        assert(!writer->readonly);
13367        newbuffer = PyUnicode_New(writer->size, maxchar);
13368        if (newbuffer == NULL)
13369            return -1;
13370        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13371                                      writer->buffer, 0, writer->pos);
13372        Py_DECREF(writer->buffer);
13373        writer->buffer = newbuffer;
13374    }
13375    _PyUnicodeWriter_Update(writer);
13376    return 0;
13377
13378#undef OVERALLOCATE_FACTOR
13379}
13380
13381int
13382_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13383                                     enum PyUnicode_Kind kind)
13384{
13385    Py_UCS4 maxchar;
13386
13387    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13388    assert(writer->kind < kind);
13389
13390    switch (kind)
13391    {
13392    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13393    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13394    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13395    default:
13396        assert(0 && "invalid kind");
13397        return -1;
13398    }
13399
13400    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13401}
13402
13403Py_LOCAL_INLINE(int)
13404_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13405{
13406    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13407        return -1;
13408    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13409    writer->pos++;
13410    return 0;
13411}
13412
13413int
13414_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13415{
13416    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13417}
13418
13419int
13420_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13421{
13422    Py_UCS4 maxchar;
13423    Py_ssize_t len;
13424
13425    if (PyUnicode_READY(str) == -1)
13426        return -1;
13427    len = PyUnicode_GET_LENGTH(str);
13428    if (len == 0)
13429        return 0;
13430    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13431    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13432        if (writer->buffer == NULL && !writer->overallocate) {
13433            assert(_PyUnicode_CheckConsistency(str, 1));
13434            writer->readonly = 1;
13435            Py_INCREF(str);
13436            writer->buffer = str;
13437            _PyUnicodeWriter_Update(writer);
13438            writer->pos += len;
13439            return 0;
13440        }
13441        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13442            return -1;
13443    }
13444    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13445                                  str, 0, len);
13446    writer->pos += len;
13447    return 0;
13448}
13449
13450int
13451_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13452                                Py_ssize_t start, Py_ssize_t end)
13453{
13454    Py_UCS4 maxchar;
13455    Py_ssize_t len;
13456
13457    if (PyUnicode_READY(str) == -1)
13458        return -1;
13459
13460    assert(0 <= start);
13461    assert(end <= PyUnicode_GET_LENGTH(str));
13462    assert(start <= end);
13463
13464    if (end == 0)
13465        return 0;
13466
13467    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13468        return _PyUnicodeWriter_WriteStr(writer, str);
13469
13470    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13471        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13472    else
13473        maxchar = writer->maxchar;
13474    len = end - start;
13475
13476    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13477        return -1;
13478
13479    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13480                                  str, start, len);
13481    writer->pos += len;
13482    return 0;
13483}
13484
13485int
13486_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13487                                  const char *ascii, Py_ssize_t len)
13488{
13489    if (len == -1)
13490        len = strlen(ascii);
13491
13492    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13493
13494    if (writer->buffer == NULL && !writer->overallocate) {
13495        PyObject *str;
13496
13497        str = _PyUnicode_FromASCII(ascii, len);
13498        if (str == NULL)
13499            return -1;
13500
13501        writer->readonly = 1;
13502        writer->buffer = str;
13503        _PyUnicodeWriter_Update(writer);
13504        writer->pos += len;
13505        return 0;
13506    }
13507
13508    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13509        return -1;
13510
13511    switch (writer->kind)
13512    {
13513    case PyUnicode_1BYTE_KIND:
13514    {
13515        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13516        Py_UCS1 *data = writer->data;
13517
13518        Py_MEMCPY(data + writer->pos, str, len);
13519        break;
13520    }
13521    case PyUnicode_2BYTE_KIND:
13522    {
13523        _PyUnicode_CONVERT_BYTES(
13524            Py_UCS1, Py_UCS2,
13525            ascii, ascii + len,
13526            (Py_UCS2 *)writer->data + writer->pos);
13527        break;
13528    }
13529    case PyUnicode_4BYTE_KIND:
13530    {
13531        _PyUnicode_CONVERT_BYTES(
13532            Py_UCS1, Py_UCS4,
13533            ascii, ascii + len,
13534            (Py_UCS4 *)writer->data + writer->pos);
13535        break;
13536    }
13537    default:
13538        assert(0);
13539    }
13540
13541    writer->pos += len;
13542    return 0;
13543}
13544
13545int
13546_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13547                                   const char *str, Py_ssize_t len)
13548{
13549    Py_UCS4 maxchar;
13550
13551    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13552    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13553        return -1;
13554    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13555    writer->pos += len;
13556    return 0;
13557}
13558
13559PyObject *
13560_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13561{
13562    PyObject *str;
13563    if (writer->pos == 0) {
13564        Py_CLEAR(writer->buffer);
13565        _Py_RETURN_UNICODE_EMPTY();
13566    }
13567    if (writer->readonly) {
13568        str = writer->buffer;
13569        writer->buffer = NULL;
13570        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13571        return str;
13572    }
13573    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13574        PyObject *newbuffer;
13575        newbuffer = resize_compact(writer->buffer, writer->pos);
13576        if (newbuffer == NULL) {
13577            Py_CLEAR(writer->buffer);
13578            return NULL;
13579        }
13580        writer->buffer = newbuffer;
13581    }
13582    str = writer->buffer;
13583    writer->buffer = NULL;
13584    assert(_PyUnicode_CheckConsistency(str, 1));
13585    return unicode_result_ready(str);
13586}
13587
13588void
13589_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13590{
13591    Py_CLEAR(writer->buffer);
13592}
13593
13594#include "stringlib/unicode_format.h"
13595
13596PyDoc_STRVAR(format__doc__,
13597             "S.format(*args, **kwargs) -> str\n\
13598\n\
13599Return a formatted version of S, using substitutions from args and kwargs.\n\
13600The substitutions are identified by braces ('{' and '}').");
13601
13602PyDoc_STRVAR(format_map__doc__,
13603             "S.format_map(mapping) -> str\n\
13604\n\
13605Return a formatted version of S, using substitutions from mapping.\n\
13606The substitutions are identified by braces ('{' and '}').");
13607
13608static PyObject *
13609unicode__format__(PyObject* self, PyObject* args)
13610{
13611    PyObject *format_spec;
13612    _PyUnicodeWriter writer;
13613    int ret;
13614
13615    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13616        return NULL;
13617
13618    if (PyUnicode_READY(self) == -1)
13619        return NULL;
13620    _PyUnicodeWriter_Init(&writer);
13621    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13622                                          self, format_spec, 0,
13623                                          PyUnicode_GET_LENGTH(format_spec));
13624    if (ret == -1) {
13625        _PyUnicodeWriter_Dealloc(&writer);
13626        return NULL;
13627    }
13628    return _PyUnicodeWriter_Finish(&writer);
13629}
13630
13631PyDoc_STRVAR(p_format__doc__,
13632             "S.__format__(format_spec) -> str\n\
13633\n\
13634Return a formatted version of S as described by format_spec.");
13635
13636static PyObject *
13637unicode__sizeof__(PyObject *v)
13638{
13639    Py_ssize_t size;
13640
13641    /* If it's a compact object, account for base structure +
13642       character data. */
13643    if (PyUnicode_IS_COMPACT_ASCII(v))
13644        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13645    else if (PyUnicode_IS_COMPACT(v))
13646        size = sizeof(PyCompactUnicodeObject) +
13647            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13648    else {
13649        /* If it is a two-block object, account for base object, and
13650           for character block if present. */
13651        size = sizeof(PyUnicodeObject);
13652        if (_PyUnicode_DATA_ANY(v))
13653            size += (PyUnicode_GET_LENGTH(v) + 1) *
13654                PyUnicode_KIND(v);
13655    }
13656    /* If the wstr pointer is present, account for it unless it is shared
13657       with the data pointer. Check if the data is not shared. */
13658    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13659        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13660    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13661        size += PyUnicode_UTF8_LENGTH(v) + 1;
13662
13663    return PyLong_FromSsize_t(size);
13664}
13665
13666PyDoc_STRVAR(sizeof__doc__,
13667             "S.__sizeof__() -> size of S in memory, in bytes");
13668
13669static PyObject *
13670unicode_getnewargs(PyObject *v)
13671{
13672    PyObject *copy = _PyUnicode_Copy(v);
13673    if (!copy)
13674        return NULL;
13675    return Py_BuildValue("(N)", copy);
13676}
13677
13678static PyMethodDef unicode_methods[] = {
13679    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13680    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13681    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13682    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13683    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13684    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13685    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13686    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13687    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13688    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13689    {"expandtabs", (PyCFunction) unicode_expandtabs,
13690     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13691    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13692    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13693    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13694    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13695    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13696    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13697    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13698    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13699    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13700    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13701    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13702    {"splitlines", (PyCFunction) unicode_splitlines,
13703     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13704    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13705    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13706    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13707    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13708    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13709    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13710    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13711    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13712    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13713    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13714    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13715    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13716    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13717    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13718    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13719    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13720    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13721    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13722    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13723    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13724    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13725    UNICODE_MAKETRANS_METHODDEF
13726    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13727#if 0
13728    /* These methods are just used for debugging the implementation. */
13729    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13730#endif
13731
13732    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13733    {NULL, NULL}
13734};
13735
13736static PyObject *
13737unicode_mod(PyObject *v, PyObject *w)
13738{
13739    if (!PyUnicode_Check(v))
13740        Py_RETURN_NOTIMPLEMENTED;
13741    return PyUnicode_Format(v, w);
13742}
13743
13744static PyNumberMethods unicode_as_number = {
13745    0,              /*nb_add*/
13746    0,              /*nb_subtract*/
13747    0,              /*nb_multiply*/
13748    unicode_mod,            /*nb_remainder*/
13749};
13750
13751static PySequenceMethods unicode_as_sequence = {
13752    (lenfunc) unicode_length,       /* sq_length */
13753    PyUnicode_Concat,           /* sq_concat */
13754    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13755    (ssizeargfunc) unicode_getitem,     /* sq_item */
13756    0,                  /* sq_slice */
13757    0,                  /* sq_ass_item */
13758    0,                  /* sq_ass_slice */
13759    PyUnicode_Contains,         /* sq_contains */
13760};
13761
13762static PyObject*
13763unicode_subscript(PyObject* self, PyObject* item)
13764{
13765    if (PyUnicode_READY(self) == -1)
13766        return NULL;
13767
13768    if (PyIndex_Check(item)) {
13769        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13770        if (i == -1 && PyErr_Occurred())
13771            return NULL;
13772        if (i < 0)
13773            i += PyUnicode_GET_LENGTH(self);
13774        return unicode_getitem(self, i);
13775    } else if (PySlice_Check(item)) {
13776        Py_ssize_t start, stop, step, slicelength, cur, i;
13777        PyObject *result;
13778        void *src_data, *dest_data;
13779        int src_kind, dest_kind;
13780        Py_UCS4 ch, max_char, kind_limit;
13781
13782        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13783                                 &start, &stop, &step, &slicelength) < 0) {
13784            return NULL;
13785        }
13786
13787        if (slicelength <= 0) {
13788            _Py_RETURN_UNICODE_EMPTY();
13789        } else if (start == 0 && step == 1 &&
13790                   slicelength == PyUnicode_GET_LENGTH(self)) {
13791            return unicode_result_unchanged(self);
13792        } else if (step == 1) {
13793            return PyUnicode_Substring(self,
13794                                       start, start + slicelength);
13795        }
13796        /* General case */
13797        src_kind = PyUnicode_KIND(self);
13798        src_data = PyUnicode_DATA(self);
13799        if (!PyUnicode_IS_ASCII(self)) {
13800            kind_limit = kind_maxchar_limit(src_kind);
13801            max_char = 0;
13802            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13803                ch = PyUnicode_READ(src_kind, src_data, cur);
13804                if (ch > max_char) {
13805                    max_char = ch;
13806                    if (max_char >= kind_limit)
13807                        break;
13808                }
13809            }
13810        }
13811        else
13812            max_char = 127;
13813        result = PyUnicode_New(slicelength, max_char);
13814        if (result == NULL)
13815            return NULL;
13816        dest_kind = PyUnicode_KIND(result);
13817        dest_data = PyUnicode_DATA(result);
13818
13819        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13820            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13821            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13822        }
13823        assert(_PyUnicode_CheckConsistency(result, 1));
13824        return result;
13825    } else {
13826        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13827        return NULL;
13828    }
13829}
13830
13831static PyMappingMethods unicode_as_mapping = {
13832    (lenfunc)unicode_length,        /* mp_length */
13833    (binaryfunc)unicode_subscript,  /* mp_subscript */
13834    (objobjargproc)0,           /* mp_ass_subscript */
13835};
13836
13837
13838/* Helpers for PyUnicode_Format() */
13839
13840struct unicode_formatter_t {
13841    PyObject *args;
13842    int args_owned;
13843    Py_ssize_t arglen, argidx;
13844    PyObject *dict;
13845
13846    enum PyUnicode_Kind fmtkind;
13847    Py_ssize_t fmtcnt, fmtpos;
13848    void *fmtdata;
13849    PyObject *fmtstr;
13850
13851    _PyUnicodeWriter writer;
13852};
13853
13854struct unicode_format_arg_t {
13855    Py_UCS4 ch;
13856    int flags;
13857    Py_ssize_t width;
13858    int prec;
13859    int sign;
13860};
13861
13862static PyObject *
13863unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13864{
13865    Py_ssize_t argidx = ctx->argidx;
13866
13867    if (argidx < ctx->arglen) {
13868        ctx->argidx++;
13869        if (ctx->arglen < 0)
13870            return ctx->args;
13871        else
13872            return PyTuple_GetItem(ctx->args, argidx);
13873    }
13874    PyErr_SetString(PyExc_TypeError,
13875                    "not enough arguments for format string");
13876    return NULL;
13877}
13878
13879/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13880
13881/* Format a float into the writer if the writer is not NULL, or into *p_output
13882   otherwise.
13883
13884   Return 0 on success, raise an exception and return -1 on error. */
13885static int
13886formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13887            PyObject **p_output,
13888            _PyUnicodeWriter *writer)
13889{
13890    char *p;
13891    double x;
13892    Py_ssize_t len;
13893    int prec;
13894    int dtoa_flags;
13895
13896    x = PyFloat_AsDouble(v);
13897    if (x == -1.0 && PyErr_Occurred())
13898        return -1;
13899
13900    prec = arg->prec;
13901    if (prec < 0)
13902        prec = 6;
13903
13904    if (arg->flags & F_ALT)
13905        dtoa_flags = Py_DTSF_ALT;
13906    else
13907        dtoa_flags = 0;
13908    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13909    if (p == NULL)
13910        return -1;
13911    len = strlen(p);
13912    if (writer) {
13913        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13914            PyMem_Free(p);
13915            return -1;
13916        }
13917    }
13918    else
13919        *p_output = _PyUnicode_FromASCII(p, len);
13920    PyMem_Free(p);
13921    return 0;
13922}
13923
13924/* formatlong() emulates the format codes d, u, o, x and X, and
13925 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13926 * Python's regular ints.
13927 * Return value:  a new PyUnicodeObject*, or NULL if error.
13928 *     The output string is of the form
13929 *         "-"? ("0x" | "0X")? digit+
13930 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13931 *         set in flags.  The case of hex digits will be correct,
13932 *     There will be at least prec digits, zero-filled on the left if
13933 *         necessary to get that many.
13934 * val          object to be converted
13935 * flags        bitmask of format flags; only F_ALT is looked at
13936 * prec         minimum number of digits; 0-fill on left if needed
13937 * type         a character in [duoxX]; u acts the same as d
13938 *
13939 * CAUTION:  o, x and X conversions on regular ints can never
13940 * produce a '-' sign, but can for Python's unbounded ints.
13941 */
13942PyObject *
13943_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
13944{
13945    PyObject *result = NULL;
13946    char *buf;
13947    Py_ssize_t i;
13948    int sign;           /* 1 if '-', else 0 */
13949    int len;            /* number of characters */
13950    Py_ssize_t llen;
13951    int numdigits;      /* len == numnondigits + numdigits */
13952    int numnondigits = 0;
13953
13954    /* Avoid exceeding SSIZE_T_MAX */
13955    if (prec > INT_MAX-3) {
13956        PyErr_SetString(PyExc_OverflowError,
13957                        "precision too large");
13958        return NULL;
13959    }
13960
13961    assert(PyLong_Check(val));
13962
13963    switch (type) {
13964    default:
13965        assert(!"'type' not in [diuoxX]");
13966    case 'd':
13967    case 'i':
13968    case 'u':
13969        /* int and int subclasses should print numerically when a numeric */
13970        /* format code is used (see issue18780) */
13971        result = PyNumber_ToBase(val, 10);
13972        break;
13973    case 'o':
13974        numnondigits = 2;
13975        result = PyNumber_ToBase(val, 8);
13976        break;
13977    case 'x':
13978    case 'X':
13979        numnondigits = 2;
13980        result = PyNumber_ToBase(val, 16);
13981        break;
13982    }
13983    if (!result)
13984        return NULL;
13985
13986    assert(unicode_modifiable(result));
13987    assert(PyUnicode_IS_READY(result));
13988    assert(PyUnicode_IS_ASCII(result));
13989
13990    /* To modify the string in-place, there can only be one reference. */
13991    if (Py_REFCNT(result) != 1) {
13992        Py_DECREF(result);
13993        PyErr_BadInternalCall();
13994        return NULL;
13995    }
13996    buf = PyUnicode_DATA(result);
13997    llen = PyUnicode_GET_LENGTH(result);
13998    if (llen > INT_MAX) {
13999        Py_DECREF(result);
14000        PyErr_SetString(PyExc_ValueError,
14001                        "string too large in _PyUnicode_FormatLong");
14002        return NULL;
14003    }
14004    len = (int)llen;
14005    sign = buf[0] == '-';
14006    numnondigits += sign;
14007    numdigits = len - numnondigits;
14008    assert(numdigits > 0);
14009
14010    /* Get rid of base marker unless F_ALT */
14011    if (((alt) == 0 &&
14012        (type == 'o' || type == 'x' || type == 'X'))) {
14013        assert(buf[sign] == '0');
14014        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14015               buf[sign+1] == 'o');
14016        numnondigits -= 2;
14017        buf += 2;
14018        len -= 2;
14019        if (sign)
14020            buf[0] = '-';
14021        assert(len == numnondigits + numdigits);
14022        assert(numdigits > 0);
14023    }
14024
14025    /* Fill with leading zeroes to meet minimum width. */
14026    if (prec > numdigits) {
14027        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14028                                numnondigits + prec);
14029        char *b1;
14030        if (!r1) {
14031            Py_DECREF(result);
14032            return NULL;
14033        }
14034        b1 = PyBytes_AS_STRING(r1);
14035        for (i = 0; i < numnondigits; ++i)
14036            *b1++ = *buf++;
14037        for (i = 0; i < prec - numdigits; i++)
14038            *b1++ = '0';
14039        for (i = 0; i < numdigits; i++)
14040            *b1++ = *buf++;
14041        *b1 = '\0';
14042        Py_DECREF(result);
14043        result = r1;
14044        buf = PyBytes_AS_STRING(result);
14045        len = numnondigits + prec;
14046    }
14047
14048    /* Fix up case for hex conversions. */
14049    if (type == 'X') {
14050        /* Need to convert all lower case letters to upper case.
14051           and need to convert 0x to 0X (and -0x to -0X). */
14052        for (i = 0; i < len; i++)
14053            if (buf[i] >= 'a' && buf[i] <= 'x')
14054                buf[i] -= 'a'-'A';
14055    }
14056    if (!PyUnicode_Check(result)
14057        || buf != PyUnicode_DATA(result)) {
14058        PyObject *unicode;
14059        unicode = _PyUnicode_FromASCII(buf, len);
14060        Py_DECREF(result);
14061        result = unicode;
14062    }
14063    else if (len != PyUnicode_GET_LENGTH(result)) {
14064        if (PyUnicode_Resize(&result, len) < 0)
14065            Py_CLEAR(result);
14066    }
14067    return result;
14068}
14069
14070/* Format an integer or a float as an integer.
14071 * Return 1 if the number has been formatted into the writer,
14072 *        0 if the number has been formatted into *p_output
14073 *       -1 and raise an exception on error */
14074static int
14075mainformatlong(PyObject *v,
14076               struct unicode_format_arg_t *arg,
14077               PyObject **p_output,
14078               _PyUnicodeWriter *writer)
14079{
14080    PyObject *iobj, *res;
14081    char type = (char)arg->ch;
14082
14083    if (!PyNumber_Check(v))
14084        goto wrongtype;
14085
14086    /* make sure number is a type of integer for o, x, and X */
14087    if (!PyLong_Check(v)) {
14088        if (type == 'o' || type == 'x' || type == 'X') {
14089            iobj = PyNumber_Index(v);
14090            if (iobj == NULL) {
14091                if (PyErr_ExceptionMatches(PyExc_TypeError))
14092                    goto wrongtype;
14093                return -1;
14094            }
14095        }
14096        else {
14097            iobj = PyNumber_Long(v);
14098            if (iobj == NULL ) {
14099                if (PyErr_ExceptionMatches(PyExc_TypeError))
14100                    goto wrongtype;
14101                return -1;
14102            }
14103        }
14104        assert(PyLong_Check(iobj));
14105    }
14106    else {
14107        iobj = v;
14108        Py_INCREF(iobj);
14109    }
14110
14111    if (PyLong_CheckExact(v)
14112        && arg->width == -1 && arg->prec == -1
14113        && !(arg->flags & (F_SIGN | F_BLANK))
14114        && type != 'X')
14115    {
14116        /* Fast path */
14117        int alternate = arg->flags & F_ALT;
14118        int base;
14119
14120        switch(type)
14121        {
14122            default:
14123                assert(0 && "'type' not in [diuoxX]");
14124            case 'd':
14125            case 'i':
14126            case 'u':
14127                base = 10;
14128                break;
14129            case 'o':
14130                base = 8;
14131                break;
14132            case 'x':
14133            case 'X':
14134                base = 16;
14135                break;
14136        }
14137
14138        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14139            Py_DECREF(iobj);
14140            return -1;
14141        }
14142        Py_DECREF(iobj);
14143        return 1;
14144    }
14145
14146    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14147    Py_DECREF(iobj);
14148    if (res == NULL)
14149        return -1;
14150    *p_output = res;
14151    return 0;
14152
14153wrongtype:
14154    switch(type)
14155    {
14156        case 'o':
14157        case 'x':
14158        case 'X':
14159            PyErr_Format(PyExc_TypeError,
14160                    "%%%c format: an integer is required, "
14161                    "not %.200s",
14162                    type, Py_TYPE(v)->tp_name);
14163            break;
14164        default:
14165            PyErr_Format(PyExc_TypeError,
14166                    "%%%c format: a number is required, "
14167                    "not %.200s",
14168                    type, Py_TYPE(v)->tp_name);
14169            break;
14170    }
14171    return -1;
14172}
14173
14174static Py_UCS4
14175formatchar(PyObject *v)
14176{
14177    /* presume that the buffer is at least 3 characters long */
14178    if (PyUnicode_Check(v)) {
14179        if (PyUnicode_GET_LENGTH(v) == 1) {
14180            return PyUnicode_READ_CHAR(v, 0);
14181        }
14182        goto onError;
14183    }
14184    else {
14185        PyObject *iobj;
14186        long x;
14187        /* make sure number is a type of integer */
14188        if (!PyLong_Check(v)) {
14189            iobj = PyNumber_Index(v);
14190            if (iobj == NULL) {
14191                goto onError;
14192            }
14193            v = iobj;
14194            Py_DECREF(iobj);
14195        }
14196        /* Integer input truncated to a character */
14197        x = PyLong_AsLong(v);
14198        if (x == -1 && PyErr_Occurred())
14199            goto onError;
14200
14201        if (x < 0 || x > MAX_UNICODE) {
14202            PyErr_SetString(PyExc_OverflowError,
14203                            "%c arg not in range(0x110000)");
14204            return (Py_UCS4) -1;
14205        }
14206
14207        return (Py_UCS4) x;
14208    }
14209
14210  onError:
14211    PyErr_SetString(PyExc_TypeError,
14212                    "%c requires int or char");
14213    return (Py_UCS4) -1;
14214}
14215
14216/* Parse options of an argument: flags, width, precision.
14217   Handle also "%(name)" syntax.
14218
14219   Return 0 if the argument has been formatted into arg->str.
14220   Return 1 if the argument has been written into ctx->writer,
14221   Raise an exception and return -1 on error. */
14222static int
14223unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14224                         struct unicode_format_arg_t *arg)
14225{
14226#define FORMAT_READ(ctx) \
14227        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14228
14229    PyObject *v;
14230
14231    if (arg->ch == '(') {
14232        /* Get argument value from a dictionary. Example: "%(name)s". */
14233        Py_ssize_t keystart;
14234        Py_ssize_t keylen;
14235        PyObject *key;
14236        int pcount = 1;
14237
14238        if (ctx->dict == NULL) {
14239            PyErr_SetString(PyExc_TypeError,
14240                            "format requires a mapping");
14241            return -1;
14242        }
14243        ++ctx->fmtpos;
14244        --ctx->fmtcnt;
14245        keystart = ctx->fmtpos;
14246        /* Skip over balanced parentheses */
14247        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14248            arg->ch = FORMAT_READ(ctx);
14249            if (arg->ch == ')')
14250                --pcount;
14251            else if (arg->ch == '(')
14252                ++pcount;
14253            ctx->fmtpos++;
14254        }
14255        keylen = ctx->fmtpos - keystart - 1;
14256        if (ctx->fmtcnt < 0 || pcount > 0) {
14257            PyErr_SetString(PyExc_ValueError,
14258                            "incomplete format key");
14259            return -1;
14260        }
14261        key = PyUnicode_Substring(ctx->fmtstr,
14262                                  keystart, keystart + keylen);
14263        if (key == NULL)
14264            return -1;
14265        if (ctx->args_owned) {
14266            Py_DECREF(ctx->args);
14267            ctx->args_owned = 0;
14268        }
14269        ctx->args = PyObject_GetItem(ctx->dict, key);
14270        Py_DECREF(key);
14271        if (ctx->args == NULL)
14272            return -1;
14273        ctx->args_owned = 1;
14274        ctx->arglen = -1;
14275        ctx->argidx = -2;
14276    }
14277
14278    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14279    while (--ctx->fmtcnt >= 0) {
14280        arg->ch = FORMAT_READ(ctx);
14281        ctx->fmtpos++;
14282        switch (arg->ch) {
14283        case '-': arg->flags |= F_LJUST; continue;
14284        case '+': arg->flags |= F_SIGN; continue;
14285        case ' ': arg->flags |= F_BLANK; continue;
14286        case '#': arg->flags |= F_ALT; continue;
14287        case '0': arg->flags |= F_ZERO; continue;
14288        }
14289        break;
14290    }
14291
14292    /* Parse width. Example: "%10s" => width=10 */
14293    if (arg->ch == '*') {
14294        v = unicode_format_getnextarg(ctx);
14295        if (v == NULL)
14296            return -1;
14297        if (!PyLong_Check(v)) {
14298            PyErr_SetString(PyExc_TypeError,
14299                            "* wants int");
14300            return -1;
14301        }
14302        arg->width = PyLong_AsSsize_t(v);
14303        if (arg->width == -1 && PyErr_Occurred())
14304            return -1;
14305        if (arg->width < 0) {
14306            arg->flags |= F_LJUST;
14307            arg->width = -arg->width;
14308        }
14309        if (--ctx->fmtcnt >= 0) {
14310            arg->ch = FORMAT_READ(ctx);
14311            ctx->fmtpos++;
14312        }
14313    }
14314    else if (arg->ch >= '0' && arg->ch <= '9') {
14315        arg->width = arg->ch - '0';
14316        while (--ctx->fmtcnt >= 0) {
14317            arg->ch = FORMAT_READ(ctx);
14318            ctx->fmtpos++;
14319            if (arg->ch < '0' || arg->ch > '9')
14320                break;
14321            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14322               mixing signed and unsigned comparison. Since arg->ch is between
14323               '0' and '9', casting to int is safe. */
14324            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14325                PyErr_SetString(PyExc_ValueError,
14326                                "width too big");
14327                return -1;
14328            }
14329            arg->width = arg->width*10 + (arg->ch - '0');
14330        }
14331    }
14332
14333    /* Parse precision. Example: "%.3f" => prec=3 */
14334    if (arg->ch == '.') {
14335        arg->prec = 0;
14336        if (--ctx->fmtcnt >= 0) {
14337            arg->ch = FORMAT_READ(ctx);
14338            ctx->fmtpos++;
14339        }
14340        if (arg->ch == '*') {
14341            v = unicode_format_getnextarg(ctx);
14342            if (v == NULL)
14343                return -1;
14344            if (!PyLong_Check(v)) {
14345                PyErr_SetString(PyExc_TypeError,
14346                                "* wants int");
14347                return -1;
14348            }
14349            arg->prec = _PyLong_AsInt(v);
14350            if (arg->prec == -1 && PyErr_Occurred())
14351                return -1;
14352            if (arg->prec < 0)
14353                arg->prec = 0;
14354            if (--ctx->fmtcnt >= 0) {
14355                arg->ch = FORMAT_READ(ctx);
14356                ctx->fmtpos++;
14357            }
14358        }
14359        else if (arg->ch >= '0' && arg->ch <= '9') {
14360            arg->prec = arg->ch - '0';
14361            while (--ctx->fmtcnt >= 0) {
14362                arg->ch = FORMAT_READ(ctx);
14363                ctx->fmtpos++;
14364                if (arg->ch < '0' || arg->ch > '9')
14365                    break;
14366                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14367                    PyErr_SetString(PyExc_ValueError,
14368                                    "precision too big");
14369                    return -1;
14370                }
14371                arg->prec = arg->prec*10 + (arg->ch - '0');
14372            }
14373        }
14374    }
14375
14376    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14377    if (ctx->fmtcnt >= 0) {
14378        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14379            if (--ctx->fmtcnt >= 0) {
14380                arg->ch = FORMAT_READ(ctx);
14381                ctx->fmtpos++;
14382            }
14383        }
14384    }
14385    if (ctx->fmtcnt < 0) {
14386        PyErr_SetString(PyExc_ValueError,
14387                        "incomplete format");
14388        return -1;
14389    }
14390    return 0;
14391
14392#undef FORMAT_READ
14393}
14394
14395/* Format one argument. Supported conversion specifiers:
14396
14397   - "s", "r", "a": any type
14398   - "i", "d", "u": int or float
14399   - "o", "x", "X": int
14400   - "e", "E", "f", "F", "g", "G": float
14401   - "c": int or str (1 character)
14402
14403   When possible, the output is written directly into the Unicode writer
14404   (ctx->writer). A string is created when padding is required.
14405
14406   Return 0 if the argument has been formatted into *p_str,
14407          1 if the argument has been written into ctx->writer,
14408         -1 on error. */
14409static int
14410unicode_format_arg_format(struct unicode_formatter_t *ctx,
14411                          struct unicode_format_arg_t *arg,
14412                          PyObject **p_str)
14413{
14414    PyObject *v;
14415    _PyUnicodeWriter *writer = &ctx->writer;
14416
14417    if (ctx->fmtcnt == 0)
14418        ctx->writer.overallocate = 0;
14419
14420    if (arg->ch == '%') {
14421        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14422            return -1;
14423        return 1;
14424    }
14425
14426    v = unicode_format_getnextarg(ctx);
14427    if (v == NULL)
14428        return -1;
14429
14430
14431    switch (arg->ch) {
14432    case 's':
14433    case 'r':
14434    case 'a':
14435        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14436            /* Fast path */
14437            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14438                return -1;
14439            return 1;
14440        }
14441
14442        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14443            *p_str = v;
14444            Py_INCREF(*p_str);
14445        }
14446        else {
14447            if (arg->ch == 's')
14448                *p_str = PyObject_Str(v);
14449            else if (arg->ch == 'r')
14450                *p_str = PyObject_Repr(v);
14451            else
14452                *p_str = PyObject_ASCII(v);
14453        }
14454        break;
14455
14456    case 'i':
14457    case 'd':
14458    case 'u':
14459    case 'o':
14460    case 'x':
14461    case 'X':
14462    {
14463        int ret = mainformatlong(v, arg, p_str, writer);
14464        if (ret != 0)
14465            return ret;
14466        arg->sign = 1;
14467        break;
14468    }
14469
14470    case 'e':
14471    case 'E':
14472    case 'f':
14473    case 'F':
14474    case 'g':
14475    case 'G':
14476        if (arg->width == -1 && arg->prec == -1
14477            && !(arg->flags & (F_SIGN | F_BLANK)))
14478        {
14479            /* Fast path */
14480            if (formatfloat(v, arg, NULL, writer) == -1)
14481                return -1;
14482            return 1;
14483        }
14484
14485        arg->sign = 1;
14486        if (formatfloat(v, arg, p_str, NULL) == -1)
14487            return -1;
14488        break;
14489
14490    case 'c':
14491    {
14492        Py_UCS4 ch = formatchar(v);
14493        if (ch == (Py_UCS4) -1)
14494            return -1;
14495        if (arg->width == -1 && arg->prec == -1) {
14496            /* Fast path */
14497            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14498                return -1;
14499            return 1;
14500        }
14501        *p_str = PyUnicode_FromOrdinal(ch);
14502        break;
14503    }
14504
14505    default:
14506        PyErr_Format(PyExc_ValueError,
14507                     "unsupported format character '%c' (0x%x) "
14508                     "at index %zd",
14509                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14510                     (int)arg->ch,
14511                     ctx->fmtpos - 1);
14512        return -1;
14513    }
14514    if (*p_str == NULL)
14515        return -1;
14516    assert (PyUnicode_Check(*p_str));
14517    return 0;
14518}
14519
/* Write the formatted string str into ctx->writer, applying the width,
   precision, sign and padding options stored in arg.

   arg->sign and arg->width are modified while the output is laid out.

   Return 0 on success, -1 with an exception set on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Zero padding is only used when arg->sign is set (the numeric
       conversions set it); everything else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag, so the
           string can be appended unchanged. */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width.
       pindex is the read position in str; len counts the characters of
       str that remain to be copied. */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string already starts with a sign: take it out so it
               can be written separately from the digits. */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character has to be written. */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: work out the largest code point that will be
       written (fill character included when left padding is needed). */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot is needed for the sign when the remaining characters
       already fill the whole width. */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed.
       With a non-space fill the sign goes before the padding; with space
       fill it is written after the padding (see below). */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for here in both cases;
           with space fill they are copied after the padding. */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Padding is done: nothing is left for the right-padding step. */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed.
       Right padding always uses spaces, whatever the fill character. */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14674
14675/* Helper of PyUnicode_Format(): format one arg.
14676   Return 0 on success, raise an exception and return -1 on error. */
14677static int
14678unicode_format_arg(struct unicode_formatter_t *ctx)
14679{
14680    struct unicode_format_arg_t arg;
14681    PyObject *str;
14682    int ret;
14683
14684    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14685    arg.flags = 0;
14686    arg.width = -1;
14687    arg.prec = -1;
14688    arg.sign = 0;
14689    str = NULL;
14690
14691    ret = unicode_format_arg_parse(ctx, &arg);
14692    if (ret == -1)
14693        return -1;
14694
14695    ret = unicode_format_arg_format(ctx, &arg, &str);
14696    if (ret == -1)
14697        return -1;
14698
14699    if (ret != 1) {
14700        ret = unicode_format_arg_output(ctx, &arg, str);
14701        Py_DECREF(str);
14702        if (ret == -1)
14703            return -1;
14704    }
14705
14706    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14707        PyErr_SetString(PyExc_TypeError,
14708                        "not all arguments converted during string formatting");
14709        return -1;
14710    }
14711    return 0;
14712}
14713
14714PyObject *
14715PyUnicode_Format(PyObject *format, PyObject *args)
14716{
14717    struct unicode_formatter_t ctx;
14718
14719    if (format == NULL || args == NULL) {
14720        PyErr_BadInternalCall();
14721        return NULL;
14722    }
14723
14724    ctx.fmtstr = PyUnicode_FromObject(format);
14725    if (ctx.fmtstr == NULL)
14726        return NULL;
14727    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14728        Py_DECREF(ctx.fmtstr);
14729        return NULL;
14730    }
14731    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14732    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14733    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14734    ctx.fmtpos = 0;
14735
14736    _PyUnicodeWriter_Init(&ctx.writer);
14737    ctx.writer.min_length = ctx.fmtcnt + 100;
14738    ctx.writer.overallocate = 1;
14739
14740    if (PyTuple_Check(args)) {
14741        ctx.arglen = PyTuple_Size(args);
14742        ctx.argidx = 0;
14743    }
14744    else {
14745        ctx.arglen = -1;
14746        ctx.argidx = -2;
14747    }
14748    ctx.args_owned = 0;
14749    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14750        ctx.dict = args;
14751    else
14752        ctx.dict = NULL;
14753    ctx.args = args;
14754
14755    while (--ctx.fmtcnt >= 0) {
14756        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14757            Py_ssize_t nonfmtpos;
14758
14759            nonfmtpos = ctx.fmtpos++;
14760            while (ctx.fmtcnt >= 0 &&
14761                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14762                ctx.fmtpos++;
14763                ctx.fmtcnt--;
14764            }
14765            if (ctx.fmtcnt < 0) {
14766                ctx.fmtpos--;
14767                ctx.writer.overallocate = 0;
14768            }
14769
14770            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14771                                                nonfmtpos, ctx.fmtpos) < 0)
14772                goto onError;
14773        }
14774        else {
14775            ctx.fmtpos++;
14776            if (unicode_format_arg(&ctx) == -1)
14777                goto onError;
14778        }
14779    }
14780
14781    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14782        PyErr_SetString(PyExc_TypeError,
14783                        "not all arguments converted during string formatting");
14784        goto onError;
14785    }
14786
14787    if (ctx.args_owned) {
14788        Py_DECREF(ctx.args);
14789    }
14790    Py_DECREF(ctx.fmtstr);
14791    return _PyUnicodeWriter_Finish(&ctx.writer);
14792
14793  onError:
14794    Py_DECREF(ctx.fmtstr);
14795    _PyUnicodeWriter_Dealloc(&ctx.writer);
14796    if (ctx.args_owned) {
14797        Py_DECREF(ctx.args);
14798    }
14799    return NULL;
14800}
14801
14802static PyObject *
14803unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14804
14805static PyObject *
14806unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14807{
14808    PyObject *x = NULL;
14809    static char *kwlist[] = {"object", "encoding", "errors", 0};
14810    char *encoding = NULL;
14811    char *errors = NULL;
14812
14813    if (type != &PyUnicode_Type)
14814        return unicode_subtype_new(type, args, kwds);
14815    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14816                                     kwlist, &x, &encoding, &errors))
14817        return NULL;
14818    if (x == NULL)
14819        _Py_RETURN_UNICODE_EMPTY();
14820    if (encoding == NULL && errors == NULL)
14821        return PyObject_Str(x);
14822    else
14823        return PyUnicode_FromEncodedObject(x, encoding, errors);
14824}
14825
/* tp_new helper for subtypes of str.

   Build a plain str from args/kwds with unicode_new(), allocate an
   instance of type, and copy the character data into a separately
   allocated buffer owned by the new instance (the instance is marked as
   non-compact).

   Return a new reference, or NULL with an exception set. */
static PyObject *
unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
{
    PyObject *unicode, *self;
    Py_ssize_t length, char_size;
    int share_wstr, share_utf8;
    unsigned int kind;
    void *data;

    assert(PyType_IsSubtype(type, &PyUnicode_Type));

    /* Let the base type do the argument parsing and conversion. */
    unicode = unicode_new(&PyUnicode_Type, args, kwds);
    if (unicode == NULL)
        return NULL;
    assert(_PyUnicode_CHECK(unicode));
    if (PyUnicode_READY(unicode) == -1) {
        Py_DECREF(unicode);
        return NULL;
    }

    self = type->tp_alloc(type, 0);
    if (self == NULL) {
        Py_DECREF(unicode);
        return NULL;
    }
    kind = PyUnicode_KIND(unicode);
    length = PyUnicode_GET_LENGTH(unicode);

    /* Initialize the header of the new object; every buffer pointer starts
       out as NULL until the data has been allocated below. */
    _PyUnicode_LENGTH(self) = length;
#ifdef Py_DEBUG
    /* In debug builds the hash is copied at the very end, after the
       consistency check. */
    _PyUnicode_HASH(self) = -1;
#else
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    _PyUnicode_STATE(self).interned = 0;
    _PyUnicode_STATE(self).kind = kind;
    _PyUnicode_STATE(self).compact = 0;
    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
    _PyUnicode_STATE(self).ready = 1;
    _PyUnicode_WSTR(self) = NULL;
    _PyUnicode_UTF8_LENGTH(self) = 0;
    _PyUnicode_UTF8(self) = NULL;
    _PyUnicode_WSTR_LENGTH(self) = 0;
    _PyUnicode_DATA_ANY(self) = NULL;

    /* Decide whether the data buffer can double as the UTF-8 or wchar_t
       representation: pure ASCII data for UTF-8, a character size equal
       to sizeof(wchar_t) for wstr. */
    share_utf8 = 0;
    share_wstr = 0;
    if (kind == PyUnicode_1BYTE_KIND) {
        char_size = 1;
        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
            share_utf8 = 1;
    }
    else if (kind == PyUnicode_2BYTE_KIND) {
        char_size = 2;
        if (sizeof(wchar_t) == 2)
            share_wstr = 1;
    }
    else {
        assert(kind == PyUnicode_4BYTE_KIND);
        char_size = 4;
        if (sizeof(wchar_t) == 4)
            share_wstr = 1;
    }

    /* Ensure we won't overflow the length. */
    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
        PyErr_NoMemory();
        goto onError;
    }
    /* length + 1: room for one extra (terminating) character. */
    data = PyObject_MALLOC((length + 1) * char_size);
    if (data == NULL) {
        PyErr_NoMemory();
        goto onError;
    }

    _PyUnicode_DATA_ANY(self) = data;
    if (share_utf8) {
        _PyUnicode_UTF8_LENGTH(self) = length;
        _PyUnicode_UTF8(self) = data;
    }
    if (share_wstr) {
        _PyUnicode_WSTR_LENGTH(self) = length;
        _PyUnicode_WSTR(self) = (wchar_t *)data;
    }

    /* Copy the characters plus the extra trailing one; kind is used as
       the per-character byte count here, like char_size above. */
    Py_MEMCPY(data, PyUnicode_DATA(unicode),
              kind * (length + 1));
    assert(_PyUnicode_CheckConsistency(self, 1));
#ifdef Py_DEBUG
    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
#endif
    Py_DECREF(unicode);
    return self;

onError:
    Py_DECREF(unicode);
    Py_DECREF(self);
    return NULL;
}
14925
/* Docstring of the str type (installed as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
14937
/* Defined later in this file; needed here for the tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object of the built-in str type.  Slots left at 0 are not
   provided by this static initializer. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14983
14984/* Initialize the Unicode implementation */
14985
14986int _PyUnicode_Init(void)
14987{
14988    /* XXX - move this array to unicodectype.c ? */
14989    Py_UCS2 linebreak[] = {
14990        0x000A, /* LINE FEED */
14991        0x000D, /* CARRIAGE RETURN */
14992        0x001C, /* FILE SEPARATOR */
14993        0x001D, /* GROUP SEPARATOR */
14994        0x001E, /* RECORD SEPARATOR */
14995        0x0085, /* NEXT LINE */
14996        0x2028, /* LINE SEPARATOR */
14997        0x2029, /* PARAGRAPH SEPARATOR */
14998    };
14999
15000    /* Init the implementation */
15001    _Py_INCREF_UNICODE_EMPTY();
15002    if (!unicode_empty)
15003        Py_FatalError("Can't create empty string");
15004    Py_DECREF(unicode_empty);
15005
15006    if (PyType_Ready(&PyUnicode_Type) < 0)
15007        Py_FatalError("Can't initialize 'unicode'");
15008
15009    /* initialize the linebreak bloom filter */
15010    bloom_linebreak = make_bloom_mask(
15011        PyUnicode_2BYTE_KIND, linebreak,
15012        Py_ARRAY_LENGTH(linebreak));
15013
15014    if (PyType_Ready(&EncodingMapType) < 0)
15015         Py_FatalError("Can't initialize encoding map type");
15016
15017    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15018        Py_FatalError("Can't initialize field name iterator type");
15019
15020    if (PyType_Ready(&PyFormatterIter_Type) < 0)
15021        Py_FatalError("Can't initialize formatter iter type");
15022
15023    return 0;
15024}
15025
/* Finalize the Unicode implementation */

/* This function frees nothing: it unconditionally returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15033
15034void
15035_PyUnicode_Fini(void)
15036{
15037    int i;
15038
15039    Py_CLEAR(unicode_empty);
15040
15041    for (i = 0; i < 256; i++)
15042        Py_CLEAR(unicode_latin1[i]);
15043    _PyUnicode_ClearStaticStrings();
15044    (void)PyUnicode_ClearFreeList();
15045}
15046
/* Intern the string *p in place.

   If an equal string is already in the interned dict, *p is replaced by
   a new reference to it and the reference to the original is dropped.
   Otherwise *p itself is added to the dict and marked as
   SSTATE_INTERNED_MORTAL.

   Nothing is done for subclasses of str and for strings that are already
   interned.  Errors are swallowed: on failure *p is left unchanged and no
   exception is left set. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dict is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* Already interned: hand out the canonical object instead. */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* recursion_critical is set around the insertion and cleared on both
       the failure and the success path. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15098
15099void
15100PyUnicode_InternImmortal(PyObject **p)
15101{
15102    PyUnicode_InternInPlace(p);
15103    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15104        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15105        Py_INCREF(*p);
15106    }
15107}
15108
15109PyObject *
15110PyUnicode_InternFromString(const char *cp)
15111{
15112    PyObject *s = PyUnicode_FromString(cp);
15113    if (s == NULL)
15114        return NULL;
15115    PyUnicode_InternInPlace(&s);
15116    return s;
15117}
15118
15119void
15120_Py_ReleaseInternedUnicodeStrings(void)
15121{
15122    PyObject *keys;
15123    PyObject *s;
15124    Py_ssize_t i, n;
15125    Py_ssize_t immortal_size = 0, mortal_size = 0;
15126
15127    if (interned == NULL || !PyDict_Check(interned))
15128        return;
15129    keys = PyDict_Keys(interned);
15130    if (keys == NULL || !PyList_Check(keys)) {
15131        PyErr_Clear();
15132        return;
15133    }
15134
15135    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15136       detector, interned unicode strings are not forcibly deallocated;
15137       rather, we give them their stolen references back, and then clear
15138       and DECREF the interned dict. */
15139
15140    n = PyList_GET_SIZE(keys);
15141    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15142            n);
15143    for (i = 0; i < n; i++) {
15144        s = PyList_GET_ITEM(keys, i);
15145        if (PyUnicode_READY(s) == -1) {
15146            assert(0 && "could not ready string");
15147            fprintf(stderr, "could not ready string\n");
15148        }
15149        switch (PyUnicode_CHECK_INTERNED(s)) {
15150        case SSTATE_NOT_INTERNED:
15151            /* XXX Shouldn't happen */
15152            break;
15153        case SSTATE_INTERNED_IMMORTAL:
15154            Py_REFCNT(s) += 1;
15155            immortal_size += PyUnicode_GET_LENGTH(s);
15156            break;
15157        case SSTATE_INTERNED_MORTAL:
15158            Py_REFCNT(s) += 2;
15159            mortal_size += PyUnicode_GET_LENGTH(s);
15160            break;
15161        default:
15162            Py_FatalError("Inconsistent interned string state.");
15163        }
15164        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15165    }
15166    fprintf(stderr, "total size of all interned strings: "
15167            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15168            "mortal/immortal\n", mortal_size, immortal_size);
15169    Py_DECREF(keys);
15170    PyDict_Clear(interned);
15171    Py_CLEAR(interned);
15172}
15173
15174
15175/********************* Unicode Iterator **************************/
15176
/* Instance layout of the str iterator type (PyUnicodeIter_Type);
   objects of this type are created by unicode_iter(). */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15182
15183static void
15184unicodeiter_dealloc(unicodeiterobject *it)
15185{
15186    _PyObject_GC_UNTRACK(it);
15187    Py_XDECREF(it->it_seq);
15188    PyObject_GC_Del(it);
15189}
15190
15191static int
15192unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15193{
15194    Py_VISIT(it->it_seq);
15195    return 0;
15196}
15197
15198static PyObject *
15199unicodeiter_next(unicodeiterobject *it)
15200{
15201    PyObject *seq, *item;
15202
15203    assert(it != NULL);
15204    seq = it->it_seq;
15205    if (seq == NULL)
15206        return NULL;
15207    assert(_PyUnicode_CHECK(seq));
15208
15209    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15210        int kind = PyUnicode_KIND(seq);
15211        void *data = PyUnicode_DATA(seq);
15212        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15213        item = PyUnicode_FromOrdinal(chr);
15214        if (item != NULL)
15215            ++it->it_index;
15216        return item;
15217    }
15218
15219    Py_DECREF(seq);
15220    it->it_seq = NULL;
15221    return NULL;
15222}
15223
15224static PyObject *
15225unicodeiter_len(unicodeiterobject *it)
15226{
15227    Py_ssize_t len = 0;
15228    if (it->it_seq)
15229        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15230    return PyLong_FromSsize_t(len);
15231}
15232
15233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15234
15235static PyObject *
15236unicodeiter_reduce(unicodeiterobject *it)
15237{
15238    if (it->it_seq != NULL) {
15239        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15240                             it->it_seq, it->it_index);
15241    } else {
15242        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15243        if (u == NULL)
15244            return NULL;
15245        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15246    }
15247}
15248
15249PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15250
15251static PyObject *
15252unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15253{
15254    Py_ssize_t index = PyLong_AsSsize_t(state);
15255    if (index == -1 && PyErr_Occurred())
15256        return NULL;
15257    if (it->it_seq != NULL) {
15258        if (index < 0)
15259            index = 0;
15260        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15261            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15262        it->it_index = index;
15263    }
15264    Py_RETURN_NONE;
15265}
15266
15267PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15268
/* Method table of the str iterator type: the length hint plus the two
   pickling hooks.  It is installed as tp_methods of PyUnicodeIter_Type. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15278
/* Type object of the str iterator ("str_iterator").  Instances are
   GC-tracked unicodeiterobject structs; only deallocation, traversal,
   iteration and the method table are customized, every other slot is
   left at 0. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15311
15312static PyObject *
15313unicode_iter(PyObject *seq)
15314{
15315    unicodeiterobject *it;
15316
15317    if (!PyUnicode_Check(seq)) {
15318        PyErr_BadInternalCall();
15319        return NULL;
15320    }
15321    if (PyUnicode_READY(seq) == -1)
15322        return NULL;
15323    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15324    if (it == NULL)
15325        return NULL;
15326    it->it_index = 0;
15327    Py_INCREF(seq);
15328    it->it_seq = seq;
15329    _PyObject_GC_TRACK(it);
15330    return (PyObject *)it;
15331}
15332
15333
15334size_t
15335Py_UNICODE_strlen(const Py_UNICODE *u)
15336{
15337    int res = 0;
15338    while(*u++)
15339        res++;
15340    return res;
15341}
15342
15343Py_UNICODE*
15344Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15345{
15346    Py_UNICODE *u = s1;
15347    while ((*u++ = *s2++));
15348    return s1;
15349}
15350
15351Py_UNICODE*
15352Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15353{
15354    Py_UNICODE *u = s1;
15355    while ((*u++ = *s2++))
15356        if (n-- == 0)
15357            break;
15358    return s1;
15359}
15360
15361Py_UNICODE*
15362Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15363{
15364    Py_UNICODE *u1 = s1;
15365    u1 += Py_UNICODE_strlen(u1);
15366    Py_UNICODE_strcpy(u1, s2);
15367    return s1;
15368}
15369
15370int
15371Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15372{
15373    while (*s1 && *s2 && *s1 == *s2)
15374        s1++, s2++;
15375    if (*s1 && *s2)
15376        return (*s1 < *s2) ? -1 : +1;
15377    if (*s1)
15378        return 1;
15379    if (*s2)
15380        return -1;
15381    return 0;
15382}
15383
15384int
15385Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15386{
15387    Py_UNICODE u1, u2;
15388    for (; n != 0; n--) {
15389        u1 = *s1;
15390        u2 = *s2;
15391        if (u1 != u2)
15392            return (u1 < u2) ? -1 : +1;
15393        if (u1 == '\0')
15394            return 0;
15395        s1++;
15396        s2++;
15397    }
15398    return 0;
15399}
15400
15401Py_UNICODE*
15402Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15403{
15404    const Py_UNICODE *p;
15405    for (p = s; *p; p++)
15406        if (*p == c)
15407            return (Py_UNICODE*)p;
15408    return NULL;
15409}
15410
15411Py_UNICODE*
15412Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15413{
15414    const Py_UNICODE *p;
15415    p = s + Py_UNICODE_strlen(s);
15416    while (p != s) {
15417        p--;
15418        if (*p == c)
15419            return (Py_UNICODE*)p;
15420    }
15421    return NULL;
15422}
15423
15424Py_UNICODE*
15425PyUnicode_AsUnicodeCopy(PyObject *unicode)
15426{
15427    Py_UNICODE *u, *copy;
15428    Py_ssize_t len, size;
15429
15430    if (!PyUnicode_Check(unicode)) {
15431        PyErr_BadArgument();
15432        return NULL;
15433    }
15434    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15435    if (u == NULL)
15436        return NULL;
15437    /* Ensure we won't overflow the size. */
15438    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15439        PyErr_NoMemory();
15440        return NULL;
15441    }
15442    size = len + 1; /* copy the null character */
15443    size *= sizeof(Py_UNICODE);
15444    copy = PyMem_Malloc(size);
15445    if (copy == NULL) {
15446        PyErr_NoMemory();
15447        return NULL;
15448    }
15449    memcpy(copy, u, size);
15450    return copy;
15451}
15452
15453/* A _string module, to export formatter_parser and formatter_field_name_split
15454   to the string.Formatter class implemented in Python. */
15455
/* Functions exported by the _string module; each one takes a single
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15463
/* Module definition of _string.  The slot names in the comments follow
   the documented PyModuleDef layout. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* m_name */
    PyDoc_STR("string helper module"),  /* m_doc */
    0,                                  /* m_size */
    _string_methods,                    /* m_methods */
    NULL,   /* m_slots or m_reload, depending on the CPython version;
               NOTE(review): confirm against this revision's header */
    NULL,                               /* m_traverse */
    NULL,                               /* m_clear */
    NULL                                /* m_free */
};
15475
15476PyMODINIT_FUNC
15477PyInit__string(void)
15478{
15479    return PyModule_Create(&_string_module);
15480}
15481
15482
15483#ifdef __cplusplus
15484}
15485#endif
15486