unicodeobject.c revision c3713e9706e51bbd30958c27d35e7fda764b0c4a
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45#include "stringlib/eq.h"
46
47#ifdef MS_WINDOWS
48#include <windows.h>
49#endif
50
51/*[clinic input]
52class str "PyUnicodeObject *" "&PyUnicode_Type"
53[clinic start generated code]*/
54/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
55
56/* --- Globals ------------------------------------------------------------
57
58NOTE: In the interpreter's initialization phase, some globals are currently
59      initialized dynamically as needed. In the process Unicode objects may
60      be created before the Unicode type is ready.
61
62*/
63
64
65#ifdef __cplusplus
66extern "C" {
67#endif
68
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Check used by the accessor macros of this file.  Debug builds run the
   consistency checker (with check_content=0, i.e. without scanning the
   characters); other builds only perform the PyUnicode_Check() type test. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
77
/* Field accessors for the string object structs.  The macros starting with
   an underscore expand to the struct member itself, so they can also be
   used as assignment targets; most of them perform no checks at all. */

/* Raw `utf8` member (no checks). */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer of a ready string: for a compact ASCII object this
   is the memory located right after the PyASCIIObject header, otherwise it
   is the `utf8` member. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw `utf8_length` member (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length of a ready string: the `length` member for a
   compact ASCII object, the `utf8_length` member otherwise. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw `wstr` member (no checks). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Raw `wstr_length` member; the member lives in PyCompactUnicodeObject,
   not in PyASCIIObject. */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Raw `length` member (no checks). */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Raw `state` member (no checks). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Raw `hash` member (no checks). */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* `state.kind`, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* `length`, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Raw `data.any` member of a PyUnicodeObject (no checks). */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
112
/* Replace the PyUnicode_READY definition coming from the headers with a
   local one: assert _PyUnicode_CHECK(op), evaluate to 0 when the string is
   already ready, and otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
119
/* True if the UTF-8 pointer of `op` is the character data pointer itself
   (i.e. no separate UTF-8 buffer).  Must not be used on compact ASCII
   objects; this is asserted. */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of `op` is the character data pointer itself
   (i.e. no separate wchar_t buffer).  The expansion refers only to the
   macro argument, so it works whatever the caller's variable is named. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
127
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the object is not compact ASCII, its utf8
   pointer is set, and that pointer differs from the character data
   pointer. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr differs from the character data
   pointer. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
141
/* Character-width conversion helper.

   Copies the characters of the half-open range [begin, end) into the buffer
   `to`, casting every character from `from_type` to `to_type`.  Both type
   arguments must be valid type names; `begin` and `end` must point to
   `from_type` characters and `to` to a buffer of `to_type` characters large
   enough for the whole range.

   The bulk of the range is copied four characters per iteration; the
   remaining zero to three characters are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                             \
        to_type *_to = (to_type *)(to);                              \
        const from_type *_iter = (from_type *)(begin);               \
        const from_type *_end = (from_type *)(end);                  \
        Py_ssize_t _len = _end - _iter;                              \
        const from_type *_blocks_end =                               \
            _iter + _Py_SIZE_ROUND_DOWN(_len, 4);                    \
        for (; _iter < _blocks_end; _iter += 4, _to += 4) {          \
            _to[0] = (to_type) _iter[0];                             \
            _to[1] = (to_type) _iter[1];                             \
            _to[2] = (to_type) _iter[2];                             \
            _to[3] = (to_type) _iter[3];                             \
        }                                                            \
        for (; _iter < _end; _iter++, _to++)                         \
            *_to = (to_type) *_iter;                                 \
    } while (0)
165
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Put differently, the actual reference count of a string is:
   s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  The pointer
   starts out as NULL. */
static PyObject *unicode_empty = NULL;
178
/* Take a reference to the shared empty string.  If the singleton does not
   exist yet it is created with PyUnicode_New(0, 0) and then increfed once
   more (presumably one reference for the global pointer and one for the
   caller).  If that allocation fails, unicode_empty stays NULL and no
   reference is taken. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return (from the enclosing function) a new reference to the shared empty
   string.  The returned pointer is NULL when the singleton could not be
   created. */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
197
/* Forward declaration */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); every
   slot starts out as NULL. */
static PyObject *unicode_latin1[256] = {NULL};
208
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the ASCII range: entry `c` (0 <= c < 128) is 1 when
   the code point is one of the whitespace characters listed below and 0
   otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
239
/* Forward declarations of helpers that are used in this file before
   their definition. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of `size` characters of a fixed width
   (Py_UCS1, Py_UCS2 or Py_UCS4). */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoding error helpers; only declared here. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
265
/* Fast detection of line break characters in the ASCII range.

   Entry `c` (0 <= c < 128) is 1 when the code point is one of the line
   break characters listed below and 0 otherwise.  The table is only ever
   read, so it is declared const like _Py_ascii_whitespace. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
293
294#include "clinic/unicodeobject.c.h"
295
/* Codec error handlers known to this file.  _Py_ERROR_OTHER stands for any
   handler name that is not listed here. */
typedef enum {
    _Py_ERROR_UNKNOWN=0,
    _Py_ERROR_STRICT,
    _Py_ERROR_SURROGATEESCAPE,
    _Py_ERROR_REPLACE,
    _Py_ERROR_IGNORE,
    _Py_ERROR_XMLCHARREFREPLACE,
    _Py_ERROR_OTHER
} _Py_error_handler;

/* Translate an error handler name into its _Py_error_handler value.

   errors: handler name, or NULL.

   Returns _Py_ERROR_STRICT for NULL, the matching enum value for the five
   recognized names (exact, case-sensitive comparison) and _Py_ERROR_OTHER
   for every other string. */
static _Py_error_handler
get_error_handler(const char *errors)
{
    static const struct {
        const char *name;
        _Py_error_handler handler;
    } known_handlers[] = {
        {"strict", _Py_ERROR_STRICT},
        {"surrogateescape", _Py_ERROR_SURROGATEESCAPE},
        {"ignore", _Py_ERROR_IGNORE},
        {"replace", _Py_ERROR_REPLACE},
        {"xmlcharrefreplace", _Py_ERROR_XMLCHARREFREPLACE}
    };
    size_t i;

    if (errors == NULL)
        return _Py_ERROR_STRICT;

    for (i = 0; i < sizeof(known_handlers) / sizeof(known_handlers[0]); i++) {
        if (strcmp(errors, known_handlers[i].name) == 0)
            return known_handlers[i].handler;
    }
    return _Py_ERROR_OTHER;
}
323
324/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
325   This function is kept for backward compatibility with the old API. */
326Py_UNICODE
327PyUnicode_GetMax(void)
328{
329#ifdef Py_UNICODE_WIDE
330    return 0x10FFFF;
331#else
332    /* This is actually an illegal character, so it should
333       not be passed to unichr. */
334    return 0xFFFF;
335#endif
336}
337
#ifdef Py_DEBUG
/* Debug-build only: assert the internal invariants of a string object.

   op: the object to check; it must pass PyUnicode_Check().
   check_content: if non-zero (and the string is not of the wchar kind),
       also scan the characters to verify that the narrowest possible kind
       is used and that the data is null-terminated.

   Always returns 1, so that the call can itself be wrapped in assert();
   violations are reported by the asserts in the body. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* compact ASCII object */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: the characters follow the
               PyCompactUnicodeObject header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            /* not compact: the characters are referenced by data.any */
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* not ready yet: only the wstr buffer may be set */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                /* an ASCII string shares its data with the UTF-8 pointer,
                   a non-ASCII string must not */
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr shares the data exactly when the kind has the same width
           as wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* an unset cache pointer implies a zero cached length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* find the largest character of the string */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* the largest character must fall in the range of the kind */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* the character after the last one must be a null character */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
453
454static PyObject*
455unicode_result_wchar(PyObject *unicode)
456{
457#ifndef Py_DEBUG
458    Py_ssize_t len;
459
460    len = _PyUnicode_WSTR_LENGTH(unicode);
461    if (len == 0) {
462        Py_DECREF(unicode);
463        _Py_RETURN_UNICODE_EMPTY();
464    }
465
466    if (len == 1) {
467        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
468        if ((Py_UCS4)ch < 256) {
469            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
470            Py_DECREF(unicode);
471            return latin1_char;
472        }
473    }
474
475    if (_PyUnicode_Ready(unicode) < 0) {
476        Py_DECREF(unicode);
477        return NULL;
478    }
479#else
480    assert(Py_REFCNT(unicode) == 1);
481
482    /* don't make the result ready in debug mode to ensure that the caller
483       makes the string ready before using it */
484    assert(_PyUnicode_CheckConsistency(unicode, 1));
485#endif
486    return unicode;
487}
488
489static PyObject*
490unicode_result_ready(PyObject *unicode)
491{
492    Py_ssize_t length;
493
494    length = PyUnicode_GET_LENGTH(unicode);
495    if (length == 0) {
496        if (unicode != unicode_empty) {
497            Py_DECREF(unicode);
498            _Py_RETURN_UNICODE_EMPTY();
499        }
500        return unicode_empty;
501    }
502
503    if (length == 1) {
504        void *data = PyUnicode_DATA(unicode);
505        int kind = PyUnicode_KIND(unicode);
506        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
507        if (ch < 256) {
508            PyObject *latin1_char = unicode_latin1[ch];
509            if (latin1_char != NULL) {
510                if (unicode != latin1_char) {
511                    Py_INCREF(latin1_char);
512                    Py_DECREF(unicode);
513                }
514                return latin1_char;
515            }
516            else {
517                assert(_PyUnicode_CheckConsistency(unicode, 1));
518                Py_INCREF(unicode);
519                unicode_latin1[ch] = unicode;
520                return unicode;
521            }
522        }
523    }
524
525    assert(_PyUnicode_CheckConsistency(unicode, 1));
526    return unicode;
527}
528
529static PyObject*
530unicode_result(PyObject *unicode)
531{
532    assert(_PyUnicode_CHECK(unicode));
533    if (PyUnicode_IS_READY(unicode))
534        return unicode_result_ready(unicode);
535    else
536        return unicode_result_wchar(unicode);
537}
538
539static PyObject*
540unicode_result_unchanged(PyObject *unicode)
541{
542    if (PyUnicode_CheckExact(unicode)) {
543        if (PyUnicode_READY(unicode) == -1)
544            return NULL;
545        Py_INCREF(unicode);
546        return unicode;
547    }
548    else
549        /* Subtype -- return genuine unicode string with the same value. */
550        return _PyUnicode_Copy(unicode);
551}
552
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask of BLOOM_WIDTH bits,
   using the low bits of each unicode character (ch & (BLOOM_WIDTH - 1))
   as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* Number of bits used in a mask: the largest of 128, 64 and 32 that is
   not greater than LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding a mask. */
#define BLOOM_MASK unsigned long

/* Mask for line break characters; it starts with every bit set. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
574
/* Non-zero if the bit selected by character `ch` is set in `mask`, i.e. if
   the character *may* be part of the set described by the mask.  Both
   arguments are parenthesized so that any expression can be passed. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if `ch` is a line break character: characters below 128 are looked
   up in ascii_linebreak, all others are first filtered with the
   bloom_linebreak mask and then checked with Py_UNICODE_ISLINEBREAK().
   Note that `ch` is evaluated more than once. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
580
581Py_LOCAL_INLINE(BLOOM_MASK)
582make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
583{
584#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
585    do {                                               \
586        TYPE *data = (TYPE *)PTR;                      \
587        TYPE *end = data + LEN;                        \
588        Py_UCS4 ch;                                    \
589        for (; data != end; data++) {                  \
590            ch = *data;                                \
591            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
592        }                                              \
593        break;                                         \
594    } while (0)
595
596    /* calculate simple bloom-style bitmask for a given unicode string */
597
598    BLOOM_MASK mask;
599
600    mask = 0;
601    switch (kind) {
602    case PyUnicode_1BYTE_KIND:
603        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
604        break;
605    case PyUnicode_2BYTE_KIND:
606        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
607        break;
608    case PyUnicode_4BYTE_KIND:
609        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
610        break;
611    default:
612        assert(0);
613    }
614    return mask;
615
616#undef BLOOM_UPDATE
617}
618
619/* Compilation of templated routines */
620
621#include "stringlib/asciilib.h"
622#include "stringlib/fastsearch.h"
623#include "stringlib/partition.h"
624#include "stringlib/split.h"
625#include "stringlib/count.h"
626#include "stringlib/find.h"
627#include "stringlib/find_max_char.h"
628#include "stringlib/localeutil.h"
629#include "stringlib/undef.h"
630
631#include "stringlib/ucs1lib.h"
632#include "stringlib/fastsearch.h"
633#include "stringlib/partition.h"
634#include "stringlib/split.h"
635#include "stringlib/count.h"
636#include "stringlib/find.h"
637#include "stringlib/replace.h"
638#include "stringlib/find_max_char.h"
639#include "stringlib/localeutil.h"
640#include "stringlib/undef.h"
641
642#include "stringlib/ucs2lib.h"
643#include "stringlib/fastsearch.h"
644#include "stringlib/partition.h"
645#include "stringlib/split.h"
646#include "stringlib/count.h"
647#include "stringlib/find.h"
648#include "stringlib/replace.h"
649#include "stringlib/find_max_char.h"
650#include "stringlib/localeutil.h"
651#include "stringlib/undef.h"
652
653#include "stringlib/ucs4lib.h"
654#include "stringlib/fastsearch.h"
655#include "stringlib/partition.h"
656#include "stringlib/split.h"
657#include "stringlib/count.h"
658#include "stringlib/find.h"
659#include "stringlib/replace.h"
660#include "stringlib/find_max_char.h"
661#include "stringlib/localeutil.h"
662#include "stringlib/undef.h"
663
664#include "stringlib/unicodedefs.h"
665#include "stringlib/fastsearch.h"
666#include "stringlib/count.h"
667#include "stringlib/find.h"
668#include "stringlib/undef.h"
669
670/* --- Unicode Object ----------------------------------------------------- */
671
/* Forward declaration; the definition is not located in this part of the
   file.  `fixfct` is a callback that receives a string object and returns
   a Py_UCS4 value. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
674
675Py_LOCAL_INLINE(Py_ssize_t) findchar(const void *s, int kind,
676                                     Py_ssize_t size, Py_UCS4 ch,
677                                     int direction)
678{
679    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
680
681    switch (kind) {
682    case PyUnicode_1BYTE_KIND:
683        {
684            Py_UCS1 ch1 = (Py_UCS1) ch;
685            if (ch1 == ch)
686                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
687            else
688                return -1;
689        }
690    case PyUnicode_2BYTE_KIND:
691        {
692            Py_UCS2 ch2 = (Py_UCS2) ch;
693            if (ch2 == ch)
694                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
695            else
696                return -1;
697        }
698    case PyUnicode_4BYTE_KIND:
699        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
700    default:
701        assert(0);
702        return -1;
703    }
704}
705
#ifdef Py_DEBUG
/* Debug-build helper: overwrite the characters added by a resize, i.e. the
   characters from index old_length up to the current length, with 0xff
   bytes so that bugs are detected earlier.  Nothing is written when the
   string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    const int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *bytes = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* the kind is used as the size of a character in bytes */
        memset(bytes + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
724
725static PyObject*
726resize_compact(PyObject *unicode, Py_ssize_t length)
727{
728    Py_ssize_t char_size;
729    Py_ssize_t struct_size;
730    Py_ssize_t new_size;
731    int share_wstr;
732    PyObject *new_unicode;
733#ifdef Py_DEBUG
734    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
735#endif
736
737    assert(unicode_modifiable(unicode));
738    assert(PyUnicode_IS_READY(unicode));
739    assert(PyUnicode_IS_COMPACT(unicode));
740
741    char_size = PyUnicode_KIND(unicode);
742    if (PyUnicode_IS_ASCII(unicode))
743        struct_size = sizeof(PyASCIIObject);
744    else
745        struct_size = sizeof(PyCompactUnicodeObject);
746    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
747
748    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
749        PyErr_NoMemory();
750        return NULL;
751    }
752    new_size = (struct_size + (length + 1) * char_size);
753
754    _Py_DEC_REFTOTAL;
755    _Py_ForgetReference(unicode);
756
757    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
758    if (new_unicode == NULL) {
759        _Py_NewReference(unicode);
760        PyErr_NoMemory();
761        return NULL;
762    }
763    unicode = new_unicode;
764    _Py_NewReference(unicode);
765
766    _PyUnicode_LENGTH(unicode) = length;
767    if (share_wstr) {
768        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
769        if (!PyUnicode_IS_ASCII(unicode))
770            _PyUnicode_WSTR_LENGTH(unicode) = length;
771    }
772    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
773        PyObject_DEL(_PyUnicode_WSTR(unicode));
774        _PyUnicode_WSTR(unicode) = NULL;
775    }
776#ifdef Py_DEBUG
777    unicode_fill_invalid(unicode, old_length);
778#endif
779    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
780                    length, 0);
781    assert(_PyUnicode_CheckConsistency(unicode, 0));
782    return unicode;
783}
784
/* Resize a non-compact string to `length` characters without moving the
   object: only the character data and/or the wstr buffer are reallocated.

   unicode: a non-compact string with a reference count of 1 (asserted).
   length:  the new number of characters.

   Returns 0 on success.  Returns -1 with the error set by PyErr_NoMemory()
   if a size computation would overflow or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* the kind is used as the size of a character in bytes */
        char_size = PyUnicode_KIND(unicode);
        /* remember which cached pointers alias the data: they have to
           follow the data to its new address */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* make sure that (length + 1) * char_size cannot overflow */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* a separately allocated UTF-8 buffer is released */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* write the terminating null character */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* done, unless a separate wstr buffer has to be resized too */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    /* from here on: resize the separately allocated wstr buffer */
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* the object keeps the old pointer until the reallocation succeeded */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
863
864static PyObject*
865resize_copy(PyObject *unicode, Py_ssize_t length)
866{
867    Py_ssize_t copy_length;
868    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
869        PyObject *copy;
870
871        if (PyUnicode_READY(unicode) == -1)
872            return NULL;
873
874        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
875        if (copy == NULL)
876            return NULL;
877
878        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
879        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
880        return copy;
881    }
882    else {
883        PyObject *w;
884
885        w = (PyObject*)_PyUnicode_New(length);
886        if (w == NULL)
887            return NULL;
888        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
889        copy_length = Py_MIN(copy_length, length);
890        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
891                  copy_length * sizeof(wchar_t));
892        return w;
893    }
894}
895
/* We allocate room for one more character to make sure the string is
   U+0000 terminated; some code (e.g. new_identifier) relies on that.

   XXX This allocator could further be enhanced by ensuring that the
   free list never reduces its size below 1.

*/
904
905static PyUnicodeObject *
906_PyUnicode_New(Py_ssize_t length)
907{
908    PyUnicodeObject *unicode;
909    size_t new_size;
910
911    /* Optimization for empty strings */
912    if (length == 0 && unicode_empty != NULL) {
913        Py_INCREF(unicode_empty);
914        return (PyUnicodeObject*)unicode_empty;
915    }
916
917    /* Ensure we won't overflow the size. */
918    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
919        return (PyUnicodeObject *)PyErr_NoMemory();
920    }
921    if (length < 0) {
922        PyErr_SetString(PyExc_SystemError,
923                        "Negative size passed to _PyUnicode_New");
924        return NULL;
925    }
926
927    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
928    if (unicode == NULL)
929        return NULL;
930    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
931
932    _PyUnicode_WSTR_LENGTH(unicode) = length;
933    _PyUnicode_HASH(unicode) = -1;
934    _PyUnicode_STATE(unicode).interned = 0;
935    _PyUnicode_STATE(unicode).kind = 0;
936    _PyUnicode_STATE(unicode).compact = 0;
937    _PyUnicode_STATE(unicode).ready = 0;
938    _PyUnicode_STATE(unicode).ascii = 0;
939    _PyUnicode_DATA_ANY(unicode) = NULL;
940    _PyUnicode_LENGTH(unicode) = 0;
941    _PyUnicode_UTF8(unicode) = NULL;
942    _PyUnicode_UTF8_LENGTH(unicode) = 0;
943
944    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
945    if (!_PyUnicode_WSTR(unicode)) {
946        Py_DECREF(unicode);
947        PyErr_NoMemory();
948        return NULL;
949    }
950
951    /* Initialize the first element to guard against cases where
952     * the caller fails before initializing str -- unicode_resize()
953     * reads str[0], and the Keep-Alive optimization can keep memory
954     * allocated for str alive across a call to unicode_dealloc(unicode).
955     * We don't want unicode_resize to read uninitialized memory in
956     * that case.
957     */
958    _PyUnicode_WSTR(unicode)[0] = 0;
959    _PyUnicode_WSTR(unicode)[length] = 0;
960
961    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
962    return unicode;
963}
964
965static const char*
966unicode_kind_name(PyObject *unicode)
967{
968    /* don't check consistency: unicode_kind_name() is called from
969       _PyUnicode_Dump() */
970    if (!PyUnicode_IS_COMPACT(unicode))
971    {
972        if (!PyUnicode_IS_READY(unicode))
973            return "wstr";
974        switch (PyUnicode_KIND(unicode))
975        {
976        case PyUnicode_1BYTE_KIND:
977            if (PyUnicode_IS_ASCII(unicode))
978                return "legacy ascii";
979            else
980                return "legacy latin1";
981        case PyUnicode_2BYTE_KIND:
982            return "legacy UCS2";
983        case PyUnicode_4BYTE_KIND:
984            return "legacy UCS4";
985        default:
986            return "<legacy invalid kind>";
987        }
988    }
989    assert(PyUnicode_IS_READY(unicode));
990    switch (PyUnicode_KIND(unicode)) {
991    case PyUnicode_1BYTE_KIND:
992        if (PyUnicode_IS_ASCII(unicode))
993            return "ascii";
994        else
995            return "latin1";
996    case PyUnicode_2BYTE_KIND:
997        return "UCS2";
998    case PyUnicode_4BYTE_KIND:
999        return "UCS4";
1000    default:
1001        return "<invalid compact kind>";
1002    }
1003}
1004
1005#ifdef Py_DEBUG
1006/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable form of the PyUnicode_UTF8() macro. */
char *_PyUnicode_utf8(void *unicode){
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
1010
/* Debugger helper: callable form of the _PyUnicode_COMPACT_DATA() macro. */
void *_PyUnicode_compact_data(void *unicode) {
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
1014void *_PyUnicode_data(void *unicode){
1015    printf("obj %p\n", unicode);
1016    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
1017    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
1018    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
1019    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
1020    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
1021    return PyUnicode_DATA(unicode);
1022}
1023
1024void
1025_PyUnicode_Dump(PyObject *op)
1026{
1027    PyASCIIObject *ascii = (PyASCIIObject *)op;
1028    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1029    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1030    void *data;
1031
1032    if (ascii->state.compact)
1033    {
1034        if (ascii->state.ascii)
1035            data = (ascii + 1);
1036        else
1037            data = (compact + 1);
1038    }
1039    else
1040        data = unicode->data.any;
1041    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1042           unicode_kind_name(op), ascii->length);
1043
1044    if (ascii->wstr == data)
1045        printf("shared ");
1046    printf("wstr=%p", ascii->wstr);
1047
1048    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1049        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1050        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1051            printf("shared ");
1052        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1053               compact->utf8, compact->utf8_length);
1054    }
1055    printf(", data=%p\n", data);
1056}
1057#endif
1058
1059PyObject *
1060PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1061{
1062    PyObject *obj;
1063    PyCompactUnicodeObject *unicode;
1064    void *data;
1065    enum PyUnicode_Kind kind;
1066    int is_sharing, is_ascii;
1067    Py_ssize_t char_size;
1068    Py_ssize_t struct_size;
1069
1070    /* Optimization for empty strings */
1071    if (size == 0 && unicode_empty != NULL) {
1072        Py_INCREF(unicode_empty);
1073        return unicode_empty;
1074    }
1075
1076    is_ascii = 0;
1077    is_sharing = 0;
1078    struct_size = sizeof(PyCompactUnicodeObject);
1079    if (maxchar < 128) {
1080        kind = PyUnicode_1BYTE_KIND;
1081        char_size = 1;
1082        is_ascii = 1;
1083        struct_size = sizeof(PyASCIIObject);
1084    }
1085    else if (maxchar < 256) {
1086        kind = PyUnicode_1BYTE_KIND;
1087        char_size = 1;
1088    }
1089    else if (maxchar < 65536) {
1090        kind = PyUnicode_2BYTE_KIND;
1091        char_size = 2;
1092        if (sizeof(wchar_t) == 2)
1093            is_sharing = 1;
1094    }
1095    else {
1096        if (maxchar > MAX_UNICODE) {
1097            PyErr_SetString(PyExc_SystemError,
1098                            "invalid maximum character passed to PyUnicode_New");
1099            return NULL;
1100        }
1101        kind = PyUnicode_4BYTE_KIND;
1102        char_size = 4;
1103        if (sizeof(wchar_t) == 4)
1104            is_sharing = 1;
1105    }
1106
1107    /* Ensure we won't overflow the size. */
1108    if (size < 0) {
1109        PyErr_SetString(PyExc_SystemError,
1110                        "Negative size passed to PyUnicode_New");
1111        return NULL;
1112    }
1113    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1114        return PyErr_NoMemory();
1115
1116    /* Duplicated allocation code from _PyObject_New() instead of a call to
1117     * PyObject_New() so we are able to allocate space for the object and
1118     * it's data buffer.
1119     */
1120    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1121    if (obj == NULL)
1122        return PyErr_NoMemory();
1123    obj = PyObject_INIT(obj, &PyUnicode_Type);
1124    if (obj == NULL)
1125        return NULL;
1126
1127    unicode = (PyCompactUnicodeObject *)obj;
1128    if (is_ascii)
1129        data = ((PyASCIIObject*)obj) + 1;
1130    else
1131        data = unicode + 1;
1132    _PyUnicode_LENGTH(unicode) = size;
1133    _PyUnicode_HASH(unicode) = -1;
1134    _PyUnicode_STATE(unicode).interned = 0;
1135    _PyUnicode_STATE(unicode).kind = kind;
1136    _PyUnicode_STATE(unicode).compact = 1;
1137    _PyUnicode_STATE(unicode).ready = 1;
1138    _PyUnicode_STATE(unicode).ascii = is_ascii;
1139    if (is_ascii) {
1140        ((char*)data)[size] = 0;
1141        _PyUnicode_WSTR(unicode) = NULL;
1142    }
1143    else if (kind == PyUnicode_1BYTE_KIND) {
1144        ((char*)data)[size] = 0;
1145        _PyUnicode_WSTR(unicode) = NULL;
1146        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1147        unicode->utf8 = NULL;
1148        unicode->utf8_length = 0;
1149    }
1150    else {
1151        unicode->utf8 = NULL;
1152        unicode->utf8_length = 0;
1153        if (kind == PyUnicode_2BYTE_KIND)
1154            ((Py_UCS2*)data)[size] = 0;
1155        else /* kind == PyUnicode_4BYTE_KIND */
1156            ((Py_UCS4*)data)[size] = 0;
1157        if (is_sharing) {
1158            _PyUnicode_WSTR_LENGTH(unicode) = size;
1159            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1160        }
1161        else {
1162            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1163            _PyUnicode_WSTR(unicode) = NULL;
1164        }
1165    }
1166#ifdef Py_DEBUG
1167    unicode_fill_invalid((PyObject*)unicode, 0);
1168#endif
1169    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1170    return obj;
1171}
1172
1173#if SIZEOF_WCHAR_T == 2
1174/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1175   will decode surrogate pairs, the other conversions are implemented as macros
1176   for efficiency.
1177
1178   This function assumes that unicode can hold one more code point than wstr
1179   characters for a terminating null character. */
1180static void
1181unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1182                              PyObject *unicode)
1183{
1184    const wchar_t *iter;
1185    Py_UCS4 *ucs4_out;
1186
1187    assert(unicode != NULL);
1188    assert(_PyUnicode_CHECK(unicode));
1189    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1190    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1191
1192    for (iter = begin; iter < end; ) {
1193        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1194                           _PyUnicode_GET_LENGTH(unicode)));
1195        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1196            && (iter+1) < end
1197            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1198        {
1199            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1200            iter += 2;
1201        }
1202        else {
1203            *ucs4_out++ = *iter;
1204            iter++;
1205        }
1206    }
1207    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1208                        _PyUnicode_GET_LENGTH(unicode)));
1209
1210}
1211#endif
1212
1213static int
1214unicode_check_modifiable(PyObject *unicode)
1215{
1216    if (!unicode_modifiable(unicode)) {
1217        PyErr_SetString(PyExc_SystemError,
1218                        "Cannot modify a string currently used");
1219        return -1;
1220    }
1221    return 0;
1222}
1223
1224static int
1225_copy_characters(PyObject *to, Py_ssize_t to_start,
1226                 PyObject *from, Py_ssize_t from_start,
1227                 Py_ssize_t how_many, int check_maxchar)
1228{
1229    unsigned int from_kind, to_kind;
1230    void *from_data, *to_data;
1231
1232    assert(0 <= how_many);
1233    assert(0 <= from_start);
1234    assert(0 <= to_start);
1235    assert(PyUnicode_Check(from));
1236    assert(PyUnicode_IS_READY(from));
1237    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1238
1239    assert(PyUnicode_Check(to));
1240    assert(PyUnicode_IS_READY(to));
1241    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1242
1243    if (how_many == 0)
1244        return 0;
1245
1246    from_kind = PyUnicode_KIND(from);
1247    from_data = PyUnicode_DATA(from);
1248    to_kind = PyUnicode_KIND(to);
1249    to_data = PyUnicode_DATA(to);
1250
1251#ifdef Py_DEBUG
1252    if (!check_maxchar
1253        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1254    {
1255        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1256        Py_UCS4 ch;
1257        Py_ssize_t i;
1258        for (i=0; i < how_many; i++) {
1259            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1260            assert(ch <= to_maxchar);
1261        }
1262    }
1263#endif
1264
1265    if (from_kind == to_kind) {
1266        if (check_maxchar
1267            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1268        {
1269            /* Writing Latin-1 characters into an ASCII string requires to
1270               check that all written characters are pure ASCII */
1271            Py_UCS4 max_char;
1272            max_char = ucs1lib_find_max_char(from_data,
1273                                             (Py_UCS1*)from_data + how_many);
1274            if (max_char >= 128)
1275                return -1;
1276        }
1277        Py_MEMCPY((char*)to_data + to_kind * to_start,
1278                  (char*)from_data + from_kind * from_start,
1279                  to_kind * how_many);
1280    }
1281    else if (from_kind == PyUnicode_1BYTE_KIND
1282             && to_kind == PyUnicode_2BYTE_KIND)
1283    {
1284        _PyUnicode_CONVERT_BYTES(
1285            Py_UCS1, Py_UCS2,
1286            PyUnicode_1BYTE_DATA(from) + from_start,
1287            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1288            PyUnicode_2BYTE_DATA(to) + to_start
1289            );
1290    }
1291    else if (from_kind == PyUnicode_1BYTE_KIND
1292             && to_kind == PyUnicode_4BYTE_KIND)
1293    {
1294        _PyUnicode_CONVERT_BYTES(
1295            Py_UCS1, Py_UCS4,
1296            PyUnicode_1BYTE_DATA(from) + from_start,
1297            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1298            PyUnicode_4BYTE_DATA(to) + to_start
1299            );
1300    }
1301    else if (from_kind == PyUnicode_2BYTE_KIND
1302             && to_kind == PyUnicode_4BYTE_KIND)
1303    {
1304        _PyUnicode_CONVERT_BYTES(
1305            Py_UCS2, Py_UCS4,
1306            PyUnicode_2BYTE_DATA(from) + from_start,
1307            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1308            PyUnicode_4BYTE_DATA(to) + to_start
1309            );
1310    }
1311    else {
1312        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1313
1314        if (!check_maxchar) {
1315            if (from_kind == PyUnicode_2BYTE_KIND
1316                && to_kind == PyUnicode_1BYTE_KIND)
1317            {
1318                _PyUnicode_CONVERT_BYTES(
1319                    Py_UCS2, Py_UCS1,
1320                    PyUnicode_2BYTE_DATA(from) + from_start,
1321                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1322                    PyUnicode_1BYTE_DATA(to) + to_start
1323                    );
1324            }
1325            else if (from_kind == PyUnicode_4BYTE_KIND
1326                     && to_kind == PyUnicode_1BYTE_KIND)
1327            {
1328                _PyUnicode_CONVERT_BYTES(
1329                    Py_UCS4, Py_UCS1,
1330                    PyUnicode_4BYTE_DATA(from) + from_start,
1331                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1332                    PyUnicode_1BYTE_DATA(to) + to_start
1333                    );
1334            }
1335            else if (from_kind == PyUnicode_4BYTE_KIND
1336                     && to_kind == PyUnicode_2BYTE_KIND)
1337            {
1338                _PyUnicode_CONVERT_BYTES(
1339                    Py_UCS4, Py_UCS2,
1340                    PyUnicode_4BYTE_DATA(from) + from_start,
1341                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1342                    PyUnicode_2BYTE_DATA(to) + to_start
1343                    );
1344            }
1345            else {
1346                assert(0);
1347                return -1;
1348            }
1349        }
1350        else {
1351            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1352            Py_UCS4 ch;
1353            Py_ssize_t i;
1354
1355            for (i=0; i < how_many; i++) {
1356                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1357                if (ch > to_maxchar)
1358                    return -1;
1359                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1360            }
1361        }
1362    }
1363    return 0;
1364}
1365
1366void
1367_PyUnicode_FastCopyCharacters(
1368    PyObject *to, Py_ssize_t to_start,
1369    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1370{
1371    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1372}
1373
1374Py_ssize_t
1375PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1376                         PyObject *from, Py_ssize_t from_start,
1377                         Py_ssize_t how_many)
1378{
1379    int err;
1380
1381    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1382        PyErr_BadInternalCall();
1383        return -1;
1384    }
1385
1386    if (PyUnicode_READY(from) == -1)
1387        return -1;
1388    if (PyUnicode_READY(to) == -1)
1389        return -1;
1390
1391    if (from_start < 0) {
1392        PyErr_SetString(PyExc_IndexError, "string index out of range");
1393        return -1;
1394    }
1395    if (to_start < 0) {
1396        PyErr_SetString(PyExc_IndexError, "string index out of range");
1397        return -1;
1398    }
1399    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1400    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1401        PyErr_Format(PyExc_SystemError,
1402                     "Cannot write %zi characters at %zi "
1403                     "in a string of %zi characters",
1404                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1405        return -1;
1406    }
1407
1408    if (how_many == 0)
1409        return 0;
1410
1411    if (unicode_check_modifiable(to))
1412        return -1;
1413
1414    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1415    if (err) {
1416        PyErr_Format(PyExc_SystemError,
1417                     "Cannot copy %s characters "
1418                     "into a string of %s characters",
1419                     unicode_kind_name(from),
1420                     unicode_kind_name(to));
1421        return -1;
1422    }
1423    return how_many;
1424}
1425
1426/* Find the maximum code point and count the number of surrogate pairs so a
1427   correct string length can be computed before converting a string to UCS4.
1428   This function counts single surrogates as a character and not as a pair.
1429
1430   Return 0 on success, or -1 on error. */
1431static int
1432find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1433                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1434{
1435    const wchar_t *iter;
1436    Py_UCS4 ch;
1437
1438    assert(num_surrogates != NULL && maxchar != NULL);
1439    *num_surrogates = 0;
1440    *maxchar = 0;
1441
1442    for (iter = begin; iter < end; ) {
1443#if SIZEOF_WCHAR_T == 2
1444        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1445            && (iter+1) < end
1446            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1447        {
1448            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1449            ++(*num_surrogates);
1450            iter += 2;
1451        }
1452        else
1453#endif
1454        {
1455            ch = *iter;
1456            iter++;
1457        }
1458        if (ch > *maxchar) {
1459            *maxchar = ch;
1460            if (*maxchar > MAX_UNICODE) {
1461                PyErr_Format(PyExc_ValueError,
1462                             "character U+%x is not in range [U+0000; U+10ffff]",
1463                             ch);
1464                return -1;
1465            }
1466        }
1467    }
1468    return 0;
1469}
1470
1471int
1472_PyUnicode_Ready(PyObject *unicode)
1473{
1474    wchar_t *end;
1475    Py_UCS4 maxchar = 0;
1476    Py_ssize_t num_surrogates;
1477#if SIZEOF_WCHAR_T == 2
1478    Py_ssize_t length_wo_surrogates;
1479#endif
1480
1481    /* _PyUnicode_Ready() is only intended for old-style API usage where
1482       strings were created using _PyObject_New() and where no canonical
1483       representation (the str field) has been set yet aka strings
1484       which are not yet ready. */
1485    assert(_PyUnicode_CHECK(unicode));
1486    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1487    assert(_PyUnicode_WSTR(unicode) != NULL);
1488    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1489    assert(_PyUnicode_UTF8(unicode) == NULL);
1490    /* Actually, it should neither be interned nor be anything else: */
1491    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1492
1493    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1494    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1495                                &maxchar, &num_surrogates) == -1)
1496        return -1;
1497
1498    if (maxchar < 256) {
1499        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1500        if (!_PyUnicode_DATA_ANY(unicode)) {
1501            PyErr_NoMemory();
1502            return -1;
1503        }
1504        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1505                                _PyUnicode_WSTR(unicode), end,
1506                                PyUnicode_1BYTE_DATA(unicode));
1507        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1508        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1509        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1510        if (maxchar < 128) {
1511            _PyUnicode_STATE(unicode).ascii = 1;
1512            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1513            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1514        }
1515        else {
1516            _PyUnicode_STATE(unicode).ascii = 0;
1517            _PyUnicode_UTF8(unicode) = NULL;
1518            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1519        }
1520        PyObject_FREE(_PyUnicode_WSTR(unicode));
1521        _PyUnicode_WSTR(unicode) = NULL;
1522        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1523    }
1524    /* In this case we might have to convert down from 4-byte native
1525       wchar_t to 2-byte unicode. */
1526    else if (maxchar < 65536) {
1527        assert(num_surrogates == 0 &&
1528               "FindMaxCharAndNumSurrogatePairs() messed up");
1529
1530#if SIZEOF_WCHAR_T == 2
1531        /* We can share representations and are done. */
1532        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1533        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1534        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1535        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1536        _PyUnicode_UTF8(unicode) = NULL;
1537        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1538#else
1539        /* sizeof(wchar_t) == 4 */
1540        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1541            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1542        if (!_PyUnicode_DATA_ANY(unicode)) {
1543            PyErr_NoMemory();
1544            return -1;
1545        }
1546        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1547                                _PyUnicode_WSTR(unicode), end,
1548                                PyUnicode_2BYTE_DATA(unicode));
1549        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1550        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1551        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1552        _PyUnicode_UTF8(unicode) = NULL;
1553        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1554        PyObject_FREE(_PyUnicode_WSTR(unicode));
1555        _PyUnicode_WSTR(unicode) = NULL;
1556        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1557#endif
1558    }
1559    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1560    else {
1561#if SIZEOF_WCHAR_T == 2
1562        /* in case the native representation is 2-bytes, we need to allocate a
1563           new normalized 4-byte version. */
1564        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1565        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
1566            PyErr_NoMemory();
1567            return -1;
1568        }
1569        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1570        if (!_PyUnicode_DATA_ANY(unicode)) {
1571            PyErr_NoMemory();
1572            return -1;
1573        }
1574        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1575        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1576        _PyUnicode_UTF8(unicode) = NULL;
1577        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1578        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1579        _PyUnicode_STATE(unicode).ready = 1;
1580        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1581        PyObject_FREE(_PyUnicode_WSTR(unicode));
1582        _PyUnicode_WSTR(unicode) = NULL;
1583        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1584#else
1585        assert(num_surrogates == 0);
1586
1587        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1588        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1589        _PyUnicode_UTF8(unicode) = NULL;
1590        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1591        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1592#endif
1593        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1594    }
1595    _PyUnicode_STATE(unicode).ready = 1;
1596    assert(_PyUnicode_CheckConsistency(unicode, 1));
1597    return 0;
1598}
1599
1600static void
1601unicode_dealloc(PyObject *unicode)
1602{
1603    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1604    case SSTATE_NOT_INTERNED:
1605        break;
1606
1607    case SSTATE_INTERNED_MORTAL:
1608        /* revive dead object temporarily for DelItem */
1609        Py_REFCNT(unicode) = 3;
1610        if (PyDict_DelItem(interned, unicode) != 0)
1611            Py_FatalError(
1612                "deletion of interned string failed");
1613        break;
1614
1615    case SSTATE_INTERNED_IMMORTAL:
1616        Py_FatalError("Immortal interned string died.");
1617
1618    default:
1619        Py_FatalError("Inconsistent interned string state.");
1620    }
1621
1622    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1623        PyObject_DEL(_PyUnicode_WSTR(unicode));
1624    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1625        PyObject_DEL(_PyUnicode_UTF8(unicode));
1626    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1627        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1628
1629    Py_TYPE(unicode)->tp_free(unicode);
1630}
1631
1632#ifdef Py_DEBUG
1633static int
1634unicode_is_singleton(PyObject *unicode)
1635{
1636    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1637    if (unicode == unicode_empty)
1638        return 1;
1639    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1640    {
1641        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1642        if (ch < 256 && unicode_latin1[ch] == unicode)
1643            return 1;
1644    }
1645    return 0;
1646}
1647#endif
1648
1649static int
1650unicode_modifiable(PyObject *unicode)
1651{
1652    assert(_PyUnicode_CHECK(unicode));
1653    if (Py_REFCNT(unicode) != 1)
1654        return 0;
1655    if (_PyUnicode_HASH(unicode) != -1)
1656        return 0;
1657    if (PyUnicode_CHECK_INTERNED(unicode))
1658        return 0;
1659    if (!PyUnicode_CheckExact(unicode))
1660        return 0;
1661#ifdef Py_DEBUG
1662    /* singleton refcount is greater than 1 */
1663    assert(!unicode_is_singleton(unicode));
1664#endif
1665    return 1;
1666}
1667
1668static int
1669unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1670{
1671    PyObject *unicode;
1672    Py_ssize_t old_length;
1673
1674    assert(p_unicode != NULL);
1675    unicode = *p_unicode;
1676
1677    assert(unicode != NULL);
1678    assert(PyUnicode_Check(unicode));
1679    assert(0 <= length);
1680
1681    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1682        old_length = PyUnicode_WSTR_LENGTH(unicode);
1683    else
1684        old_length = PyUnicode_GET_LENGTH(unicode);
1685    if (old_length == length)
1686        return 0;
1687
1688    if (length == 0) {
1689        _Py_INCREF_UNICODE_EMPTY();
1690        if (!unicode_empty)
1691            return -1;
1692        Py_DECREF(*p_unicode);
1693        *p_unicode = unicode_empty;
1694        return 0;
1695    }
1696
1697    if (!unicode_modifiable(unicode)) {
1698        PyObject *copy = resize_copy(unicode, length);
1699        if (copy == NULL)
1700            return -1;
1701        Py_DECREF(*p_unicode);
1702        *p_unicode = copy;
1703        return 0;
1704    }
1705
1706    if (PyUnicode_IS_COMPACT(unicode)) {
1707        PyObject *new_unicode = resize_compact(unicode, length);
1708        if (new_unicode == NULL)
1709            return -1;
1710        *p_unicode = new_unicode;
1711        return 0;
1712    }
1713    return resize_inplace(unicode, length);
1714}
1715
1716int
1717PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1718{
1719    PyObject *unicode;
1720    if (p_unicode == NULL) {
1721        PyErr_BadInternalCall();
1722        return -1;
1723    }
1724    unicode = *p_unicode;
1725    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1726    {
1727        PyErr_BadInternalCall();
1728        return -1;
1729    }
1730    return unicode_resize(p_unicode, length);
1731}
1732
1733/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1734
1735   WARNING: The function doesn't copy the terminating null character and
1736   doesn't check the maximum character (may write a latin1 character in an
1737   ASCII string). */
1738static void
1739unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1740                   const char *str, Py_ssize_t len)
1741{
1742    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1743    void *data = PyUnicode_DATA(unicode);
1744    const char *end = str + len;
1745
1746    switch (kind) {
1747    case PyUnicode_1BYTE_KIND: {
1748        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1749#ifdef Py_DEBUG
1750        if (PyUnicode_IS_ASCII(unicode)) {
1751            Py_UCS4 maxchar = ucs1lib_find_max_char(
1752                (const Py_UCS1*)str,
1753                (const Py_UCS1*)str + len);
1754            assert(maxchar < 128);
1755        }
1756#endif
1757        memcpy((char *) data + index, str, len);
1758        break;
1759    }
1760    case PyUnicode_2BYTE_KIND: {
1761        Py_UCS2 *start = (Py_UCS2 *)data + index;
1762        Py_UCS2 *ucs2 = start;
1763        assert(index <= PyUnicode_GET_LENGTH(unicode));
1764
1765        for (; str < end; ++ucs2, ++str)
1766            *ucs2 = (Py_UCS2)*str;
1767
1768        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1769        break;
1770    }
1771    default: {
1772        Py_UCS4 *start = (Py_UCS4 *)data + index;
1773        Py_UCS4 *ucs4 = start;
1774        assert(kind == PyUnicode_4BYTE_KIND);
1775        assert(index <= PyUnicode_GET_LENGTH(unicode));
1776
1777        for (; str < end; ++ucs4, ++str)
1778            *ucs4 = (Py_UCS4)*str;
1779
1780        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1781    }
1782    }
1783}
1784
1785static PyObject*
1786get_latin1_char(unsigned char ch)
1787{
1788    PyObject *unicode = unicode_latin1[ch];
1789    if (!unicode) {
1790        unicode = PyUnicode_New(1, ch);
1791        if (!unicode)
1792            return NULL;
1793        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1794        assert(_PyUnicode_CheckConsistency(unicode, 1));
1795        unicode_latin1[ch] = unicode;
1796    }
1797    Py_INCREF(unicode);
1798    return unicode;
1799}
1800
1801static PyObject*
1802unicode_char(Py_UCS4 ch)
1803{
1804    PyObject *unicode;
1805
1806    assert(ch <= MAX_UNICODE);
1807
1808    if (ch < 256)
1809        return get_latin1_char(ch);
1810
1811    unicode = PyUnicode_New(1, ch);
1812    if (unicode == NULL)
1813        return NULL;
1814    switch (PyUnicode_KIND(unicode)) {
1815    case PyUnicode_1BYTE_KIND:
1816        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1817        break;
1818    case PyUnicode_2BYTE_KIND:
1819        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1820        break;
1821    default:
1822        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1823        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1824    }
1825    assert(_PyUnicode_CheckConsistency(unicode, 1));
1826    return unicode;
1827}
1828
1829PyObject *
1830PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1831{
1832    PyObject *unicode;
1833    Py_UCS4 maxchar = 0;
1834    Py_ssize_t num_surrogates;
1835
1836    if (u == NULL)
1837        return (PyObject*)_PyUnicode_New(size);
1838
1839    /* If the Unicode data is known at construction time, we can apply
1840       some optimizations which share commonly used objects. */
1841
1842    /* Optimization for empty strings */
1843    if (size == 0)
1844        _Py_RETURN_UNICODE_EMPTY();
1845
1846    /* Single character Unicode objects in the Latin-1 range are
1847       shared when using this constructor */
1848    if (size == 1 && (Py_UCS4)*u < 256)
1849        return get_latin1_char((unsigned char)*u);
1850
1851    /* If not empty and not single character, copy the Unicode data
1852       into the new object */
1853    if (find_maxchar_surrogates(u, u + size,
1854                                &maxchar, &num_surrogates) == -1)
1855        return NULL;
1856
1857    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1858    if (!unicode)
1859        return NULL;
1860
1861    switch (PyUnicode_KIND(unicode)) {
1862    case PyUnicode_1BYTE_KIND:
1863        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1864                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1865        break;
1866    case PyUnicode_2BYTE_KIND:
1867#if Py_UNICODE_SIZE == 2
1868        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1869#else
1870        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1871                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1872#endif
1873        break;
1874    case PyUnicode_4BYTE_KIND:
1875#if SIZEOF_WCHAR_T == 2
1876        /* This is the only case which has to process surrogates, thus
1877           a simple copy loop is not enough and we need a function. */
1878        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1879#else
1880        assert(num_surrogates == 0);
1881        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1882#endif
1883        break;
1884    default:
1885        assert(0 && "Impossible state");
1886    }
1887
1888    return unicode_result(unicode);
1889}
1890
1891PyObject *
1892PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1893{
1894    if (size < 0) {
1895        PyErr_SetString(PyExc_SystemError,
1896                        "Negative size passed to PyUnicode_FromStringAndSize");
1897        return NULL;
1898    }
1899    if (u != NULL)
1900        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1901    else
1902        return (PyObject *)_PyUnicode_New(size);
1903}
1904
1905PyObject *
1906PyUnicode_FromString(const char *u)
1907{
1908    size_t size = strlen(u);
1909    if (size > PY_SSIZE_T_MAX) {
1910        PyErr_SetString(PyExc_OverflowError, "input too long");
1911        return NULL;
1912    }
1913    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1914}
1915
1916PyObject *
1917_PyUnicode_FromId(_Py_Identifier *id)
1918{
1919    if (!id->object) {
1920        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1921                                                  strlen(id->string),
1922                                                  NULL, NULL);
1923        if (!id->object)
1924            return NULL;
1925        PyUnicode_InternInPlace(&id->object);
1926        assert(!id->next);
1927        id->next = static_strings;
1928        static_strings = id;
1929    }
1930    return id->object;
1931}
1932
1933void
1934_PyUnicode_ClearStaticStrings()
1935{
1936    _Py_Identifier *tmp, *s = static_strings;
1937    while (s) {
1938        Py_CLEAR(s->object);
1939        tmp = s->next;
1940        s->next = NULL;
1941        s = tmp;
1942    }
1943    static_strings = NULL;
1944}
1945
1946/* Internal function, doesn't check maximum character */
1947
1948PyObject*
1949_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1950{
1951    const unsigned char *s = (const unsigned char *)buffer;
1952    PyObject *unicode;
1953    if (size == 1) {
1954#ifdef Py_DEBUG
1955        assert((unsigned char)s[0] < 128);
1956#endif
1957        return get_latin1_char(s[0]);
1958    }
1959    unicode = PyUnicode_New(size, 127);
1960    if (!unicode)
1961        return NULL;
1962    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1963    assert(_PyUnicode_CheckConsistency(unicode, 1));
1964    return unicode;
1965}
1966
1967static Py_UCS4
1968kind_maxchar_limit(unsigned int kind)
1969{
1970    switch (kind) {
1971    case PyUnicode_1BYTE_KIND:
1972        return 0x80;
1973    case PyUnicode_2BYTE_KIND:
1974        return 0x100;
1975    case PyUnicode_4BYTE_KIND:
1976        return 0x10000;
1977    default:
1978        assert(0 && "invalid kind");
1979        return MAX_UNICODE;
1980    }
1981}
1982
1983Py_LOCAL_INLINE(Py_UCS4)
1984align_maxchar(Py_UCS4 maxchar)
1985{
1986    if (maxchar <= 127)
1987        return 127;
1988    else if (maxchar <= 255)
1989        return 255;
1990    else if (maxchar <= 65535)
1991        return 65535;
1992    else
1993        return MAX_UNICODE;
1994}
1995
1996static PyObject*
1997_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1998{
1999    PyObject *res;
2000    unsigned char max_char;
2001
2002    if (size == 0)
2003        _Py_RETURN_UNICODE_EMPTY();
2004    assert(size > 0);
2005    if (size == 1)
2006        return get_latin1_char(u[0]);
2007
2008    max_char = ucs1lib_find_max_char(u, u + size);
2009    res = PyUnicode_New(size, max_char);
2010    if (!res)
2011        return NULL;
2012    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
2013    assert(_PyUnicode_CheckConsistency(res, 1));
2014    return res;
2015}
2016
2017static PyObject*
2018_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
2019{
2020    PyObject *res;
2021    Py_UCS2 max_char;
2022
2023    if (size == 0)
2024        _Py_RETURN_UNICODE_EMPTY();
2025    assert(size > 0);
2026    if (size == 1)
2027        return unicode_char(u[0]);
2028
2029    max_char = ucs2lib_find_max_char(u, u + size);
2030    res = PyUnicode_New(size, max_char);
2031    if (!res)
2032        return NULL;
2033    if (max_char >= 256)
2034        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2035    else {
2036        _PyUnicode_CONVERT_BYTES(
2037            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2038    }
2039    assert(_PyUnicode_CheckConsistency(res, 1));
2040    return res;
2041}
2042
2043static PyObject*
2044_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2045{
2046    PyObject *res;
2047    Py_UCS4 max_char;
2048
2049    if (size == 0)
2050        _Py_RETURN_UNICODE_EMPTY();
2051    assert(size > 0);
2052    if (size == 1)
2053        return unicode_char(u[0]);
2054
2055    max_char = ucs4lib_find_max_char(u, u + size);
2056    res = PyUnicode_New(size, max_char);
2057    if (!res)
2058        return NULL;
2059    if (max_char < 256)
2060        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2061                                 PyUnicode_1BYTE_DATA(res));
2062    else if (max_char < 0x10000)
2063        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2064                                 PyUnicode_2BYTE_DATA(res));
2065    else
2066        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2067    assert(_PyUnicode_CheckConsistency(res, 1));
2068    return res;
2069}
2070
2071PyObject*
2072PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2073{
2074    if (size < 0) {
2075        PyErr_SetString(PyExc_ValueError, "size must be positive");
2076        return NULL;
2077    }
2078    switch (kind) {
2079    case PyUnicode_1BYTE_KIND:
2080        return _PyUnicode_FromUCS1(buffer, size);
2081    case PyUnicode_2BYTE_KIND:
2082        return _PyUnicode_FromUCS2(buffer, size);
2083    case PyUnicode_4BYTE_KIND:
2084        return _PyUnicode_FromUCS4(buffer, size);
2085    default:
2086        PyErr_SetString(PyExc_SystemError, "invalid kind");
2087        return NULL;
2088    }
2089}
2090
2091Py_UCS4
2092_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2093{
2094    enum PyUnicode_Kind kind;
2095    void *startptr, *endptr;
2096
2097    assert(PyUnicode_IS_READY(unicode));
2098    assert(0 <= start);
2099    assert(end <= PyUnicode_GET_LENGTH(unicode));
2100    assert(start <= end);
2101
2102    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2103        return PyUnicode_MAX_CHAR_VALUE(unicode);
2104
2105    if (start == end)
2106        return 127;
2107
2108    if (PyUnicode_IS_ASCII(unicode))
2109        return 127;
2110
2111    kind = PyUnicode_KIND(unicode);
2112    startptr = PyUnicode_DATA(unicode);
2113    endptr = (char *)startptr + end * kind;
2114    startptr = (char *)startptr + start * kind;
2115    switch(kind) {
2116    case PyUnicode_1BYTE_KIND:
2117        return ucs1lib_find_max_char(startptr, endptr);
2118    case PyUnicode_2BYTE_KIND:
2119        return ucs2lib_find_max_char(startptr, endptr);
2120    case PyUnicode_4BYTE_KIND:
2121        return ucs4lib_find_max_char(startptr, endptr);
2122    default:
2123        assert(0);
2124        return 0;
2125    }
2126}
2127
2128/* Ensure that a string uses the most efficient storage, if it is not the
2129   case: create a new string with of the right kind. Write NULL into *p_unicode
2130   on error. */
2131static void
2132unicode_adjust_maxchar(PyObject **p_unicode)
2133{
2134    PyObject *unicode, *copy;
2135    Py_UCS4 max_char;
2136    Py_ssize_t len;
2137    unsigned int kind;
2138
2139    assert(p_unicode != NULL);
2140    unicode = *p_unicode;
2141    assert(PyUnicode_IS_READY(unicode));
2142    if (PyUnicode_IS_ASCII(unicode))
2143        return;
2144
2145    len = PyUnicode_GET_LENGTH(unicode);
2146    kind = PyUnicode_KIND(unicode);
2147    if (kind == PyUnicode_1BYTE_KIND) {
2148        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2149        max_char = ucs1lib_find_max_char(u, u + len);
2150        if (max_char >= 128)
2151            return;
2152    }
2153    else if (kind == PyUnicode_2BYTE_KIND) {
2154        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2155        max_char = ucs2lib_find_max_char(u, u + len);
2156        if (max_char >= 256)
2157            return;
2158    }
2159    else {
2160        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2161        assert(kind == PyUnicode_4BYTE_KIND);
2162        max_char = ucs4lib_find_max_char(u, u + len);
2163        if (max_char >= 0x10000)
2164            return;
2165    }
2166    copy = PyUnicode_New(len, max_char);
2167    if (copy != NULL)
2168        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2169    Py_DECREF(unicode);
2170    *p_unicode = copy;
2171}
2172
2173PyObject*
2174_PyUnicode_Copy(PyObject *unicode)
2175{
2176    Py_ssize_t length;
2177    PyObject *copy;
2178
2179    if (!PyUnicode_Check(unicode)) {
2180        PyErr_BadInternalCall();
2181        return NULL;
2182    }
2183    if (PyUnicode_READY(unicode) == -1)
2184        return NULL;
2185
2186    length = PyUnicode_GET_LENGTH(unicode);
2187    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2188    if (!copy)
2189        return NULL;
2190    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2191
2192    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2193              length * PyUnicode_KIND(unicode));
2194    assert(_PyUnicode_CheckConsistency(copy, 1));
2195    return copy;
2196}
2197
2198
2199/* Widen Unicode objects to larger buffers. Don't write terminating null
2200   character. Return NULL on error. */
2201
2202void*
2203_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2204{
2205    Py_ssize_t len;
2206    void *result;
2207    unsigned int skind;
2208
2209    if (PyUnicode_READY(s) == -1)
2210        return NULL;
2211
2212    len = PyUnicode_GET_LENGTH(s);
2213    skind = PyUnicode_KIND(s);
2214    if (skind >= kind) {
2215        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2216        return NULL;
2217    }
2218    switch (kind) {
2219    case PyUnicode_2BYTE_KIND:
2220        result = PyMem_New(Py_UCS2, len);
2221        if (!result)
2222            return PyErr_NoMemory();
2223        assert(skind == PyUnicode_1BYTE_KIND);
2224        _PyUnicode_CONVERT_BYTES(
2225            Py_UCS1, Py_UCS2,
2226            PyUnicode_1BYTE_DATA(s),
2227            PyUnicode_1BYTE_DATA(s) + len,
2228            result);
2229        return result;
2230    case PyUnicode_4BYTE_KIND:
2231        result = PyMem_New(Py_UCS4, len);
2232        if (!result)
2233            return PyErr_NoMemory();
2234        if (skind == PyUnicode_2BYTE_KIND) {
2235            _PyUnicode_CONVERT_BYTES(
2236                Py_UCS2, Py_UCS4,
2237                PyUnicode_2BYTE_DATA(s),
2238                PyUnicode_2BYTE_DATA(s) + len,
2239                result);
2240        }
2241        else {
2242            assert(skind == PyUnicode_1BYTE_KIND);
2243            _PyUnicode_CONVERT_BYTES(
2244                Py_UCS1, Py_UCS4,
2245                PyUnicode_1BYTE_DATA(s),
2246                PyUnicode_1BYTE_DATA(s) + len,
2247                result);
2248        }
2249        return result;
2250    default:
2251        break;
2252    }
2253    PyErr_SetString(PyExc_SystemError, "invalid kind");
2254    return NULL;
2255}
2256
2257static Py_UCS4*
2258as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2259        int copy_null)
2260{
2261    int kind;
2262    void *data;
2263    Py_ssize_t len, targetlen;
2264    if (PyUnicode_READY(string) == -1)
2265        return NULL;
2266    kind = PyUnicode_KIND(string);
2267    data = PyUnicode_DATA(string);
2268    len = PyUnicode_GET_LENGTH(string);
2269    targetlen = len;
2270    if (copy_null)
2271        targetlen++;
2272    if (!target) {
2273        target = PyMem_New(Py_UCS4, targetlen);
2274        if (!target) {
2275            PyErr_NoMemory();
2276            return NULL;
2277        }
2278    }
2279    else {
2280        if (targetsize < targetlen) {
2281            PyErr_Format(PyExc_SystemError,
2282                         "string is longer than the buffer");
2283            if (copy_null && 0 < targetsize)
2284                target[0] = 0;
2285            return NULL;
2286        }
2287    }
2288    if (kind == PyUnicode_1BYTE_KIND) {
2289        Py_UCS1 *start = (Py_UCS1 *) data;
2290        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2291    }
2292    else if (kind == PyUnicode_2BYTE_KIND) {
2293        Py_UCS2 *start = (Py_UCS2 *) data;
2294        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2295    }
2296    else {
2297        assert(kind == PyUnicode_4BYTE_KIND);
2298        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2299    }
2300    if (copy_null)
2301        target[len] = 0;
2302    return target;
2303}
2304
2305Py_UCS4*
2306PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2307                 int copy_null)
2308{
2309    if (target == NULL || targetsize < 0) {
2310        PyErr_BadInternalCall();
2311        return NULL;
2312    }
2313    return as_ucs4(string, target, targetsize, copy_null);
2314}
2315
2316Py_UCS4*
2317PyUnicode_AsUCS4Copy(PyObject *string)
2318{
2319    return as_ucs4(string, NULL, 0, 1);
2320}
2321
2322#ifdef HAVE_WCHAR_H
2323
2324PyObject *
2325PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2326{
2327    if (w == NULL) {
2328        if (size == 0)
2329            _Py_RETURN_UNICODE_EMPTY();
2330        PyErr_BadInternalCall();
2331        return NULL;
2332    }
2333
2334    if (size == -1) {
2335        size = wcslen(w);
2336    }
2337
2338    return PyUnicode_FromUnicode(w, size);
2339}
2340
2341#endif /* HAVE_WCHAR_H */
2342
2343/* maximum number of characters required for output of %lld or %p.
2344   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2345   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2346#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2347
2348static int
2349unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2350                             Py_ssize_t width, Py_ssize_t precision)
2351{
2352    Py_ssize_t length, fill, arglen;
2353    Py_UCS4 maxchar;
2354
2355    if (PyUnicode_READY(str) == -1)
2356        return -1;
2357
2358    length = PyUnicode_GET_LENGTH(str);
2359    if ((precision == -1 || precision >= length)
2360        && width <= length)
2361        return _PyUnicodeWriter_WriteStr(writer, str);
2362
2363    if (precision != -1)
2364        length = Py_MIN(precision, length);
2365
2366    arglen = Py_MAX(length, width);
2367    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2368        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2369    else
2370        maxchar = writer->maxchar;
2371
2372    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2373        return -1;
2374
2375    if (width > length) {
2376        fill = width - length;
2377        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2378            return -1;
2379        writer->pos += fill;
2380    }
2381
2382    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2383                                  str, 0, length);
2384    writer->pos += length;
2385    return 0;
2386}
2387
2388static int
2389unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2390                              Py_ssize_t width, Py_ssize_t precision)
2391{
2392    /* UTF-8 */
2393    Py_ssize_t length;
2394    PyObject *unicode;
2395    int res;
2396
2397    length = strlen(str);
2398    if (precision != -1)
2399        length = Py_MIN(length, precision);
2400    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2401    if (unicode == NULL)
2402        return -1;
2403
2404    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2405    Py_DECREF(unicode);
2406    return res;
2407}
2408
/* Handle one conversion specification of a PyUnicode_FromFormatV() format.

   `f` points at the '%' that starts the specification.  The function parses
   the optional '0' flag, the width, the ".precision" part and the 'l', 'll'
   and 'z' size modifiers, fetches the matching argument(s) from *vargs and
   writes the formatted text into `writer`.

   Supported conversions: %c, %i, %d, %u, %x (with optional l/ll/z for
   i, d and u), %p, %s, %U, %V, %S, %R, %A and %%.  For any other character
   the rest of the format string, starting at the '%', is copied to the
   output unchanged.

   Return a pointer to the first format character after the consumed
   specification, or NULL on error (the ValueError/OverflowError cases are
   raised here; other failures come from the called helpers, which are
   presumably setting the exception themselves). */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* remember where the specification starts (the '%'), for the fallback
       that copies an unknown specification verbatim */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* refuse the next digit if width * 10 + digit would overflow */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* same overflow guard as for the width */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* this is the last specification of the format string: turn off the
       writer's overallocation for the remaining writes */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* a single character given by its code point */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is formatted by sprintf() without width or precision;
           both are applied by hand further down.  The argument type read
           from the va_list follows the size modifier parsed above. */
        if (*f == 'u') {
            if (longflag)
                len = sprintf(buffer, "%lu",
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "u",
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "u",
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, "%u",
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            len = sprintf(buffer, "%x", va_arg(*vargs, int));
        }
        else {
            if (longflag)
                len = sprintf(buffer, "%li",
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, "%" PY_FORMAT_LONG_LONG "i",
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, "%" PY_FORMAT_SIZE_T "i",
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, "%i",
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* the precision is a minimum number of characters here: it is
           raised to the length of the formatted number (this also covers
           the "no precision" value -1) */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* pad up to the width with '0' or ' ', depending on the '0' flag */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* pad up to the precision with '0' */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* no prefix at all: shift the text (with its null byte) by two
               characters and insert "0x" in front */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* a string object, written as is */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* two arguments are always consumed: a string object and a C
           string; the C string is only used when the object is NULL */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* result of PyObject_Str(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* result of PyObject_Repr(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* result of PyObject_ASCII(obj) */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* literal percent sign; no argument is consumed */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        /* everything has been consumed: return the terminating null byte */
        f = p+len;
        return f;
    }

    /* skip the conversion character */
    f++;
    return f;
}
2712
2713PyObject *
2714PyUnicode_FromFormatV(const char *format, va_list vargs)
2715{
2716    va_list vargs2;
2717    const char *f;
2718    _PyUnicodeWriter writer;
2719
2720    _PyUnicodeWriter_Init(&writer);
2721    writer.min_length = strlen(format) + 100;
2722    writer.overallocate = 1;
2723
2724    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2725       Copy it to be able to pass a reference to a subfunction. */
2726    Py_VA_COPY(vargs2, vargs);
2727
2728    for (f = format; *f; ) {
2729        if (*f == '%') {
2730            f = unicode_fromformat_arg(&writer, f, &vargs2);
2731            if (f == NULL)
2732                goto fail;
2733        }
2734        else {
2735            const char *p;
2736            Py_ssize_t len;
2737
2738            p = f;
2739            do
2740            {
2741                if ((unsigned char)*p > 127) {
2742                    PyErr_Format(PyExc_ValueError,
2743                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2744                        "string, got a non-ASCII byte: 0x%02x",
2745                        (unsigned char)*p);
2746                    return NULL;
2747                }
2748                p++;
2749            }
2750            while (*p != '\0' && *p != '%');
2751            len = p - f;
2752
2753            if (*p == '\0')
2754                writer.overallocate = 0;
2755
2756            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2757                goto fail;
2758
2759            f = p;
2760        }
2761    }
2762    return _PyUnicodeWriter_Finish(&writer);
2763
2764  fail:
2765    _PyUnicodeWriter_Dealloc(&writer);
2766    return NULL;
2767}
2768
2769PyObject *
2770PyUnicode_FromFormat(const char *format, ...)
2771{
2772    PyObject* ret;
2773    va_list vargs;
2774
2775#ifdef HAVE_STDARG_PROTOTYPES
2776    va_start(vargs, format);
2777#else
2778    va_start(vargs);
2779#endif
2780    ret = PyUnicode_FromFormatV(format, vargs);
2781    va_end(vargs);
2782    return ret;
2783}
2784
2785#ifdef HAVE_WCHAR_H
2786
2787/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2788   convert a Unicode object to a wide character string.
2789
2790   - If w is NULL: return the number of wide characters (including the null
2791     character) required to convert the unicode object. Ignore size argument.
2792
2793   - Otherwise: return the number of wide characters (excluding the null
2794     character) written into w. Write at most size wide characters (including
2795     the null character). */
2796static Py_ssize_t
2797unicode_aswidechar(PyObject *unicode,
2798                   wchar_t *w,
2799                   Py_ssize_t size)
2800{
2801    Py_ssize_t res;
2802    const wchar_t *wstr;
2803
2804    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2805    if (wstr == NULL)
2806        return -1;
2807
2808    if (w != NULL) {
2809        if (size > res)
2810            size = res + 1;
2811        else
2812            res = size;
2813        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2814        return res;
2815    }
2816    else
2817        return res + 1;
2818}
2819
2820Py_ssize_t
2821PyUnicode_AsWideChar(PyObject *unicode,
2822                     wchar_t *w,
2823                     Py_ssize_t size)
2824{
2825    if (unicode == NULL) {
2826        PyErr_BadInternalCall();
2827        return -1;
2828    }
2829    return unicode_aswidechar(unicode, w, size);
2830}
2831
2832wchar_t*
2833PyUnicode_AsWideCharString(PyObject *unicode,
2834                           Py_ssize_t *size)
2835{
2836    wchar_t* buffer;
2837    Py_ssize_t buflen;
2838
2839    if (unicode == NULL) {
2840        PyErr_BadInternalCall();
2841        return NULL;
2842    }
2843
2844    buflen = unicode_aswidechar(unicode, NULL, 0);
2845    if (buflen == -1)
2846        return NULL;
2847    buffer = PyMem_NEW(wchar_t, buflen);
2848    if (buffer == NULL) {
2849        PyErr_NoMemory();
2850        return NULL;
2851    }
2852    buflen = unicode_aswidechar(unicode, buffer, buflen);
2853    if (buflen == -1) {
2854        PyMem_FREE(buffer);
2855        return NULL;
2856    }
2857    if (size != NULL)
2858        *size = buflen;
2859    return buffer;
2860}
2861
2862#endif /* HAVE_WCHAR_H */
2863
2864PyObject *
2865PyUnicode_FromOrdinal(int ordinal)
2866{
2867    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2868        PyErr_SetString(PyExc_ValueError,
2869                        "chr() arg not in range(0x110000)");
2870        return NULL;
2871    }
2872
2873    return unicode_char((Py_UCS4)ordinal);
2874}
2875
2876PyObject *
2877PyUnicode_FromObject(PyObject *obj)
2878{
2879    /* XXX Perhaps we should make this API an alias of
2880       PyObject_Str() instead ?! */
2881    if (PyUnicode_CheckExact(obj)) {
2882        if (PyUnicode_READY(obj) == -1)
2883            return NULL;
2884        Py_INCREF(obj);
2885        return obj;
2886    }
2887    if (PyUnicode_Check(obj)) {
2888        /* For a Unicode subtype that's not a Unicode object,
2889           return a true Unicode object with the same data. */
2890        return _PyUnicode_Copy(obj);
2891    }
2892    PyErr_Format(PyExc_TypeError,
2893                 "Can't convert '%.100s' object to str implicitly",
2894                 Py_TYPE(obj)->tp_name);
2895    return NULL;
2896}
2897
2898PyObject *
2899PyUnicode_FromEncodedObject(PyObject *obj,
2900                            const char *encoding,
2901                            const char *errors)
2902{
2903    Py_buffer buffer;
2904    PyObject *v;
2905
2906    if (obj == NULL) {
2907        PyErr_BadInternalCall();
2908        return NULL;
2909    }
2910
2911    /* Decoding bytes objects is the most common case and should be fast */
2912    if (PyBytes_Check(obj)) {
2913        if (PyBytes_GET_SIZE(obj) == 0)
2914            _Py_RETURN_UNICODE_EMPTY();
2915        v = PyUnicode_Decode(
2916                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2917                encoding, errors);
2918        return v;
2919    }
2920
2921    if (PyUnicode_Check(obj)) {
2922        PyErr_SetString(PyExc_TypeError,
2923                        "decoding str is not supported");
2924        return NULL;
2925    }
2926
2927    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2928    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2929        PyErr_Format(PyExc_TypeError,
2930                     "coercing to str: need a bytes-like object, %.80s found",
2931                     Py_TYPE(obj)->tp_name);
2932        return NULL;
2933    }
2934
2935    if (buffer.len == 0) {
2936        PyBuffer_Release(&buffer);
2937        _Py_RETURN_UNICODE_EMPTY();
2938    }
2939
2940    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2941    PyBuffer_Release(&buffer);
2942    return v;
2943}
2944
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   The result, including its terminating null byte, is written to lower,
   a buffer of lower_len bytes.

   Return 1 on success, 0 on error (encoding is longer than lower_len-1;
   this is always the case when lower_len is 0). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* With an empty buffer there is no room even for the terminator, and
       lower_len - 1 below would wrap around so that the end of the buffer
       could never be detected. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last writable position; it is reserved for the null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2984
2985PyObject *
2986PyUnicode_Decode(const char *s,
2987                 Py_ssize_t size,
2988                 const char *encoding,
2989                 const char *errors)
2990{
2991    PyObject *buffer = NULL, *unicode;
2992    Py_buffer info;
2993    char lower[11];  /* Enough for any encoding shortcut */
2994
2995    /* Shortcuts for common default encodings */
2996    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
2997        if ((strcmp(lower, "utf-8") == 0) ||
2998            (strcmp(lower, "utf8") == 0))
2999            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3000        else if ((strcmp(lower, "latin-1") == 0) ||
3001                 (strcmp(lower, "latin1") == 0) ||
3002                 (strcmp(lower, "iso-8859-1") == 0) ||
3003                 (strcmp(lower, "iso8859-1") == 0))
3004            return PyUnicode_DecodeLatin1(s, size, errors);
3005#ifdef HAVE_MBCS
3006        else if (strcmp(lower, "mbcs") == 0)
3007            return PyUnicode_DecodeMBCS(s, size, errors);
3008#endif
3009        else if (strcmp(lower, "ascii") == 0)
3010            return PyUnicode_DecodeASCII(s, size, errors);
3011        else if (strcmp(lower, "utf-16") == 0)
3012            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3013        else if (strcmp(lower, "utf-32") == 0)
3014            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3015    }
3016
3017    /* Decode via the codec registry */
3018    buffer = NULL;
3019    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3020        goto onError;
3021    buffer = PyMemoryView_FromBuffer(&info);
3022    if (buffer == NULL)
3023        goto onError;
3024    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3025    if (unicode == NULL)
3026        goto onError;
3027    if (!PyUnicode_Check(unicode)) {
3028        PyErr_Format(PyExc_TypeError,
3029                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3030                     "use codecs.decode() to decode to arbitrary types",
3031                     encoding,
3032                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3033        Py_DECREF(unicode);
3034        goto onError;
3035    }
3036    Py_DECREF(buffer);
3037    return unicode_result(unicode);
3038
3039  onError:
3040    Py_XDECREF(buffer);
3041    return NULL;
3042}
3043
3044PyObject *
3045PyUnicode_AsDecodedObject(PyObject *unicode,
3046                          const char *encoding,
3047                          const char *errors)
3048{
3049    PyObject *v;
3050
3051    if (!PyUnicode_Check(unicode)) {
3052        PyErr_BadArgument();
3053        goto onError;
3054    }
3055
3056    if (encoding == NULL)
3057        encoding = PyUnicode_GetDefaultEncoding();
3058
3059    /* Decode via the codec registry */
3060    v = PyCodec_Decode(unicode, encoding, errors);
3061    if (v == NULL)
3062        goto onError;
3063    return unicode_result(v);
3064
3065  onError:
3066    return NULL;
3067}
3068
3069PyObject *
3070PyUnicode_AsDecodedUnicode(PyObject *unicode,
3071                           const char *encoding,
3072                           const char *errors)
3073{
3074    PyObject *v;
3075
3076    if (!PyUnicode_Check(unicode)) {
3077        PyErr_BadArgument();
3078        goto onError;
3079    }
3080
3081    if (encoding == NULL)
3082        encoding = PyUnicode_GetDefaultEncoding();
3083
3084    /* Decode via the codec registry */
3085    v = PyCodec_Decode(unicode, encoding, errors);
3086    if (v == NULL)
3087        goto onError;
3088    if (!PyUnicode_Check(v)) {
3089        PyErr_Format(PyExc_TypeError,
3090                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3091                     "use codecs.decode() to decode to arbitrary types",
3092                     encoding,
3093                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3094        Py_DECREF(v);
3095        goto onError;
3096    }
3097    return unicode_result(v);
3098
3099  onError:
3100    return NULL;
3101}
3102
3103PyObject *
3104PyUnicode_Encode(const Py_UNICODE *s,
3105                 Py_ssize_t size,
3106                 const char *encoding,
3107                 const char *errors)
3108{
3109    PyObject *v, *unicode;
3110
3111    unicode = PyUnicode_FromUnicode(s, size);
3112    if (unicode == NULL)
3113        return NULL;
3114    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3115    Py_DECREF(unicode);
3116    return v;
3117}
3118
3119PyObject *
3120PyUnicode_AsEncodedObject(PyObject *unicode,
3121                          const char *encoding,
3122                          const char *errors)
3123{
3124    PyObject *v;
3125
3126    if (!PyUnicode_Check(unicode)) {
3127        PyErr_BadArgument();
3128        goto onError;
3129    }
3130
3131    if (encoding == NULL)
3132        encoding = PyUnicode_GetDefaultEncoding();
3133
3134    /* Encode via the codec registry */
3135    v = PyCodec_Encode(unicode, encoding, errors);
3136    if (v == NULL)
3137        goto onError;
3138    return v;
3139
3140  onError:
3141    return NULL;
3142}
3143
/* Locate the first wide character of wstr that wcstombs() cannot convert.

   The string is converted one character at a time (a surrogate pair
   counts as one character when wchar_t is 16 bits wide).  Return the
   index, in wchar_t units, of the first character that fails; return 0
   if every character converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character plus the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t chunk[3];
#else
    wchar_t chunk[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

#if SIZEOF_WCHAR_T == 2
    chunk[2] = 0;
#else
    chunk[1] = 0;
#endif
    while (*cursor != L'\0') {
        const wchar_t *char_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of the pair together. */
            chunk[0] = cursor[0];
            chunk[1] = cursor[1];
            cursor += 2;
        }
        else {
            chunk[0] = cursor[0];
            chunk[1] = 0;
            cursor += 1;
        }
#else
        chunk[0] = *cursor;
        cursor += 1;
#endif
        if (wcstombs(scratch, chunk, sizeof(scratch)) == (size_t)-1)
            return char_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3190
3191static int
3192locale_error_handler(const char *errors, int *surrogateescape)
3193{
3194    _Py_error_handler error_handler = get_error_handler(errors);
3195    switch (error_handler)
3196    {
3197    case _Py_ERROR_STRICT:
3198        *surrogateescape = 0;
3199        return 0;
3200    case _Py_ERROR_SURROGATEESCAPE:
3201        *surrogateescape = 1;
3202        return 0;
3203    default:
3204        PyErr_Format(PyExc_ValueError,
3205                     "only 'strict' and 'surrogateescape' error handlers "
3206                     "are supported, not '%s'",
3207                     errors);
3208        return -1;
3209    }
3210}
3211
3212PyObject *
3213PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3214{
3215    Py_ssize_t wlen, wlen2;
3216    wchar_t *wstr;
3217    PyObject *bytes = NULL;
3218    char *errmsg;
3219    PyObject *reason = NULL;
3220    PyObject *exc;
3221    size_t error_pos;
3222    int surrogateescape;
3223
3224    if (locale_error_handler(errors, &surrogateescape) < 0)
3225        return NULL;
3226
3227    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3228    if (wstr == NULL)
3229        return NULL;
3230
3231    wlen2 = wcslen(wstr);
3232    if (wlen2 != wlen) {
3233        PyMem_Free(wstr);
3234        PyErr_SetString(PyExc_ValueError, "embedded null character");
3235        return NULL;
3236    }
3237
3238    if (surrogateescape) {
3239        /* "surrogateescape" error handler */
3240        char *str;
3241
3242        str = Py_EncodeLocale(wstr, &error_pos);
3243        if (str == NULL) {
3244            if (error_pos == (size_t)-1) {
3245                PyErr_NoMemory();
3246                PyMem_Free(wstr);
3247                return NULL;
3248            }
3249            else {
3250                goto encode_error;
3251            }
3252        }
3253        PyMem_Free(wstr);
3254
3255        bytes = PyBytes_FromString(str);
3256        PyMem_Free(str);
3257    }
3258    else {
3259        /* strict mode */
3260        size_t len, len2;
3261
3262        len = wcstombs(NULL, wstr, 0);
3263        if (len == (size_t)-1) {
3264            error_pos = (size_t)-1;
3265            goto encode_error;
3266        }
3267
3268        bytes = PyBytes_FromStringAndSize(NULL, len);
3269        if (bytes == NULL) {
3270            PyMem_Free(wstr);
3271            return NULL;
3272        }
3273
3274        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3275        if (len2 == (size_t)-1 || len2 > len) {
3276            error_pos = (size_t)-1;
3277            goto encode_error;
3278        }
3279        PyMem_Free(wstr);
3280    }
3281    return bytes;
3282
3283encode_error:
3284    errmsg = strerror(errno);
3285    assert(errmsg != NULL);
3286
3287    if (error_pos == (size_t)-1)
3288        error_pos = wcstombs_errorpos(wstr);
3289
3290    PyMem_Free(wstr);
3291    Py_XDECREF(bytes);
3292
3293    if (errmsg != NULL) {
3294        size_t errlen;
3295        wstr = Py_DecodeLocale(errmsg, &errlen);
3296        if (wstr != NULL) {
3297            reason = PyUnicode_FromWideChar(wstr, errlen);
3298            PyMem_RawFree(wstr);
3299        } else
3300            errmsg = NULL;
3301    }
3302    if (errmsg == NULL)
3303        reason = PyUnicode_FromString(
3304            "wcstombs() encountered an unencodable "
3305            "wide character");
3306    if (reason == NULL)
3307        return NULL;
3308
3309    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3310                                "locale", unicode,
3311                                (Py_ssize_t)error_pos,
3312                                (Py_ssize_t)(error_pos+1),
3313                                reason);
3314    Py_DECREF(reason);
3315    if (exc != NULL) {
3316        PyCodec_StrictErrors(exc);
3317        Py_XDECREF(exc);
3318    }
3319    return NULL;
3320}
3321
3322PyObject *
3323PyUnicode_EncodeFSDefault(PyObject *unicode)
3324{
3325#ifdef HAVE_MBCS
3326    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3327#elif defined(__APPLE__)
3328    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3329#else
3330    PyInterpreterState *interp = PyThreadState_GET()->interp;
3331    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3332       cannot use it to encode and decode filenames before it is loaded. Load
3333       the Python codec requires to encode at least its own filename. Use the C
3334       version of the locale codec until the codec registry is initialized and
3335       the Python codec is loaded.
3336
3337       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3338       cannot only rely on it: check also interp->fscodec_initialized for
3339       subinterpreters. */
3340    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3341        return PyUnicode_AsEncodedString(unicode,
3342                                         Py_FileSystemDefaultEncoding,
3343                                         "surrogateescape");
3344    }
3345    else {
3346        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3347    }
3348#endif
3349}
3350
3351PyObject *
3352PyUnicode_AsEncodedString(PyObject *unicode,
3353                          const char *encoding,
3354                          const char *errors)
3355{
3356    PyObject *v;
3357    char lower[11];  /* Enough for any encoding shortcut */
3358
3359    if (!PyUnicode_Check(unicode)) {
3360        PyErr_BadArgument();
3361        return NULL;
3362    }
3363
3364    /* Shortcuts for common default encodings */
3365    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3366        if ((strcmp(lower, "utf-8") == 0) ||
3367            (strcmp(lower, "utf8") == 0))
3368        {
3369            if (errors == NULL || strcmp(errors, "strict") == 0)
3370                return _PyUnicode_AsUTF8String(unicode, NULL);
3371            else
3372                return _PyUnicode_AsUTF8String(unicode, errors);
3373        }
3374        else if ((strcmp(lower, "latin-1") == 0) ||
3375                 (strcmp(lower, "latin1") == 0) ||
3376                 (strcmp(lower, "iso-8859-1") == 0) ||
3377                 (strcmp(lower, "iso8859-1") == 0))
3378            return _PyUnicode_AsLatin1String(unicode, errors);
3379#ifdef HAVE_MBCS
3380        else if (strcmp(lower, "mbcs") == 0)
3381            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3382#endif
3383        else if (strcmp(lower, "ascii") == 0)
3384            return _PyUnicode_AsASCIIString(unicode, errors);
3385    }
3386
3387    /* Encode via the codec registry */
3388    v = _PyCodec_EncodeText(unicode, encoding, errors);
3389    if (v == NULL)
3390        return NULL;
3391
3392    /* The normal path */
3393    if (PyBytes_Check(v))
3394        return v;
3395
3396    /* If the codec returns a buffer, raise a warning and convert to bytes */
3397    if (PyByteArray_Check(v)) {
3398        int error;
3399        PyObject *b;
3400
3401        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3402            "encoder %s returned bytearray instead of bytes; "
3403            "use codecs.encode() to encode to arbitrary types",
3404            encoding);
3405        if (error) {
3406            Py_DECREF(v);
3407            return NULL;
3408        }
3409
3410        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3411        Py_DECREF(v);
3412        return b;
3413    }
3414
3415    PyErr_Format(PyExc_TypeError,
3416                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3417                 "use codecs.encode() to encode to arbitrary types",
3418                 encoding,
3419                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3420    Py_DECREF(v);
3421    return NULL;
3422}
3423
3424PyObject *
3425PyUnicode_AsEncodedUnicode(PyObject *unicode,
3426                           const char *encoding,
3427                           const char *errors)
3428{
3429    PyObject *v;
3430
3431    if (!PyUnicode_Check(unicode)) {
3432        PyErr_BadArgument();
3433        goto onError;
3434    }
3435
3436    if (encoding == NULL)
3437        encoding = PyUnicode_GetDefaultEncoding();
3438
3439    /* Encode via the codec registry */
3440    v = PyCodec_Encode(unicode, encoding, errors);
3441    if (v == NULL)
3442        goto onError;
3443    if (!PyUnicode_Check(v)) {
3444        PyErr_Format(PyExc_TypeError,
3445                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3446                     "use codecs.encode() to encode to arbitrary types",
3447                     encoding,
3448                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3449        Py_DECREF(v);
3450        goto onError;
3451    }
3452    return v;
3453
3454  onError:
3455    return NULL;
3456}
3457
/* Locate the first byte sequence in str (len bytes) that mbrtowc() cannot
   decode.

   Return the byte offset of the first invalid or incomplete multibyte
   sequence.  Return 0 if no such sequence is found, or if mbrtowc() is
   not available (HAVE_MBRTOWC undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    /* Start from the initial conversion state. */
    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3488
3489PyObject*
3490PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3491                              const char *errors)
3492{
3493    wchar_t smallbuf[256];
3494    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3495    wchar_t *wstr;
3496    size_t wlen, wlen2;
3497    PyObject *unicode;
3498    int surrogateescape;
3499    size_t error_pos;
3500    char *errmsg;
3501    PyObject *reason = NULL;   /* initialize to prevent gcc warning */
3502    PyObject *exc;
3503
3504    if (locale_error_handler(errors, &surrogateescape) < 0)
3505        return NULL;
3506
3507    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3508        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3509        return NULL;
3510    }
3511
3512    if (surrogateescape) {
3513        /* "surrogateescape" error handler */
3514        wstr = Py_DecodeLocale(str, &wlen);
3515        if (wstr == NULL) {
3516            if (wlen == (size_t)-1)
3517                PyErr_NoMemory();
3518            else
3519                PyErr_SetFromErrno(PyExc_OSError);
3520            return NULL;
3521        }
3522
3523        unicode = PyUnicode_FromWideChar(wstr, wlen);
3524        PyMem_RawFree(wstr);
3525    }
3526    else {
3527        /* strict mode */
3528#ifndef HAVE_BROKEN_MBSTOWCS
3529        wlen = mbstowcs(NULL, str, 0);
3530#else
3531        wlen = len;
3532#endif
3533        if (wlen == (size_t)-1)
3534            goto decode_error;
3535        if (wlen+1 <= smallbuf_len) {
3536            wstr = smallbuf;
3537        }
3538        else {
3539            wstr = PyMem_New(wchar_t, wlen+1);
3540            if (!wstr)
3541                return PyErr_NoMemory();
3542        }
3543
3544        wlen2 = mbstowcs(wstr, str, wlen+1);
3545        if (wlen2 == (size_t)-1) {
3546            if (wstr != smallbuf)
3547                PyMem_Free(wstr);
3548            goto decode_error;
3549        }
3550#ifdef HAVE_BROKEN_MBSTOWCS
3551        assert(wlen2 == wlen);
3552#endif
3553        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3554        if (wstr != smallbuf)
3555            PyMem_Free(wstr);
3556    }
3557    return unicode;
3558
3559decode_error:
3560    reason = NULL;
3561    errmsg = strerror(errno);
3562    assert(errmsg != NULL);
3563
3564    error_pos = mbstowcs_errorpos(str, len);
3565    if (errmsg != NULL) {
3566        size_t errlen;
3567        wstr = Py_DecodeLocale(errmsg, &errlen);
3568        if (wstr != NULL) {
3569            reason = PyUnicode_FromWideChar(wstr, errlen);
3570            PyMem_RawFree(wstr);
3571        }
3572    }
3573    if (reason == NULL)
3574        reason = PyUnicode_FromString(
3575            "mbstowcs() encountered an invalid multibyte sequence");
3576    if (reason == NULL)
3577        return NULL;
3578
3579    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3580                                "locale", str, len,
3581                                (Py_ssize_t)error_pos,
3582                                (Py_ssize_t)(error_pos+1),
3583                                reason);
3584    Py_DECREF(reason);
3585    if (exc != NULL) {
3586        PyCodec_StrictErrors(exc);
3587        Py_XDECREF(exc);
3588    }
3589    return NULL;
3590}
3591
3592PyObject*
3593PyUnicode_DecodeLocale(const char *str, const char *errors)
3594{
3595    Py_ssize_t size = (Py_ssize_t)strlen(str);
3596    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3597}
3598
3599
3600PyObject*
3601PyUnicode_DecodeFSDefault(const char *s) {
3602    Py_ssize_t size = (Py_ssize_t)strlen(s);
3603    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3604}
3605
3606PyObject*
3607PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3608{
3609#ifdef HAVE_MBCS
3610    return PyUnicode_DecodeMBCS(s, size, NULL);
3611#elif defined(__APPLE__)
3612    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3613#else
3614    PyInterpreterState *interp = PyThreadState_GET()->interp;
3615    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3616       cannot use it to encode and decode filenames before it is loaded. Load
3617       the Python codec requires to encode at least its own filename. Use the C
3618       version of the locale codec until the codec registry is initialized and
3619       the Python codec is loaded.
3620
3621       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3622       cannot only rely on it: check also interp->fscodec_initialized for
3623       subinterpreters. */
3624    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3625        return PyUnicode_Decode(s, size,
3626                                Py_FileSystemDefaultEncoding,
3627                                "surrogateescape");
3628    }
3629    else {
3630        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3631    }
3632#endif
3633}
3634
3635
3636int
3637PyUnicode_FSConverter(PyObject* arg, void* addr)
3638{
3639    PyObject *output = NULL;
3640    Py_ssize_t size;
3641    void *data;
3642    if (arg == NULL) {
3643        Py_DECREF(*(PyObject**)addr);
3644        return 1;
3645    }
3646    if (PyBytes_Check(arg)) {
3647        output = arg;
3648        Py_INCREF(output);
3649    }
3650    else {
3651        arg = PyUnicode_FromObject(arg);
3652        if (!arg)
3653            return 0;
3654        output = PyUnicode_EncodeFSDefault(arg);
3655        Py_DECREF(arg);
3656        if (!output)
3657            return 0;
3658        if (!PyBytes_Check(output)) {
3659            Py_DECREF(output);
3660            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3661            return 0;
3662        }
3663    }
3664    size = PyBytes_GET_SIZE(output);
3665    data = PyBytes_AS_STRING(output);
3666    if ((size_t)size != strlen(data)) {
3667        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3668        Py_DECREF(output);
3669        return 0;
3670    }
3671    *(PyObject**)addr = output;
3672    return Py_CLEANUP_SUPPORTED;
3673}
3674
3675
3676int
3677PyUnicode_FSDecoder(PyObject* arg, void* addr)
3678{
3679    PyObject *output = NULL;
3680    if (arg == NULL) {
3681        Py_DECREF(*(PyObject**)addr);
3682        return 1;
3683    }
3684    if (PyUnicode_Check(arg)) {
3685        if (PyUnicode_READY(arg) == -1)
3686            return 0;
3687        output = arg;
3688        Py_INCREF(output);
3689    }
3690    else {
3691        arg = PyBytes_FromObject(arg);
3692        if (!arg)
3693            return 0;
3694        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3695                                                  PyBytes_GET_SIZE(arg));
3696        Py_DECREF(arg);
3697        if (!output)
3698            return 0;
3699        if (!PyUnicode_Check(output)) {
3700            Py_DECREF(output);
3701            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3702            return 0;
3703        }
3704    }
3705    if (PyUnicode_READY(output) == -1) {
3706        Py_DECREF(output);
3707        return 0;
3708    }
3709    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3710                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3711        PyErr_SetString(PyExc_ValueError, "embedded null character");
3712        Py_DECREF(output);
3713        return 0;
3714    }
3715    *(PyObject**)addr = output;
3716    return Py_CLEANUP_SUPPORTED;
3717}
3718
3719
3720char*
3721PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3722{
3723    PyObject *bytes;
3724
3725    if (!PyUnicode_Check(unicode)) {
3726        PyErr_BadArgument();
3727        return NULL;
3728    }
3729    if (PyUnicode_READY(unicode) == -1)
3730        return NULL;
3731
3732    if (PyUnicode_UTF8(unicode) == NULL) {
3733        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3734        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3735        if (bytes == NULL)
3736            return NULL;
3737        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3738        if (_PyUnicode_UTF8(unicode) == NULL) {
3739            PyErr_NoMemory();
3740            Py_DECREF(bytes);
3741            return NULL;
3742        }
3743        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3744        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3745                  PyBytes_AS_STRING(bytes),
3746                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3747        Py_DECREF(bytes);
3748    }
3749
3750    if (psize)
3751        *psize = PyUnicode_UTF8_LENGTH(unicode);
3752    return PyUnicode_UTF8(unicode);
3753}
3754
3755char*
3756PyUnicode_AsUTF8(PyObject *unicode)
3757{
3758    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3759}
3760
3761Py_UNICODE *
3762PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3763{
3764    const unsigned char *one_byte;
3765#if SIZEOF_WCHAR_T == 4
3766    const Py_UCS2 *two_bytes;
3767#else
3768    const Py_UCS4 *four_bytes;
3769    const Py_UCS4 *ucs4_end;
3770    Py_ssize_t num_surrogates;
3771#endif
3772    wchar_t *w;
3773    wchar_t *wchar_end;
3774
3775    if (!PyUnicode_Check(unicode)) {
3776        PyErr_BadArgument();
3777        return NULL;
3778    }
3779    if (_PyUnicode_WSTR(unicode) == NULL) {
3780        /* Non-ASCII compact unicode object */
3781        assert(_PyUnicode_KIND(unicode) != 0);
3782        assert(PyUnicode_IS_READY(unicode));
3783
3784        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3785#if SIZEOF_WCHAR_T == 2
3786            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3787            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3788            num_surrogates = 0;
3789
3790            for (; four_bytes < ucs4_end; ++four_bytes) {
3791                if (*four_bytes > 0xFFFF)
3792                    ++num_surrogates;
3793            }
3794
3795            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3796                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3797            if (!_PyUnicode_WSTR(unicode)) {
3798                PyErr_NoMemory();
3799                return NULL;
3800            }
3801            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3802
3803            w = _PyUnicode_WSTR(unicode);
3804            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3805            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3806            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3807                if (*four_bytes > 0xFFFF) {
3808                    assert(*four_bytes <= MAX_UNICODE);
3809                    /* encode surrogate pair in this case */
3810                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3811                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3812                }
3813                else
3814                    *w = *four_bytes;
3815
3816                if (w > wchar_end) {
3817                    assert(0 && "Miscalculated string end");
3818                }
3819            }
3820            *w = 0;
3821#else
3822            /* sizeof(wchar_t) == 4 */
3823            Py_FatalError("Impossible unicode object state, wstr and str "
3824                          "should share memory already.");
3825            return NULL;
3826#endif
3827        }
3828        else {
3829            if ((size_t)_PyUnicode_LENGTH(unicode) >
3830                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
3831                PyErr_NoMemory();
3832                return NULL;
3833            }
3834            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3835                                                  (_PyUnicode_LENGTH(unicode) + 1));
3836            if (!_PyUnicode_WSTR(unicode)) {
3837                PyErr_NoMemory();
3838                return NULL;
3839            }
3840            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3841                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3842            w = _PyUnicode_WSTR(unicode);
3843            wchar_end = w + _PyUnicode_LENGTH(unicode);
3844
3845            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3846                one_byte = PyUnicode_1BYTE_DATA(unicode);
3847                for (; w < wchar_end; ++one_byte, ++w)
3848                    *w = *one_byte;
3849                /* null-terminate the wstr */
3850                *w = 0;
3851            }
3852            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3853#if SIZEOF_WCHAR_T == 4
3854                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3855                for (; w < wchar_end; ++two_bytes, ++w)
3856                    *w = *two_bytes;
3857                /* null-terminate the wstr */
3858                *w = 0;
3859#else
3860                /* sizeof(wchar_t) == 2 */
3861                PyObject_FREE(_PyUnicode_WSTR(unicode));
3862                _PyUnicode_WSTR(unicode) = NULL;
3863                Py_FatalError("Impossible unicode object state, wstr "
3864                              "and str should share memory already.");
3865                return NULL;
3866#endif
3867            }
3868            else {
3869                assert(0 && "This should never happen.");
3870            }
3871        }
3872    }
3873    if (size != NULL)
3874        *size = PyUnicode_WSTR_LENGTH(unicode);
3875    return _PyUnicode_WSTR(unicode);
3876}
3877
3878Py_UNICODE *
3879PyUnicode_AsUnicode(PyObject *unicode)
3880{
3881    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3882}
3883
3884
3885Py_ssize_t
3886PyUnicode_GetSize(PyObject *unicode)
3887{
3888    if (!PyUnicode_Check(unicode)) {
3889        PyErr_BadArgument();
3890        goto onError;
3891    }
3892    return PyUnicode_GET_SIZE(unicode);
3893
3894  onError:
3895    return -1;
3896}
3897
3898Py_ssize_t
3899PyUnicode_GetLength(PyObject *unicode)
3900{
3901    if (!PyUnicode_Check(unicode)) {
3902        PyErr_BadArgument();
3903        return -1;
3904    }
3905    if (PyUnicode_READY(unicode) == -1)
3906        return -1;
3907    return PyUnicode_GET_LENGTH(unicode);
3908}
3909
3910Py_UCS4
3911PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3912{
3913    void *data;
3914    int kind;
3915
3916    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3917        PyErr_BadArgument();
3918        return (Py_UCS4)-1;
3919    }
3920    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3921        PyErr_SetString(PyExc_IndexError, "string index out of range");
3922        return (Py_UCS4)-1;
3923    }
3924    data = PyUnicode_DATA(unicode);
3925    kind = PyUnicode_KIND(unicode);
3926    return PyUnicode_READ(kind, data, index);
3927}
3928
3929int
3930PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3931{
3932    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3933        PyErr_BadArgument();
3934        return -1;
3935    }
3936    assert(PyUnicode_IS_READY(unicode));
3937    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3938        PyErr_SetString(PyExc_IndexError, "string index out of range");
3939        return -1;
3940    }
3941    if (unicode_check_modifiable(unicode))
3942        return -1;
3943    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3944        PyErr_SetString(PyExc_ValueError, "character out of range");
3945        return -1;
3946    }
3947    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3948                    index, ch);
3949    return 0;
3950}
3951
/* Return the name of the default encoding.  It is always the constant,
   NUL-terminated string "utf-8"; the caller must not modify or free it. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3957
3958/* create or adjust a UnicodeDecodeError */
3959static void
3960make_decode_exception(PyObject **exceptionObject,
3961                      const char *encoding,
3962                      const char *input, Py_ssize_t length,
3963                      Py_ssize_t startpos, Py_ssize_t endpos,
3964                      const char *reason)
3965{
3966    if (*exceptionObject == NULL) {
3967        *exceptionObject = PyUnicodeDecodeError_Create(
3968            encoding, input, length, startpos, endpos, reason);
3969    }
3970    else {
3971        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3972            goto onError;
3973        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3974            goto onError;
3975        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
3976            goto onError;
3977    }
3978    return;
3979
3980onError:
3981    Py_CLEAR(*exceptionObject);
3982}
3983
3984#ifdef HAVE_MBCS
3985/* error handling callback helper:
3986   build arguments, call the callback and check the arguments,
3987   if no exception occurred, copy the replacement to the output
3988   and adjust various state variables.
3989   return 0 on success, -1 on error
3990*/
3991
3992static int
3993unicode_decode_call_errorhandler_wchar(
3994    const char *errors, PyObject **errorHandler,
3995    const char *encoding, const char *reason,
3996    const char **input, const char **inend, Py_ssize_t *startinpos,
3997    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
3998    PyObject **output, Py_ssize_t *outpos)
3999{
4000    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4001
4002    PyObject *restuple = NULL;
4003    PyObject *repunicode = NULL;
4004    Py_ssize_t outsize;
4005    Py_ssize_t insize;
4006    Py_ssize_t requiredsize;
4007    Py_ssize_t newpos;
4008    PyObject *inputobj = NULL;
4009    wchar_t *repwstr;
4010    Py_ssize_t repwlen;
4011
4012    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4013    outsize = _PyUnicode_WSTR_LENGTH(*output);
4014
4015    if (*errorHandler == NULL) {
4016        *errorHandler = PyCodec_LookupError(errors);
4017        if (*errorHandler == NULL)
4018            goto onError;
4019    }
4020
4021    make_decode_exception(exceptionObject,
4022        encoding,
4023        *input, *inend - *input,
4024        *startinpos, *endinpos,
4025        reason);
4026    if (*exceptionObject == NULL)
4027        goto onError;
4028
4029    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4030    if (restuple == NULL)
4031        goto onError;
4032    if (!PyTuple_Check(restuple)) {
4033        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4034        goto onError;
4035    }
4036    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4037        goto onError;
4038
4039    /* Copy back the bytes variables, which might have been modified by the
4040       callback */
4041    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4042    if (!inputobj)
4043        goto onError;
4044    if (!PyBytes_Check(inputobj)) {
4045        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4046    }
4047    *input = PyBytes_AS_STRING(inputobj);
4048    insize = PyBytes_GET_SIZE(inputobj);
4049    *inend = *input + insize;
4050    /* we can DECREF safely, as the exception has another reference,
4051       so the object won't go away. */
4052    Py_DECREF(inputobj);
4053
4054    if (newpos<0)
4055        newpos = insize+newpos;
4056    if (newpos<0 || newpos>insize) {
4057        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4058        goto onError;
4059    }
4060
4061    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4062    if (repwstr == NULL)
4063        goto onError;
4064    /* need more space? (at least enough for what we
4065       have+the replacement+the rest of the string (starting
4066       at the new input position), so we won't have to check space
4067       when there are no errors in the rest of the string) */
4068    requiredsize = *outpos;
4069    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4070        goto overflow;
4071    requiredsize += repwlen;
4072    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4073        goto overflow;
4074    requiredsize += insize - newpos;
4075    if (requiredsize > outsize) {
4076        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4077            requiredsize = 2*outsize;
4078        if (unicode_resize(output, requiredsize) < 0)
4079            goto onError;
4080    }
4081    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4082    *outpos += repwlen;
4083    *endinpos = newpos;
4084    *inptr = *input + newpos;
4085
4086    /* we made it! */
4087    Py_XDECREF(restuple);
4088    return 0;
4089
4090  overflow:
4091    PyErr_SetString(PyExc_OverflowError,
4092                    "decoded result is too long for a Python string");
4093
4094  onError:
4095    Py_XDECREF(restuple);
4096    return -1;
4097}
4098#endif   /* HAVE_MBCS */
4099
4100static int
4101unicode_decode_call_errorhandler_writer(
4102    const char *errors, PyObject **errorHandler,
4103    const char *encoding, const char *reason,
4104    const char **input, const char **inend, Py_ssize_t *startinpos,
4105    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4106    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4107{
4108    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4109
4110    PyObject *restuple = NULL;
4111    PyObject *repunicode = NULL;
4112    Py_ssize_t insize;
4113    Py_ssize_t newpos;
4114    Py_ssize_t replen;
4115    PyObject *inputobj = NULL;
4116
4117    if (*errorHandler == NULL) {
4118        *errorHandler = PyCodec_LookupError(errors);
4119        if (*errorHandler == NULL)
4120            goto onError;
4121    }
4122
4123    make_decode_exception(exceptionObject,
4124        encoding,
4125        *input, *inend - *input,
4126        *startinpos, *endinpos,
4127        reason);
4128    if (*exceptionObject == NULL)
4129        goto onError;
4130
4131    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4132    if (restuple == NULL)
4133        goto onError;
4134    if (!PyTuple_Check(restuple)) {
4135        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4136        goto onError;
4137    }
4138    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4139        goto onError;
4140
4141    /* Copy back the bytes variables, which might have been modified by the
4142       callback */
4143    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4144    if (!inputobj)
4145        goto onError;
4146    if (!PyBytes_Check(inputobj)) {
4147        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4148    }
4149    *input = PyBytes_AS_STRING(inputobj);
4150    insize = PyBytes_GET_SIZE(inputobj);
4151    *inend = *input + insize;
4152    /* we can DECREF safely, as the exception has another reference,
4153       so the object won't go away. */
4154    Py_DECREF(inputobj);
4155
4156    if (newpos<0)
4157        newpos = insize+newpos;
4158    if (newpos<0 || newpos>insize) {
4159        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4160        goto onError;
4161    }
4162
4163    if (PyUnicode_READY(repunicode) < 0)
4164        goto onError;
4165    replen = PyUnicode_GET_LENGTH(repunicode);
4166    if (replen > 1) {
4167        writer->min_length += replen - 1;
4168        writer->overallocate = 1;
4169        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4170                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4171            goto onError;
4172    }
4173    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4174        goto onError;
4175
4176    *endinpos = newpos;
4177    *inptr = *input + newpos;
4178
4179    /* we made it! */
4180    Py_XDECREF(restuple);
4181    return 0;
4182
4183  onError:
4184    Py_XDECREF(restuple);
4185    return -1;
4186}
4187
4188/* --- UTF-7 Codec -------------------------------------------------------- */
4189
4190/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4191
/* Helper macros for the base-64 alphabet used by UTF-7.  All of them
 * evaluate their argument more than once, so only pass expressions
 * without side effects. */

/* True if c is one of the 64 base-64 characters: A-Z, a-z, 0-9, '+', '/'. */

#define IS_BASE64(c)                        \
    (((c) >= 'A' && (c) <= 'Z')             \
     || ((c) >= 'a' && (c) <= 'z')          \
     || ((c) >= '0' && (c) <= '9')          \
     || (c) == '+'                          \
     || (c) == '/')

/* The 6-bit value (0..63) of base-64 character c.  Only meaningful when
 * IS_BASE64(c) holds; any other input maps to 63. */

#define FROM_BASE64(c)                              \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A'         \
     : ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26  \
     : ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52  \
     : (c) == '+' ? 62                              \
     : 63)

/* The base-64 character that encodes the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, found outside a shift sequence of a
 * UTF-7 string, stands for itself.  Decoding is permissive: every ASCII
 * byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) < 128 && (c) != '+')
4222
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4256
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the character to classify; only 1..127 can ever be direct
 * directO  - non-zero if "Set O" characters are written as themselves
 * directWS - non-zero if whitespace is written as itself
 *
 * All arguments are parenthesized in the expansion so that compound
 * expressions (e.g. `a || b`) are treated as a unit.  c is evaluated
 * several times and must not have side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4268
4269PyObject *
4270PyUnicode_DecodeUTF7(const char *s,
4271                     Py_ssize_t size,
4272                     const char *errors)
4273{
4274    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4275}
4276
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - the UTF-7 encoded input bytes
 * errors   - name of the error handler, passed on to
 *            unicode_decode_call_errorhandler_writer()
 * consumed - if not NULL, receives the number of input bytes decoded; an
 *            input ending inside a shift sequence is then not an error:
 *            *consumed is set to the position of the opening '+' and the
 *            output produced since that point is dropped.
 *
 * Returns the decoded string, or NULL with an exception set on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    /* startinpos is only assigned when a '+' or an invalid byte is seen */
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* inside a base-64 section? */
    Py_ssize_t shiftOutStart;           /* writer.pos when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in the buffer */
    unsigned long base64buffer = 0;     /* pending base-64 bits */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, or 0 */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* re-entered from the end-of-string error path below when the error
           handler moved the read position back into the input */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone high
                               surrogate as is, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting and
               for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* a byte >= 128 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* report input[startinpos:endinpos]; the handler may replace the
           input buffer and the read position (starts, e and s are updated) */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler asked to resume before the end of the input:
               jump back into the decoding loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Characters were written after the shift sequence began and
                   the buffer holds non-ASCII data: build the result from the
                   first shiftOutStart characters instead of finishing the
                   writer.  NOTE(review): presumably this is done so that the
                   dropped characters do not influence the kind of the
                   result -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4473
4474
4475PyObject *
4476_PyUnicode_EncodeUTF7(PyObject *str,
4477                      int base64SetO,
4478                      int base64WhiteSpace,
4479                      const char *errors)
4480{
4481    int kind;
4482    void *data;
4483    Py_ssize_t len;
4484    PyObject *v;
4485    int inShift = 0;
4486    Py_ssize_t i;
4487    unsigned int base64bits = 0;
4488    unsigned long base64buffer = 0;
4489    char * out;
4490    char * start;
4491
4492    if (PyUnicode_READY(str) == -1)
4493        return NULL;
4494    kind = PyUnicode_KIND(str);
4495    data = PyUnicode_DATA(str);
4496    len = PyUnicode_GET_LENGTH(str);
4497
4498    if (len == 0)
4499        return PyBytes_FromStringAndSize(NULL, 0);
4500
4501    /* It might be possible to tighten this worst case */
4502    if (len > PY_SSIZE_T_MAX / 8)
4503        return PyErr_NoMemory();
4504    v = PyBytes_FromStringAndSize(NULL, len * 8);
4505    if (v == NULL)
4506        return NULL;
4507
4508    start = out = PyBytes_AS_STRING(v);
4509    for (i = 0; i < len; ++i) {
4510        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4511
4512        if (inShift) {
4513            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4514                /* shifting out */
4515                if (base64bits) { /* output remaining bits */
4516                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4517                    base64buffer = 0;
4518                    base64bits = 0;
4519                }
4520                inShift = 0;
4521                /* Characters not in the BASE64 set implicitly unshift the sequence
4522                   so no '-' is required, except if the character is itself a '-' */
4523                if (IS_BASE64(ch) || ch == '-') {
4524                    *out++ = '-';
4525                }
4526                *out++ = (char) ch;
4527            }
4528            else {
4529                goto encode_char;
4530            }
4531        }
4532        else { /* not in a shift sequence */
4533            if (ch == '+') {
4534                *out++ = '+';
4535                        *out++ = '-';
4536            }
4537            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4538                *out++ = (char) ch;
4539            }
4540            else {
4541                *out++ = '+';
4542                inShift = 1;
4543                goto encode_char;
4544            }
4545        }
4546        continue;
4547encode_char:
4548        if (ch >= 0x10000) {
4549            assert(ch <= MAX_UNICODE);
4550
4551            /* code first surrogate */
4552            base64bits += 16;
4553            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4554            while (base64bits >= 6) {
4555                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4556                base64bits -= 6;
4557            }
4558            /* prepare second surrogate */
4559            ch = Py_UNICODE_LOW_SURROGATE(ch);
4560        }
4561        base64bits += 16;
4562        base64buffer = (base64buffer << 16) | ch;
4563        while (base64bits >= 6) {
4564            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4565            base64bits -= 6;
4566        }
4567    }
4568    if (base64bits)
4569        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4570    if (inShift)
4571        *out++ = '-';
4572    if (_PyBytes_Resize(&v, out - start) < 0)
4573        return NULL;
4574    return v;
4575}
4576PyObject *
4577PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4578                     Py_ssize_t size,
4579                     int base64SetO,
4580                     int base64WhiteSpace,
4581                     const char *errors)
4582{
4583    PyObject *result;
4584    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4585    if (tmp == NULL)
4586        return NULL;
4587    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4588                                   base64WhiteSpace, errors);
4589    Py_DECREF(tmp);
4590    return result;
4591}
4592
4593#undef IS_BASE64
4594#undef FROM_BASE64
4595#undef TO_BASE64
4596#undef DECODE_DIRECT
4597#undef ENCODE_DIRECT
4598
4599/* --- UTF-8 Codec -------------------------------------------------------- */
4600
4601PyObject *
4602PyUnicode_DecodeUTF8(const char *s,
4603                     Py_ssize_t size,
4604                     const char *errors)
4605{
4606    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4607}
4608
4609#include "stringlib/asciilib.h"
4610#include "stringlib/codecs.h"
4611#include "stringlib/undef.h"
4612
4613#include "stringlib/ucs1lib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
4617#include "stringlib/ucs2lib.h"
4618#include "stringlib/codecs.h"
4619#include "stringlib/undef.h"
4620
4621#include "stringlib/ucs4lib.h"
4622#include "stringlib/codecs.h"
4623#include "stringlib/undef.h"
4624
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of `value` is >= 0x80.  Only 4- and 8-byte longs are supported;
   any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4634
/* Copy the leading run of ASCII bytes (values < 0x80) of [start, end) to
   `dest` and return its length.  Copying stops at the first byte that has
   its high bit set, or at `end`.  `dest` must have room for end - start
   bytes; the word-wise fast path additionally asserts that it is aligned
   to SIZEOF_LONG. */
static Py_ssize_t
ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
{
    const char *p = start;
    /* `end` rounded down to a multiple of SIZEOF_LONG: whole-word reads are
       only done below this address */
    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);

    /*
     * Issue #17237: m68k is a bit different from most architectures in
     * that objects do not use "natural alignment" - for example, int and
     * long are only aligned at 2-byte boundaries.  Therefore the assert()
     * won't work; also, tests have shown that skipping the "optimised
     * version" will even speed up m68k.
     */
#if !defined(__m68k__)
#if SIZEOF_LONG <= SIZEOF_VOID_P
    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
        /* Fast path, see in STRINGLIB(utf8_decode) for
           an explanation. */
        /* Source and destination are both aligned here: check and copy one
           'long' at a time. */
        /* Help allocation (NOTE(review): presumably register allocation) */
        const char *_p = p;
        Py_UCS1 * q = dest;
        while (_p < aligned_end) {
            unsigned long value = *(const unsigned long *) _p;
            if (value & ASCII_CHAR_MASK)
                break;
            *((unsigned long *)q) = value;
            _p += SIZEOF_LONG;
            q += SIZEOF_LONG;
        }
        p = _p;
        /* byte-wise tail: the rest of a word containing a non-ASCII byte,
           or the bytes beyond aligned_end */
        while (p < end) {
            if ((unsigned char)*p & 0x80)
                break;
            *q++ = *p++;
        }
        return p - start;
    }
#endif
#endif
    /* Generic path: only find the end of the ASCII run here (word-wise
       whenever p is aligned), then copy it with a single memcpy(). */
    while (p < end) {
        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
           for an explanation. */
        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
            /* Help allocation (NOTE(review): presumably register allocation) */
            const char *_p = p;
            while (_p < aligned_end) {
                unsigned long value = *(unsigned long *) _p;
                if (value & ASCII_CHAR_MASK)
                    break;
                _p += SIZEOF_LONG;
            }
            p = _p;
            /* everything up to `end` was ASCII; do not read *p past it */
            if (_p == end)
                break;
        }
        if ((unsigned char)*p & 0x80)
            break;
        ++p;
    }
    memcpy(dest, start, p - start);
    return p - start;
}
4698
4699PyObject *
4700PyUnicode_DecodeUTF8Stateful(const char *s,
4701                             Py_ssize_t size,
4702                             const char *errors,
4703                             Py_ssize_t *consumed)
4704{
4705    _PyUnicodeWriter writer;
4706    const char *starts = s;
4707    const char *end = s + size;
4708
4709    Py_ssize_t startinpos;
4710    Py_ssize_t endinpos;
4711    const char *errmsg = "";
4712    PyObject *errorHandler = NULL;
4713    PyObject *exc = NULL;
4714
4715    if (size == 0) {
4716        if (consumed)
4717            *consumed = 0;
4718        _Py_RETURN_UNICODE_EMPTY();
4719    }
4720
4721    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4722    if (size == 1 && (unsigned char)s[0] < 128) {
4723        if (consumed)
4724            *consumed = 1;
4725        return get_latin1_char((unsigned char)s[0]);
4726    }
4727
4728    _PyUnicodeWriter_Init(&writer);
4729    writer.min_length = size;
4730    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4731        goto onError;
4732
4733    writer.pos = ascii_decode(s, end, writer.data);
4734    s += writer.pos;
4735    while (s < end) {
4736        Py_UCS4 ch;
4737        int kind = writer.kind;
4738        if (kind == PyUnicode_1BYTE_KIND) {
4739            if (PyUnicode_IS_ASCII(writer.buffer))
4740                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4741            else
4742                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4743        } else if (kind == PyUnicode_2BYTE_KIND) {
4744            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4745        } else {
4746            assert(kind == PyUnicode_4BYTE_KIND);
4747            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4748        }
4749
4750        switch (ch) {
4751        case 0:
4752            if (s == end || consumed)
4753                goto End;
4754            errmsg = "unexpected end of data";
4755            startinpos = s - starts;
4756            endinpos = end - starts;
4757            break;
4758        case 1:
4759            errmsg = "invalid start byte";
4760            startinpos = s - starts;
4761            endinpos = startinpos + 1;
4762            break;
4763        case 2:
4764        case 3:
4765        case 4:
4766            errmsg = "invalid continuation byte";
4767            startinpos = s - starts;
4768            endinpos = startinpos + ch - 1;
4769            break;
4770        default:
4771            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4772                goto onError;
4773            continue;
4774        }
4775
4776        if (unicode_decode_call_errorhandler_writer(
4777                errors, &errorHandler,
4778                "utf-8", errmsg,
4779                &starts, &end, &startinpos, &endinpos, &exc, &s,
4780                &writer))
4781            goto onError;
4782    }
4783
4784End:
4785    if (consumed)
4786        *consumed = s - starts;
4787
4788    Py_XDECREF(errorHandler);
4789    Py_XDECREF(exc);
4790    return _PyUnicodeWriter_Finish(&writer);
4791
4792onError:
4793    Py_XDECREF(errorHandler);
4794    Py_XDECREF(exc);
4795    _PyUnicodeWriter_Dealloc(&writer);
4796    return NULL;
4797}
4798
4799#ifdef __APPLE__
4800
4801/* Simplified UTF-8 decoder using surrogateescape error handler,
4802   used to decode the command line arguments on Mac OS X.
4803
4804   Return a pointer to a newly allocated wide character string (use
4805   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4806
4807wchar_t*
4808_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4809{
4810    const char *e;
4811    wchar_t *unicode;
4812    Py_ssize_t outpos;
4813
4814    /* Note: size will always be longer than the resulting Unicode
4815       character count */
4816    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < (size + 1))
4817        return NULL;
4818    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4819    if (!unicode)
4820        return NULL;
4821
4822    /* Unpack UTF-8 encoded data */
4823    e = s + size;
4824    outpos = 0;
4825    while (s < e) {
4826        Py_UCS4 ch;
4827#if SIZEOF_WCHAR_T == 4
4828        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4829#else
4830        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4831#endif
4832        if (ch > 0xFF) {
4833#if SIZEOF_WCHAR_T == 4
4834            assert(0);
4835#else
4836            assert(Py_UNICODE_IS_SURROGATE(ch));
4837            /*  compute and append the two surrogates: */
4838            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4839            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4840#endif
4841        }
4842        else {
4843            if (!ch && s == e)
4844                break;
4845            /* surrogateescape */
4846            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4847        }
4848    }
4849    unicode[outpos] = L'\0';
4850    return unicode;
4851}
4852
4853#endif /* __APPLE__ */
4854
4855/* Primary internal function which creates utf8 encoded bytes objects.
4856
4857   Allocation strategy:  if the string is short, convert into a stack buffer
4858   and allocate exactly as much space needed at the end.  Else allocate the
4859   maximum possible needed (4 result bytes per Unicode character), and return
4860   the excess memory at the end.
4861*/
4862PyObject *
4863_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4864{
4865    enum PyUnicode_Kind kind;
4866    void *data;
4867    Py_ssize_t size;
4868
4869    if (!PyUnicode_Check(unicode)) {
4870        PyErr_BadArgument();
4871        return NULL;
4872    }
4873
4874    if (PyUnicode_READY(unicode) == -1)
4875        return NULL;
4876
4877    if (PyUnicode_UTF8(unicode))
4878        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4879                                         PyUnicode_UTF8_LENGTH(unicode));
4880
4881    kind = PyUnicode_KIND(unicode);
4882    data = PyUnicode_DATA(unicode);
4883    size = PyUnicode_GET_LENGTH(unicode);
4884
4885    switch (kind) {
4886    default:
4887        assert(0);
4888    case PyUnicode_1BYTE_KIND:
4889        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4890        assert(!PyUnicode_IS_ASCII(unicode));
4891        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4892    case PyUnicode_2BYTE_KIND:
4893        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4894    case PyUnicode_4BYTE_KIND:
4895        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4896    }
4897}
4898
4899PyObject *
4900PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4901                     Py_ssize_t size,
4902                     const char *errors)
4903{
4904    PyObject *v, *unicode;
4905
4906    unicode = PyUnicode_FromUnicode(s, size);
4907    if (unicode == NULL)
4908        return NULL;
4909    v = _PyUnicode_AsUTF8String(unicode, errors);
4910    Py_DECREF(unicode);
4911    return v;
4912}
4913
4914PyObject *
4915PyUnicode_AsUTF8String(PyObject *unicode)
4916{
4917    return _PyUnicode_AsUTF8String(unicode, NULL);
4918}
4919
4920/* --- UTF-32 Codec ------------------------------------------------------- */
4921
4922PyObject *
4923PyUnicode_DecodeUTF32(const char *s,
4924                      Py_ssize_t size,
4925                      const char *errors,
4926                      int *byteorder)
4927{
4928    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4929}
4930
4931PyObject *
4932PyUnicode_DecodeUTF32Stateful(const char *s,
4933                              Py_ssize_t size,
4934                              const char *errors,
4935                              int *byteorder,
4936                              Py_ssize_t *consumed)
4937{
4938    const char *starts = s;
4939    Py_ssize_t startinpos;
4940    Py_ssize_t endinpos;
4941    _PyUnicodeWriter writer;
4942    const unsigned char *q, *e;
4943    int le, bo = 0;       /* assume native ordering by default */
4944    const char *encoding;
4945    const char *errmsg = "";
4946    PyObject *errorHandler = NULL;
4947    PyObject *exc = NULL;
4948
4949    q = (unsigned char *)s;
4950    e = q + size;
4951
4952    if (byteorder)
4953        bo = *byteorder;
4954
4955    /* Check for BOM marks (U+FEFF) in the input and adjust current
4956       byte order setting accordingly. In native mode, the leading BOM
4957       mark is skipped, in all other modes, it is copied to the output
4958       stream as-is (giving a ZWNBSP character). */
4959    if (bo == 0 && size >= 4) {
4960        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4961        if (bom == 0x0000FEFF) {
4962            bo = -1;
4963            q += 4;
4964        }
4965        else if (bom == 0xFFFE0000) {
4966            bo = 1;
4967            q += 4;
4968        }
4969        if (byteorder)
4970            *byteorder = bo;
4971    }
4972
4973    if (q == e) {
4974        if (consumed)
4975            *consumed = size;
4976        _Py_RETURN_UNICODE_EMPTY();
4977    }
4978
4979#ifdef WORDS_BIGENDIAN
4980    le = bo < 0;
4981#else
4982    le = bo <= 0;
4983#endif
4984    encoding = le ? "utf-32-le" : "utf-32-be";
4985
4986    _PyUnicodeWriter_Init(&writer);
4987    writer.min_length = (e - q + 3) / 4;
4988    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4989        goto onError;
4990
4991    while (1) {
4992        Py_UCS4 ch = 0;
4993        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
4994
4995        if (e - q >= 4) {
4996            enum PyUnicode_Kind kind = writer.kind;
4997            void *data = writer.data;
4998            const unsigned char *last = e - 4;
4999            Py_ssize_t pos = writer.pos;
5000            if (le) {
5001                do {
5002                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5003                    if (ch > maxch)
5004                        break;
5005                    if (kind != PyUnicode_1BYTE_KIND &&
5006                        Py_UNICODE_IS_SURROGATE(ch))
5007                        break;
5008                    PyUnicode_WRITE(kind, data, pos++, ch);
5009                    q += 4;
5010                } while (q <= last);
5011            }
5012            else {
5013                do {
5014                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5015                    if (ch > maxch)
5016                        break;
5017                    if (kind != PyUnicode_1BYTE_KIND &&
5018                        Py_UNICODE_IS_SURROGATE(ch))
5019                        break;
5020                    PyUnicode_WRITE(kind, data, pos++, ch);
5021                    q += 4;
5022                } while (q <= last);
5023            }
5024            writer.pos = pos;
5025        }
5026
5027        if (Py_UNICODE_IS_SURROGATE(ch)) {
5028            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5029            startinpos = ((const char *)q) - starts;
5030            endinpos = startinpos + 4;
5031        }
5032        else if (ch <= maxch) {
5033            if (q == e || consumed)
5034                break;
5035            /* remaining bytes at the end? (size should be divisible by 4) */
5036            errmsg = "truncated data";
5037            startinpos = ((const char *)q) - starts;
5038            endinpos = ((const char *)e) - starts;
5039        }
5040        else {
5041            if (ch < 0x110000) {
5042                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5043                    goto onError;
5044                q += 4;
5045                continue;
5046            }
5047            errmsg = "code point not in range(0x110000)";
5048            startinpos = ((const char *)q) - starts;
5049            endinpos = startinpos + 4;
5050        }
5051
5052        /* The remaining input chars are ignored if the callback
5053           chooses to skip the input */
5054        if (unicode_decode_call_errorhandler_writer(
5055                errors, &errorHandler,
5056                encoding, errmsg,
5057                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5058                &writer))
5059            goto onError;
5060    }
5061
5062    if (consumed)
5063        *consumed = (const char *)q-starts;
5064
5065    Py_XDECREF(errorHandler);
5066    Py_XDECREF(exc);
5067    return _PyUnicodeWriter_Finish(&writer);
5068
5069  onError:
5070    _PyUnicodeWriter_Dealloc(&writer);
5071    Py_XDECREF(errorHandler);
5072    Py_XDECREF(exc);
5073    return NULL;
5074}
5075
5076PyObject *
5077_PyUnicode_EncodeUTF32(PyObject *str,
5078                       const char *errors,
5079                       int byteorder)
5080{
5081    enum PyUnicode_Kind kind;
5082    const void *data;
5083    Py_ssize_t len;
5084    PyObject *v;
5085    PY_UINT32_T *out;
5086#if PY_LITTLE_ENDIAN
5087    int native_ordering = byteorder <= 0;
5088#else
5089    int native_ordering = byteorder >= 0;
5090#endif
5091    const char *encoding;
5092    Py_ssize_t nsize, pos;
5093    PyObject *errorHandler = NULL;
5094    PyObject *exc = NULL;
5095    PyObject *rep = NULL;
5096
5097    if (!PyUnicode_Check(str)) {
5098        PyErr_BadArgument();
5099        return NULL;
5100    }
5101    if (PyUnicode_READY(str) == -1)
5102        return NULL;
5103    kind = PyUnicode_KIND(str);
5104    data = PyUnicode_DATA(str);
5105    len = PyUnicode_GET_LENGTH(str);
5106
5107    if (len > PY_SSIZE_T_MAX / 4 - (byteorder == 0))
5108        return PyErr_NoMemory();
5109    nsize = len + (byteorder == 0);
5110    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5111    if (v == NULL)
5112        return NULL;
5113
5114    /* output buffer is 4-bytes aligned */
5115    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 4));
5116    out = (PY_UINT32_T *)PyBytes_AS_STRING(v);
5117    if (byteorder == 0)
5118        *out++ = 0xFEFF;
5119    if (len == 0)
5120        goto done;
5121
5122    if (byteorder == -1)
5123        encoding = "utf-32-le";
5124    else if (byteorder == 1)
5125        encoding = "utf-32-be";
5126    else
5127        encoding = "utf-32";
5128
5129    if (kind == PyUnicode_1BYTE_KIND) {
5130        ucs1lib_utf32_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5131        goto done;
5132    }
5133
5134    pos = 0;
5135    while (pos < len) {
5136        Py_ssize_t repsize, moreunits;
5137
5138        if (kind == PyUnicode_2BYTE_KIND) {
5139            pos += ucs2lib_utf32_encode((const Py_UCS2 *)data + pos, len - pos,
5140                                        &out, native_ordering);
5141        }
5142        else {
5143            assert(kind == PyUnicode_4BYTE_KIND);
5144            pos += ucs4lib_utf32_encode((const Py_UCS4 *)data + pos, len - pos,
5145                                        &out, native_ordering);
5146        }
5147        if (pos == len)
5148            break;
5149
5150        rep = unicode_encode_call_errorhandler(
5151                errors, &errorHandler,
5152                encoding, "surrogates not allowed",
5153                str, &exc, pos, pos + 1, &pos);
5154        if (!rep)
5155            goto error;
5156
5157        if (PyBytes_Check(rep)) {
5158            repsize = PyBytes_GET_SIZE(rep);
5159            if (repsize & 3) {
5160                raise_encode_exception(&exc, encoding,
5161                                       str, pos - 1, pos,
5162                                       "surrogates not allowed");
5163                goto error;
5164            }
5165            moreunits = repsize / 4;
5166        }
5167        else {
5168            assert(PyUnicode_Check(rep));
5169            if (PyUnicode_READY(rep) < 0)
5170                goto error;
5171            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5172            if (!PyUnicode_IS_ASCII(rep)) {
5173                raise_encode_exception(&exc, encoding,
5174                                       str, pos - 1, pos,
5175                                       "surrogates not allowed");
5176                goto error;
5177            }
5178        }
5179
5180        /* four bytes are reserved for each surrogate */
5181        if (moreunits > 1) {
5182            Py_ssize_t outpos = out - (PY_UINT32_T*) PyBytes_AS_STRING(v);
5183            Py_ssize_t morebytes = 4 * (moreunits - 1);
5184            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5185                /* integer overflow */
5186                PyErr_NoMemory();
5187                goto error;
5188            }
5189            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5190                goto error;
5191            out = (PY_UINT32_T*) PyBytes_AS_STRING(v) + outpos;
5192        }
5193
5194        if (PyBytes_Check(rep)) {
5195            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5196            out += moreunits;
5197        } else /* rep is unicode */ {
5198            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5199            ucs1lib_utf32_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5200                                 &out, native_ordering);
5201        }
5202
5203        Py_CLEAR(rep);
5204    }
5205
5206    /* Cut back to size actually needed. This is necessary for, for example,
5207       encoding of a string containing isolated surrogates and the 'ignore'
5208       handler is used. */
5209    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5210    if (nsize != PyBytes_GET_SIZE(v))
5211      _PyBytes_Resize(&v, nsize);
5212    Py_XDECREF(errorHandler);
5213    Py_XDECREF(exc);
5214  done:
5215    return v;
5216  error:
5217    Py_XDECREF(rep);
5218    Py_XDECREF(errorHandler);
5219    Py_XDECREF(exc);
5220    Py_XDECREF(v);
5221    return NULL;
5222}
5223
5224PyObject *
5225PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5226                      Py_ssize_t size,
5227                      const char *errors,
5228                      int byteorder)
5229{
5230    PyObject *result;
5231    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5232    if (tmp == NULL)
5233        return NULL;
5234    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5235    Py_DECREF(tmp);
5236    return result;
5237}
5238
5239PyObject *
5240PyUnicode_AsUTF32String(PyObject *unicode)
5241{
5242    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5243}
5244
5245/* --- UTF-16 Codec ------------------------------------------------------- */
5246
5247PyObject *
5248PyUnicode_DecodeUTF16(const char *s,
5249                      Py_ssize_t size,
5250                      const char *errors,
5251                      int *byteorder)
5252{
5253    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5254}
5255
5256PyObject *
5257PyUnicode_DecodeUTF16Stateful(const char *s,
5258                              Py_ssize_t size,
5259                              const char *errors,
5260                              int *byteorder,
5261                              Py_ssize_t *consumed)
5262{
5263    const char *starts = s;
5264    Py_ssize_t startinpos;
5265    Py_ssize_t endinpos;
5266    _PyUnicodeWriter writer;
5267    const unsigned char *q, *e;
5268    int bo = 0;       /* assume native ordering by default */
5269    int native_ordering;
5270    const char *errmsg = "";
5271    PyObject *errorHandler = NULL;
5272    PyObject *exc = NULL;
5273    const char *encoding;
5274
5275    q = (unsigned char *)s;
5276    e = q + size;
5277
5278    if (byteorder)
5279        bo = *byteorder;
5280
5281    /* Check for BOM marks (U+FEFF) in the input and adjust current
5282       byte order setting accordingly. In native mode, the leading BOM
5283       mark is skipped, in all other modes, it is copied to the output
5284       stream as-is (giving a ZWNBSP character). */
5285    if (bo == 0 && size >= 2) {
5286        const Py_UCS4 bom = (q[1] << 8) | q[0];
5287        if (bom == 0xFEFF) {
5288            q += 2;
5289            bo = -1;
5290        }
5291        else if (bom == 0xFFFE) {
5292            q += 2;
5293            bo = 1;
5294        }
5295        if (byteorder)
5296            *byteorder = bo;
5297    }
5298
5299    if (q == e) {
5300        if (consumed)
5301            *consumed = size;
5302        _Py_RETURN_UNICODE_EMPTY();
5303    }
5304
5305#if PY_LITTLE_ENDIAN
5306    native_ordering = bo <= 0;
5307    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5308#else
5309    native_ordering = bo >= 0;
5310    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5311#endif
5312
5313    /* Note: size will always be longer than the resulting Unicode
5314       character count */
5315    _PyUnicodeWriter_Init(&writer);
5316    writer.min_length = (e - q + 1) / 2;
5317    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5318        goto onError;
5319
5320    while (1) {
5321        Py_UCS4 ch = 0;
5322        if (e - q >= 2) {
5323            int kind = writer.kind;
5324            if (kind == PyUnicode_1BYTE_KIND) {
5325                if (PyUnicode_IS_ASCII(writer.buffer))
5326                    ch = asciilib_utf16_decode(&q, e,
5327                            (Py_UCS1*)writer.data, &writer.pos,
5328                            native_ordering);
5329                else
5330                    ch = ucs1lib_utf16_decode(&q, e,
5331                            (Py_UCS1*)writer.data, &writer.pos,
5332                            native_ordering);
5333            } else if (kind == PyUnicode_2BYTE_KIND) {
5334                ch = ucs2lib_utf16_decode(&q, e,
5335                        (Py_UCS2*)writer.data, &writer.pos,
5336                        native_ordering);
5337            } else {
5338                assert(kind == PyUnicode_4BYTE_KIND);
5339                ch = ucs4lib_utf16_decode(&q, e,
5340                        (Py_UCS4*)writer.data, &writer.pos,
5341                        native_ordering);
5342            }
5343        }
5344
5345        switch (ch)
5346        {
5347        case 0:
5348            /* remaining byte at the end? (size should be even) */
5349            if (q == e || consumed)
5350                goto End;
5351            errmsg = "truncated data";
5352            startinpos = ((const char *)q) - starts;
5353            endinpos = ((const char *)e) - starts;
5354            break;
5355            /* The remaining input chars are ignored if the callback
5356               chooses to skip the input */
5357        case 1:
5358            q -= 2;
5359            if (consumed)
5360                goto End;
5361            errmsg = "unexpected end of data";
5362            startinpos = ((const char *)q) - starts;
5363            endinpos = ((const char *)e) - starts;
5364            break;
5365        case 2:
5366            errmsg = "illegal encoding";
5367            startinpos = ((const char *)q) - 2 - starts;
5368            endinpos = startinpos + 2;
5369            break;
5370        case 3:
5371            errmsg = "illegal UTF-16 surrogate";
5372            startinpos = ((const char *)q) - 4 - starts;
5373            endinpos = startinpos + 2;
5374            break;
5375        default:
5376            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5377                goto onError;
5378            continue;
5379        }
5380
5381        if (unicode_decode_call_errorhandler_writer(
5382                errors,
5383                &errorHandler,
5384                encoding, errmsg,
5385                &starts,
5386                (const char **)&e,
5387                &startinpos,
5388                &endinpos,
5389                &exc,
5390                (const char **)&q,
5391                &writer))
5392            goto onError;
5393    }
5394
5395End:
5396    if (consumed)
5397        *consumed = (const char *)q-starts;
5398
5399    Py_XDECREF(errorHandler);
5400    Py_XDECREF(exc);
5401    return _PyUnicodeWriter_Finish(&writer);
5402
5403  onError:
5404    _PyUnicodeWriter_Dealloc(&writer);
5405    Py_XDECREF(errorHandler);
5406    Py_XDECREF(exc);
5407    return NULL;
5408}
5409
5410PyObject *
5411_PyUnicode_EncodeUTF16(PyObject *str,
5412                       const char *errors,
5413                       int byteorder)
5414{
5415    enum PyUnicode_Kind kind;
5416    const void *data;
5417    Py_ssize_t len;
5418    PyObject *v;
5419    unsigned short *out;
5420    Py_ssize_t pairs;
5421#if PY_BIG_ENDIAN
5422    int native_ordering = byteorder >= 0;
5423#else
5424    int native_ordering = byteorder <= 0;
5425#endif
5426    const char *encoding;
5427    Py_ssize_t nsize, pos;
5428    PyObject *errorHandler = NULL;
5429    PyObject *exc = NULL;
5430    PyObject *rep = NULL;
5431
5432    if (!PyUnicode_Check(str)) {
5433        PyErr_BadArgument();
5434        return NULL;
5435    }
5436    if (PyUnicode_READY(str) == -1)
5437        return NULL;
5438    kind = PyUnicode_KIND(str);
5439    data = PyUnicode_DATA(str);
5440    len = PyUnicode_GET_LENGTH(str);
5441
5442    pairs = 0;
5443    if (kind == PyUnicode_4BYTE_KIND) {
5444        const Py_UCS4 *in = (const Py_UCS4 *)data;
5445        const Py_UCS4 *end = in + len;
5446        while (in < end)
5447            if (*in++ >= 0x10000)
5448                pairs++;
5449    }
5450    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5451        return PyErr_NoMemory();
5452    nsize = len + pairs + (byteorder == 0);
5453    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5454    if (v == NULL)
5455        return NULL;
5456
5457    /* output buffer is 2-bytes aligned */
5458    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5459    out = (unsigned short *)PyBytes_AS_STRING(v);
5460    if (byteorder == 0)
5461        *out++ = 0xFEFF;
5462    if (len == 0)
5463        goto done;
5464
5465    if (kind == PyUnicode_1BYTE_KIND) {
5466        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5467        goto done;
5468    }
5469
5470    if (byteorder < 0)
5471        encoding = "utf-16-le";
5472    else if (byteorder > 0)
5473        encoding = "utf-16-be";
5474    else
5475        encoding = "utf-16";
5476
5477    pos = 0;
5478    while (pos < len) {
5479        Py_ssize_t repsize, moreunits;
5480
5481        if (kind == PyUnicode_2BYTE_KIND) {
5482            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5483                                        &out, native_ordering);
5484        }
5485        else {
5486            assert(kind == PyUnicode_4BYTE_KIND);
5487            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5488                                        &out, native_ordering);
5489        }
5490        if (pos == len)
5491            break;
5492
5493        rep = unicode_encode_call_errorhandler(
5494                errors, &errorHandler,
5495                encoding, "surrogates not allowed",
5496                str, &exc, pos, pos + 1, &pos);
5497        if (!rep)
5498            goto error;
5499
5500        if (PyBytes_Check(rep)) {
5501            repsize = PyBytes_GET_SIZE(rep);
5502            if (repsize & 1) {
5503                raise_encode_exception(&exc, encoding,
5504                                       str, pos - 1, pos,
5505                                       "surrogates not allowed");
5506                goto error;
5507            }
5508            moreunits = repsize / 2;
5509        }
5510        else {
5511            assert(PyUnicode_Check(rep));
5512            if (PyUnicode_READY(rep) < 0)
5513                goto error;
5514            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5515            if (!PyUnicode_IS_ASCII(rep)) {
5516                raise_encode_exception(&exc, encoding,
5517                                       str, pos - 1, pos,
5518                                       "surrogates not allowed");
5519                goto error;
5520            }
5521        }
5522
5523        /* two bytes are reserved for each surrogate */
5524        if (moreunits > 1) {
5525            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5526            Py_ssize_t morebytes = 2 * (moreunits - 1);
5527            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5528                /* integer overflow */
5529                PyErr_NoMemory();
5530                goto error;
5531            }
5532            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5533                goto error;
5534            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5535        }
5536
5537        if (PyBytes_Check(rep)) {
5538            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5539            out += moreunits;
5540        } else /* rep is unicode */ {
5541            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5542            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5543                                 &out, native_ordering);
5544        }
5545
5546        Py_CLEAR(rep);
5547    }
5548
5549    /* Cut back to size actually needed. This is necessary for, for example,
5550    encoding of a string containing isolated surrogates and the 'ignore' handler
5551    is used. */
5552    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5553    if (nsize != PyBytes_GET_SIZE(v))
5554      _PyBytes_Resize(&v, nsize);
5555    Py_XDECREF(errorHandler);
5556    Py_XDECREF(exc);
5557  done:
5558    return v;
5559  error:
5560    Py_XDECREF(rep);
5561    Py_XDECREF(errorHandler);
5562    Py_XDECREF(exc);
5563    Py_XDECREF(v);
5564    return NULL;
5565#undef STORECHAR
5566}
5567
5568PyObject *
5569PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5570                      Py_ssize_t size,
5571                      const char *errors,
5572                      int byteorder)
5573{
5574    PyObject *result;
5575    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5576    if (tmp == NULL)
5577        return NULL;
5578    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5579    Py_DECREF(tmp);
5580    return result;
5581}
5582
5583PyObject *
5584PyUnicode_AsUTF16String(PyObject *unicode)
5585{
5586    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5587}
5588
5589/* --- Unicode Escape Codec ----------------------------------------------- */
5590
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped string stays within ASCII once its escapes are resolved.

   Returns the number of characters needed to hold the result when every
   byte is ASCII and every backslash escape is one of the simple ones
   counted below.  Returns -1 when `size` is negative, when a non-ASCII
   byte is present, when a backslash is the last byte, or when an escape
   is found whose value is not guaranteed to be ASCII (octal digits, \x,
   \u, \U, \N).
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before doing any pointer arithmetic with it. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + \n result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escape: exactly one character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
                break;
            }
        }
    }
    return length;
}
5645
/* File-local pointer to a _PyUnicode_Name_CAPI structure; NULL until set.
   NOTE(review): presumably filled in on first use by the named-character
   escape handling in the decoder that follows -- confirm there. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5647
5648PyObject *
5649PyUnicode_DecodeUnicodeEscape(const char *s,
5650                              Py_ssize_t size,
5651                              const char *errors)
5652{
5653    const char *starts = s;
5654    Py_ssize_t startinpos;
5655    Py_ssize_t endinpos;
5656    _PyUnicodeWriter writer;
5657    const char *end;
5658    char* message;
5659    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5660    PyObject *errorHandler = NULL;
5661    PyObject *exc = NULL;
5662    Py_ssize_t len;
5663
5664    len = length_of_escaped_ascii_string(s, size);
5665    if (len == 0)
5666        _Py_RETURN_UNICODE_EMPTY();
5667
5668    /* After length_of_escaped_ascii_string() there are two alternatives,
5669       either the string is pure ASCII with named escapes like \n, etc.
5670       and we determined it's exact size (common case)
5671       or it contains \x, \u, ... escape sequences.  then we create a
5672       legacy wchar string and resize it at the end of this function. */
5673    _PyUnicodeWriter_Init(&writer);
5674    if (len > 0) {
5675        writer.min_length = len;
5676    }
5677    else {
5678        /* Escaped strings will always be longer than the resulting
5679           Unicode string, so we start with size here and then reduce the
5680           length after conversion to the true value.
5681           (but if the error callback returns a long replacement string
5682           we'll have to allocate more space) */
5683        writer.min_length = size;
5684    }
5685
5686    if (size == 0)
5687        return _PyUnicodeWriter_Finish(&writer);
5688    end = s + size;
5689
5690    while (s < end) {
5691        unsigned char c;
5692        Py_UCS4 x;
5693        int digits;
5694
5695        /* Non-escape characters are interpreted as Unicode ordinals */
5696        if (*s != '\\') {
5697            x = (unsigned char)*s;
5698            s++;
5699            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5700                goto onError;
5701            continue;
5702        }
5703
5704        startinpos = s-starts;
5705        /* \ - Escapes */
5706        s++;
5707        c = *s++;
5708        if (s > end)
5709            c = '\0'; /* Invalid after \ */
5710
5711        switch (c) {
5712
5713            /* \x escapes */
5714#define WRITECHAR(ch)                                                      \
5715            do {                                                           \
5716                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5717                    goto onError;                                          \
5718            } while(0)
5719
5720        case '\n': break;
5721        case '\\': WRITECHAR('\\'); break;
5722        case '\'': WRITECHAR('\''); break;
5723        case '\"': WRITECHAR('\"'); break;
5724        case 'b': WRITECHAR('\b'); break;
5725        /* FF */
5726        case 'f': WRITECHAR('\014'); break;
5727        case 't': WRITECHAR('\t'); break;
5728        case 'n': WRITECHAR('\n'); break;
5729        case 'r': WRITECHAR('\r'); break;
5730        /* VT */
5731        case 'v': WRITECHAR('\013'); break;
5732        /* BEL, not classic C */
5733        case 'a': WRITECHAR('\007'); break;
5734
5735            /* \OOO (octal) escapes */
5736        case '0': case '1': case '2': case '3':
5737        case '4': case '5': case '6': case '7':
5738            x = s[-1] - '0';
5739            if (s < end && '0' <= *s && *s <= '7') {
5740                x = (x<<3) + *s++ - '0';
5741                if (s < end && '0' <= *s && *s <= '7')
5742                    x = (x<<3) + *s++ - '0';
5743            }
5744            WRITECHAR(x);
5745            break;
5746
5747            /* hex escapes */
5748            /* \xXX */
5749        case 'x':
5750            digits = 2;
5751            message = "truncated \\xXX escape";
5752            goto hexescape;
5753
5754            /* \uXXXX */
5755        case 'u':
5756            digits = 4;
5757            message = "truncated \\uXXXX escape";
5758            goto hexescape;
5759
5760            /* \UXXXXXXXX */
5761        case 'U':
5762            digits = 8;
5763            message = "truncated \\UXXXXXXXX escape";
5764        hexescape:
5765            chr = 0;
5766            if (end - s < digits) {
5767                /* count only hex digits */
5768                for (; s < end; ++s) {
5769                    c = (unsigned char)*s;
5770                    if (!Py_ISXDIGIT(c))
5771                        goto error;
5772                }
5773                goto error;
5774            }
5775            for (; digits--; ++s) {
5776                c = (unsigned char)*s;
5777                if (!Py_ISXDIGIT(c))
5778                    goto error;
5779                chr = (chr<<4) & ~0xF;
5780                if (c >= '0' && c <= '9')
5781                    chr += c - '0';
5782                else if (c >= 'a' && c <= 'f')
5783                    chr += 10 + c - 'a';
5784                else
5785                    chr += 10 + c - 'A';
5786            }
5787            if (chr == 0xffffffff && PyErr_Occurred())
5788                /* _decoding_error will have already written into the
5789                   target buffer. */
5790                break;
5791        store:
5792            /* when we get here, chr is a 32-bit unicode character */
5793            message = "illegal Unicode character";
5794            if (chr > MAX_UNICODE)
5795                goto error;
5796            WRITECHAR(chr);
5797            break;
5798
5799            /* \N{name} */
5800        case 'N':
5801            message = "malformed \\N character escape";
5802            if (ucnhash_CAPI == NULL) {
5803                /* load the unicode data module */
5804                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5805                                                PyUnicodeData_CAPSULE_NAME, 1);
5806                if (ucnhash_CAPI == NULL)
5807                    goto ucnhashError;
5808            }
5809            if (*s == '{') {
5810                const char *start = s+1;
5811                /* look for the closing brace */
5812                while (*s != '}' && s < end)
5813                    s++;
5814                if (s > start && s < end && *s == '}') {
5815                    /* found a name.  look it up in the unicode database */
5816                    message = "unknown Unicode character name";
5817                    s++;
5818                    if (s - start - 1 <= INT_MAX &&
5819                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5820                                              &chr, 0))
5821                        goto store;
5822                }
5823            }
5824            goto error;
5825
5826        default:
5827            if (s > end) {
5828                message = "\\ at end of string";
5829                s--;
5830                goto error;
5831            }
5832            else {
5833                WRITECHAR('\\');
5834                WRITECHAR((unsigned char)s[-1]);
5835            }
5836            break;
5837        }
5838        continue;
5839
5840      error:
5841        endinpos = s-starts;
5842        if (unicode_decode_call_errorhandler_writer(
5843                errors, &errorHandler,
5844                "unicodeescape", message,
5845                &starts, &end, &startinpos, &endinpos, &exc, &s,
5846                &writer))
5847            goto onError;
5848        continue;
5849    }
5850#undef WRITECHAR
5851
5852    Py_XDECREF(errorHandler);
5853    Py_XDECREF(exc);
5854    return _PyUnicodeWriter_Finish(&writer);
5855
5856  ucnhashError:
5857    PyErr_SetString(
5858        PyExc_UnicodeError,
5859        "\\N escapes not supported (can't load unicodedata module)"
5860        );
5861    _PyUnicodeWriter_Dealloc(&writer);
5862    Py_XDECREF(errorHandler);
5863    Py_XDECREF(exc);
5864    return NULL;
5865
5866  onError:
5867    _PyUnicodeWriter_Dealloc(&writer);
5868    Py_XDECREF(errorHandler);
5869    Py_XDECREF(exc);
5870    return NULL;
5871}
5872
5873/* Return a Unicode-Escape string version of the Unicode object.
5874
5875   If quotes is true, the string is enclosed in u"" or u'' quotes as
5876   appropriate.
5877
5878*/
5879
5880PyObject *
5881PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5882{
5883    Py_ssize_t i, len;
5884    PyObject *repr;
5885    char *p;
5886    int kind;
5887    void *data;
5888    Py_ssize_t expandsize = 0;
5889
5890    /* Initial allocation is based on the longest-possible character
5891       escape.
5892
5893       For UCS1 strings it's '\xxx', 4 bytes per source character.
5894       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5895       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5896    */
5897
5898    if (!PyUnicode_Check(unicode)) {
5899        PyErr_BadArgument();
5900        return NULL;
5901    }
5902    if (PyUnicode_READY(unicode) == -1)
5903        return NULL;
5904    len = PyUnicode_GET_LENGTH(unicode);
5905    kind = PyUnicode_KIND(unicode);
5906    data = PyUnicode_DATA(unicode);
5907    switch (kind) {
5908    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5909    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5910    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5911    }
5912
5913    if (len == 0)
5914        return PyBytes_FromStringAndSize(NULL, 0);
5915
5916    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5917        return PyErr_NoMemory();
5918
5919    repr = PyBytes_FromStringAndSize(NULL,
5920                                     2
5921                                     + expandsize*len
5922                                     + 1);
5923    if (repr == NULL)
5924        return NULL;
5925
5926    p = PyBytes_AS_STRING(repr);
5927
5928    for (i = 0; i < len; i++) {
5929        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5930
5931        /* Escape backslashes */
5932        if (ch == '\\') {
5933            *p++ = '\\';
5934            *p++ = (char) ch;
5935            continue;
5936        }
5937
5938        /* Map 21-bit characters to '\U00xxxxxx' */
5939        else if (ch >= 0x10000) {
5940            assert(ch <= MAX_UNICODE);
5941            *p++ = '\\';
5942            *p++ = 'U';
5943            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5944            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5945            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5946            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5947            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5948            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5949            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5950            *p++ = Py_hexdigits[ch & 0x0000000F];
5951            continue;
5952        }
5953
5954        /* Map 16-bit characters to '\uxxxx' */
5955        if (ch >= 256) {
5956            *p++ = '\\';
5957            *p++ = 'u';
5958            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5959            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5960            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5961            *p++ = Py_hexdigits[ch & 0x000F];
5962        }
5963
5964        /* Map special whitespace to '\t', \n', '\r' */
5965        else if (ch == '\t') {
5966            *p++ = '\\';
5967            *p++ = 't';
5968        }
5969        else if (ch == '\n') {
5970            *p++ = '\\';
5971            *p++ = 'n';
5972        }
5973        else if (ch == '\r') {
5974            *p++ = '\\';
5975            *p++ = 'r';
5976        }
5977
5978        /* Map non-printable US ASCII to '\xhh' */
5979        else if (ch < ' ' || ch >= 0x7F) {
5980            *p++ = '\\';
5981            *p++ = 'x';
5982            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5983            *p++ = Py_hexdigits[ch & 0x000F];
5984        }
5985
5986        /* Copy everything else as-is */
5987        else
5988            *p++ = (char) ch;
5989    }
5990
5991    assert(p - PyBytes_AS_STRING(repr) > 0);
5992    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5993        return NULL;
5994    return repr;
5995}
5996
5997PyObject *
5998PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5999                              Py_ssize_t size)
6000{
6001    PyObject *result;
6002    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6003    if (tmp == NULL)
6004        return NULL;
6005    result = PyUnicode_AsUnicodeEscapeString(tmp);
6006    Py_DECREF(tmp);
6007    return result;
6008}
6009
6010/* --- Raw Unicode Escape Codec ------------------------------------------- */
6011
6012PyObject *
6013PyUnicode_DecodeRawUnicodeEscape(const char *s,
6014                                 Py_ssize_t size,
6015                                 const char *errors)
6016{
6017    const char *starts = s;
6018    Py_ssize_t startinpos;
6019    Py_ssize_t endinpos;
6020    _PyUnicodeWriter writer;
6021    const char *end;
6022    const char *bs;
6023    PyObject *errorHandler = NULL;
6024    PyObject *exc = NULL;
6025
6026    if (size == 0)
6027        _Py_RETURN_UNICODE_EMPTY();
6028
6029    /* Escaped strings will always be longer than the resulting
6030       Unicode string, so we start with size here and then reduce the
6031       length after conversion to the true value. (But decoding error
6032       handler might have to resize the string) */
6033    _PyUnicodeWriter_Init(&writer);
6034    writer.min_length = size;
6035
6036    end = s + size;
6037    while (s < end) {
6038        unsigned char c;
6039        Py_UCS4 x;
6040        int i;
6041        int count;
6042
6043        /* Non-escape characters are interpreted as Unicode ordinals */
6044        if (*s != '\\') {
6045            x = (unsigned char)*s++;
6046            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6047                goto onError;
6048            continue;
6049        }
6050        startinpos = s-starts;
6051
6052        /* \u-escapes are only interpreted iff the number of leading
6053           backslashes if odd */
6054        bs = s;
6055        for (;s < end;) {
6056            if (*s != '\\')
6057                break;
6058            x = (unsigned char)*s++;
6059            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6060                goto onError;
6061        }
6062        if (((s - bs) & 1) == 0 ||
6063            s >= end ||
6064            (*s != 'u' && *s != 'U')) {
6065            continue;
6066        }
6067        writer.pos--;
6068        count = *s=='u' ? 4 : 8;
6069        s++;
6070
6071        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6072        for (x = 0, i = 0; i < count; ++i, ++s) {
6073            c = (unsigned char)*s;
6074            if (!Py_ISXDIGIT(c)) {
6075                endinpos = s-starts;
6076                if (unicode_decode_call_errorhandler_writer(
6077                        errors, &errorHandler,
6078                        "rawunicodeescape", "truncated \\uXXXX",
6079                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6080                        &writer))
6081                    goto onError;
6082                goto nextByte;
6083            }
6084            x = (x<<4) & ~0xF;
6085            if (c >= '0' && c <= '9')
6086                x += c - '0';
6087            else if (c >= 'a' && c <= 'f')
6088                x += 10 + c - 'a';
6089            else
6090                x += 10 + c - 'A';
6091        }
6092        if (x <= MAX_UNICODE) {
6093            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6094                goto onError;
6095        }
6096        else {
6097            endinpos = s-starts;
6098            if (unicode_decode_call_errorhandler_writer(
6099                    errors, &errorHandler,
6100                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6101                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6102                    &writer))
6103                goto onError;
6104        }
6105      nextByte:
6106        ;
6107    }
6108    Py_XDECREF(errorHandler);
6109    Py_XDECREF(exc);
6110    return _PyUnicodeWriter_Finish(&writer);
6111
6112  onError:
6113    _PyUnicodeWriter_Dealloc(&writer);
6114    Py_XDECREF(errorHandler);
6115    Py_XDECREF(exc);
6116    return NULL;
6117}
6118
6119
6120PyObject *
6121PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6122{
6123    PyObject *repr;
6124    char *p;
6125    char *q;
6126    Py_ssize_t expandsize, pos;
6127    int kind;
6128    void *data;
6129    Py_ssize_t len;
6130
6131    if (!PyUnicode_Check(unicode)) {
6132        PyErr_BadArgument();
6133        return NULL;
6134    }
6135    if (PyUnicode_READY(unicode) == -1)
6136        return NULL;
6137    kind = PyUnicode_KIND(unicode);
6138    data = PyUnicode_DATA(unicode);
6139    len = PyUnicode_GET_LENGTH(unicode);
6140    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6141       bytes, and 1 byte characters 4. */
6142    expandsize = kind * 2 + 2;
6143
6144    if (len > PY_SSIZE_T_MAX / expandsize)
6145        return PyErr_NoMemory();
6146
6147    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6148    if (repr == NULL)
6149        return NULL;
6150    if (len == 0)
6151        return repr;
6152
6153    p = q = PyBytes_AS_STRING(repr);
6154    for (pos = 0; pos < len; pos++) {
6155        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6156        /* Map 32-bit characters to '\Uxxxxxxxx' */
6157        if (ch >= 0x10000) {
6158            assert(ch <= MAX_UNICODE);
6159            *p++ = '\\';
6160            *p++ = 'U';
6161            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6162            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6163            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6164            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6165            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6166            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6167            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6168            *p++ = Py_hexdigits[ch & 15];
6169        }
6170        /* Map 16-bit characters to '\uxxxx' */
6171        else if (ch >= 256) {
6172            *p++ = '\\';
6173            *p++ = 'u';
6174            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6175            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6176            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6177            *p++ = Py_hexdigits[ch & 15];
6178        }
6179        /* Copy everything else as-is */
6180        else
6181            *p++ = (char) ch;
6182    }
6183
6184    assert(p > q);
6185    if (_PyBytes_Resize(&repr, p - q) < 0)
6186        return NULL;
6187    return repr;
6188}
6189
6190PyObject *
6191PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6192                                 Py_ssize_t size)
6193{
6194    PyObject *result;
6195    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6196    if (tmp == NULL)
6197        return NULL;
6198    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6199    Py_DECREF(tmp);
6200    return result;
6201}
6202
6203/* --- Unicode Internal Codec ------------------------------------------- */
6204
6205PyObject *
6206_PyUnicode_DecodeUnicodeInternal(const char *s,
6207                                 Py_ssize_t size,
6208                                 const char *errors)
6209{
6210    const char *starts = s;
6211    Py_ssize_t startinpos;
6212    Py_ssize_t endinpos;
6213    _PyUnicodeWriter writer;
6214    const char *end;
6215    const char *reason;
6216    PyObject *errorHandler = NULL;
6217    PyObject *exc = NULL;
6218
6219    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6220                     "unicode_internal codec has been deprecated",
6221                     1))
6222        return NULL;
6223
6224    if (size == 0)
6225        _Py_RETURN_UNICODE_EMPTY();
6226
6227    _PyUnicodeWriter_Init(&writer);
6228    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6229        PyErr_NoMemory();
6230        goto onError;
6231    }
6232    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6233
6234    end = s + size;
6235    while (s < end) {
6236        Py_UNICODE uch;
6237        Py_UCS4 ch;
6238        if (end - s < Py_UNICODE_SIZE) {
6239            endinpos = end-starts;
6240            reason = "truncated input";
6241            goto error;
6242        }
6243        /* We copy the raw representation one byte at a time because the
6244           pointer may be unaligned (see test_codeccallbacks). */
6245        ((char *) &uch)[0] = s[0];
6246        ((char *) &uch)[1] = s[1];
6247#ifdef Py_UNICODE_WIDE
6248        ((char *) &uch)[2] = s[2];
6249        ((char *) &uch)[3] = s[3];
6250#endif
6251        ch = uch;
6252#ifdef Py_UNICODE_WIDE
6253        /* We have to sanity check the raw data, otherwise doom looms for
6254           some malformed UCS-4 data. */
6255        if (ch > 0x10ffff) {
6256            endinpos = s - starts + Py_UNICODE_SIZE;
6257            reason = "illegal code point (> 0x10FFFF)";
6258            goto error;
6259        }
6260#endif
6261        s += Py_UNICODE_SIZE;
6262#ifndef Py_UNICODE_WIDE
6263        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6264        {
6265            Py_UNICODE uch2;
6266            ((char *) &uch2)[0] = s[0];
6267            ((char *) &uch2)[1] = s[1];
6268            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6269            {
6270                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6271                s += Py_UNICODE_SIZE;
6272            }
6273        }
6274#endif
6275
6276        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6277            goto onError;
6278        continue;
6279
6280  error:
6281        startinpos = s - starts;
6282        if (unicode_decode_call_errorhandler_writer(
6283                errors, &errorHandler,
6284                "unicode_internal", reason,
6285                &starts, &end, &startinpos, &endinpos, &exc, &s,
6286                &writer))
6287            goto onError;
6288    }
6289
6290    Py_XDECREF(errorHandler);
6291    Py_XDECREF(exc);
6292    return _PyUnicodeWriter_Finish(&writer);
6293
6294  onError:
6295    _PyUnicodeWriter_Dealloc(&writer);
6296    Py_XDECREF(errorHandler);
6297    Py_XDECREF(exc);
6298    return NULL;
6299}
6300
6301/* --- Latin-1 Codec ------------------------------------------------------ */
6302
6303PyObject *
6304PyUnicode_DecodeLatin1(const char *s,
6305                       Py_ssize_t size,
6306                       const char *errors)
6307{
6308    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6309    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6310}
6311
6312/* create or adjust a UnicodeEncodeError */
6313static void
6314make_encode_exception(PyObject **exceptionObject,
6315                      const char *encoding,
6316                      PyObject *unicode,
6317                      Py_ssize_t startpos, Py_ssize_t endpos,
6318                      const char *reason)
6319{
6320    if (*exceptionObject == NULL) {
6321        *exceptionObject = PyObject_CallFunction(
6322            PyExc_UnicodeEncodeError, "sOnns",
6323            encoding, unicode, startpos, endpos, reason);
6324    }
6325    else {
6326        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6327            goto onError;
6328        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6329            goto onError;
6330        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6331            goto onError;
6332        return;
6333      onError:
6334        Py_CLEAR(*exceptionObject);
6335    }
6336}
6337
6338/* raises a UnicodeEncodeError */
6339static void
6340raise_encode_exception(PyObject **exceptionObject,
6341                       const char *encoding,
6342                       PyObject *unicode,
6343                       Py_ssize_t startpos, Py_ssize_t endpos,
6344                       const char *reason)
6345{
6346    make_encode_exception(exceptionObject,
6347                          encoding, unicode, startpos, endpos, reason);
6348    if (*exceptionObject != NULL)
6349        PyCodec_StrictErrors(*exceptionObject);
6350}
6351
6352/* error handling callback helper:
6353   build arguments, call the callback and check the arguments,
6354   put the result into newpos and return the replacement string, which
6355   has to be freed by the caller */
6356static PyObject *
6357unicode_encode_call_errorhandler(const char *errors,
6358                                 PyObject **errorHandler,
6359                                 const char *encoding, const char *reason,
6360                                 PyObject *unicode, PyObject **exceptionObject,
6361                                 Py_ssize_t startpos, Py_ssize_t endpos,
6362                                 Py_ssize_t *newpos)
6363{
6364    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6365    Py_ssize_t len;
6366    PyObject *restuple;
6367    PyObject *resunicode;
6368
6369    if (*errorHandler == NULL) {
6370        *errorHandler = PyCodec_LookupError(errors);
6371        if (*errorHandler == NULL)
6372            return NULL;
6373    }
6374
6375    if (PyUnicode_READY(unicode) == -1)
6376        return NULL;
6377    len = PyUnicode_GET_LENGTH(unicode);
6378
6379    make_encode_exception(exceptionObject,
6380                          encoding, unicode, startpos, endpos, reason);
6381    if (*exceptionObject == NULL)
6382        return NULL;
6383
6384    restuple = PyObject_CallFunctionObjArgs(
6385        *errorHandler, *exceptionObject, NULL);
6386    if (restuple == NULL)
6387        return NULL;
6388    if (!PyTuple_Check(restuple)) {
6389        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6390        Py_DECREF(restuple);
6391        return NULL;
6392    }
6393    if (!PyArg_ParseTuple(restuple, argparse,
6394                          &resunicode, newpos)) {
6395        Py_DECREF(restuple);
6396        return NULL;
6397    }
6398    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6399        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6400        Py_DECREF(restuple);
6401        return NULL;
6402    }
6403    if (*newpos<0)
6404        *newpos = len + *newpos;
6405    if (*newpos<0 || *newpos>len) {
6406        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6407        Py_DECREF(restuple);
6408        return NULL;
6409    }
6410    Py_INCREF(resunicode);
6411    Py_DECREF(restuple);
6412    return resunicode;
6413}
6414
6415static PyObject *
6416unicode_encode_ucs1(PyObject *unicode,
6417                    const char *errors,
6418                    const Py_UCS4 limit)
6419{
6420    /* input state */
6421    Py_ssize_t pos=0, size;
6422    int kind;
6423    void *data;
6424    /* output object */
6425    PyObject *res;
6426    /* pointer into the output */
6427    char *str;
6428    /* current output position */
6429    Py_ssize_t ressize;
6430    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6431    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6432    PyObject *error_handler_obj = NULL;
6433    PyObject *exc = NULL;
6434    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6435
6436    if (PyUnicode_READY(unicode) == -1)
6437        return NULL;
6438    size = PyUnicode_GET_LENGTH(unicode);
6439    kind = PyUnicode_KIND(unicode);
6440    data = PyUnicode_DATA(unicode);
6441    /* allocate enough for a simple encoding without
6442       replacements, if we need more, we'll resize */
6443    if (size == 0)
6444        return PyBytes_FromStringAndSize(NULL, 0);
6445    res = PyBytes_FromStringAndSize(NULL, size);
6446    if (res == NULL)
6447        return NULL;
6448    str = PyBytes_AS_STRING(res);
6449    ressize = size;
6450
6451    while (pos < size) {
6452        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6453
6454        /* can we encode this? */
6455        if (ch < limit) {
6456            /* no overflow check, because we know that the space is enough */
6457            *str++ = (char)ch;
6458            ++pos;
6459        }
6460        else {
6461            Py_ssize_t requiredsize;
6462            PyObject *repunicode;
6463            Py_ssize_t repsize, newpos, respos, i;
6464            /* startpos for collecting unencodable chars */
6465            Py_ssize_t collstart = pos;
6466            Py_ssize_t collend = pos;
6467            /* find all unecodable characters */
6468
6469            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6470                ++collend;
6471
6472            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6473            if (error_handler == _Py_ERROR_UNKNOWN)
6474                error_handler = get_error_handler(errors);
6475
6476            switch (error_handler) {
6477            case _Py_ERROR_STRICT:
6478                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6479                goto onError;
6480
6481            case _Py_ERROR_REPLACE:
6482                while (collstart++ < collend)
6483                    *str++ = '?';
6484                /* fall through ignore error handler */
6485            case _Py_ERROR_IGNORE:
6486                pos = collend;
6487                break;
6488
6489            case _Py_ERROR_XMLCHARREFREPLACE:
6490                respos = str - PyBytes_AS_STRING(res);
6491                requiredsize = respos;
6492                /* determine replacement size */
6493                for (i = collstart; i < collend; ++i) {
6494                    Py_ssize_t incr;
6495
6496                    ch = PyUnicode_READ(kind, data, i);
6497                    if (ch < 10)
6498                        incr = 2+1+1;
6499                    else if (ch < 100)
6500                        incr = 2+2+1;
6501                    else if (ch < 1000)
6502                        incr = 2+3+1;
6503                    else if (ch < 10000)
6504                        incr = 2+4+1;
6505                    else if (ch < 100000)
6506                        incr = 2+5+1;
6507                    else if (ch < 1000000)
6508                        incr = 2+6+1;
6509                    else {
6510                        assert(ch <= MAX_UNICODE);
6511                        incr = 2+7+1;
6512                    }
6513                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6514                        goto overflow;
6515                    requiredsize += incr;
6516                }
6517                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6518                    goto overflow;
6519                requiredsize += size - collend;
6520                if (requiredsize > ressize) {
6521                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6522                        requiredsize = 2*ressize;
6523                    if (_PyBytes_Resize(&res, requiredsize))
6524                        goto onError;
6525                    str = PyBytes_AS_STRING(res) + respos;
6526                    ressize = requiredsize;
6527                }
6528                /* generate replacement */
6529                for (i = collstart; i < collend; ++i) {
6530                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6531                }
6532                pos = collend;
6533                break;
6534
6535            case _Py_ERROR_SURROGATEESCAPE:
6536                for (i = collstart; i < collend; ++i) {
6537                    ch = PyUnicode_READ(kind, data, i);
6538                    if (ch < 0xdc80 || 0xdcff < ch) {
6539                        /* Not a UTF-8b surrogate */
6540                        break;
6541                    }
6542                    *str++ = (char)(ch - 0xdc00);
6543                    ++pos;
6544                }
6545                if (i >= collend)
6546                    break;
6547                collstart = pos;
6548                assert(collstart != collend);
6549                /* fallback to general error handling */
6550
6551            default:
6552                repunicode = unicode_encode_call_errorhandler(errors, &error_handler_obj,
6553                                                              encoding, reason, unicode, &exc,
6554                                                              collstart, collend, &newpos);
6555                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6556                                           PyUnicode_READY(repunicode) == -1))
6557                    goto onError;
6558
6559                if (PyBytes_Check(repunicode)) {
6560                    /* Directly copy bytes result to output. */
6561                    repsize = PyBytes_Size(repunicode);
6562                    if (repsize > 1) {
6563                        /* Make room for all additional bytes. */
6564                        respos = str - PyBytes_AS_STRING(res);
6565                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6566                            Py_DECREF(repunicode);
6567                            goto overflow;
6568                        }
6569                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6570                            Py_DECREF(repunicode);
6571                            goto onError;
6572                        }
6573                        str = PyBytes_AS_STRING(res) + respos;
6574                        ressize += repsize-1;
6575                    }
6576                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6577                    str += repsize;
6578                    pos = newpos;
6579                    Py_DECREF(repunicode);
6580                    break;
6581                }
6582
6583                /* need more space? (at least enough for what we
6584                   have+the replacement+the rest of the string, so
6585                   we won't have to check space for encodable characters) */
6586                respos = str - PyBytes_AS_STRING(res);
6587                repsize = PyUnicode_GET_LENGTH(repunicode);
6588                requiredsize = respos;
6589                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6590                    goto overflow;
6591                requiredsize += repsize;
6592                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6593                    goto overflow;
6594                requiredsize += size - collend;
6595                if (requiredsize > ressize) {
6596                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6597                        requiredsize = 2*ressize;
6598                    if (_PyBytes_Resize(&res, requiredsize)) {
6599                        Py_DECREF(repunicode);
6600                        goto onError;
6601                    }
6602                    str = PyBytes_AS_STRING(res) + respos;
6603                    ressize = requiredsize;
6604                }
6605
6606                /* check if there is anything unencodable in the replacement
6607                   and copy it to the output */
6608                for (i = 0; repsize-->0; ++i, ++str) {
6609                    ch = PyUnicode_READ_CHAR(repunicode, i);
6610                    if (ch >= limit) {
6611                        raise_encode_exception(&exc, encoding, unicode,
6612                                               pos, pos+1, reason);
6613                        Py_DECREF(repunicode);
6614                        goto onError;
6615                    }
6616                    *str = (char)ch;
6617                }
6618                pos = newpos;
6619                Py_DECREF(repunicode);
6620            }
6621        }
6622    }
6623    /* Resize if we allocated to much */
6624    size = str - PyBytes_AS_STRING(res);
6625    if (size < ressize) { /* If this falls res will be NULL */
6626        assert(size >= 0);
6627        if (_PyBytes_Resize(&res, size) < 0)
6628            goto onError;
6629    }
6630
6631    Py_XDECREF(error_handler_obj);
6632    Py_XDECREF(exc);
6633    return res;
6634
6635  overflow:
6636    PyErr_SetString(PyExc_OverflowError,
6637                    "encoded result is too long for a Python string");
6638
6639  onError:
6640    Py_XDECREF(res);
6641    Py_XDECREF(error_handler_obj);
6642    Py_XDECREF(exc);
6643    return NULL;
6644}
6645
6646/* Deprecated */
6647PyObject *
6648PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6649                       Py_ssize_t size,
6650                       const char *errors)
6651{
6652    PyObject *result;
6653    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6654    if (unicode == NULL)
6655        return NULL;
6656    result = unicode_encode_ucs1(unicode, errors, 256);
6657    Py_DECREF(unicode);
6658    return result;
6659}
6660
6661PyObject *
6662_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6663{
6664    if (!PyUnicode_Check(unicode)) {
6665        PyErr_BadArgument();
6666        return NULL;
6667    }
6668    if (PyUnicode_READY(unicode) == -1)
6669        return NULL;
6670    /* Fast path: if it is a one-byte string, construct
6671       bytes object directly. */
6672    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6673        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6674                                         PyUnicode_GET_LENGTH(unicode));
6675    /* Non-Latin-1 characters present. Defer to above function to
6676       raise the exception. */
6677    return unicode_encode_ucs1(unicode, errors, 256);
6678}
6679
6680PyObject*
6681PyUnicode_AsLatin1String(PyObject *unicode)
6682{
6683    return _PyUnicode_AsLatin1String(unicode, NULL);
6684}
6685
6686/* --- 7-bit ASCII Codec -------------------------------------------------- */
6687
6688PyObject *
6689PyUnicode_DecodeASCII(const char *s,
6690                      Py_ssize_t size,
6691                      const char *errors)
6692{
6693    const char *starts = s;
6694    _PyUnicodeWriter writer;
6695    int kind;
6696    void *data;
6697    Py_ssize_t startinpos;
6698    Py_ssize_t endinpos;
6699    Py_ssize_t outpos;
6700    const char *e;
6701    PyObject *error_handler_obj = NULL;
6702    PyObject *exc = NULL;
6703    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
6704
6705    if (size == 0)
6706        _Py_RETURN_UNICODE_EMPTY();
6707
6708    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6709    if (size == 1 && (unsigned char)s[0] < 128)
6710        return get_latin1_char((unsigned char)s[0]);
6711
6712    _PyUnicodeWriter_Init(&writer);
6713    writer.min_length = size;
6714    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6715        return NULL;
6716
6717    e = s + size;
6718    data = writer.data;
6719    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6720    writer.pos = outpos;
6721    if (writer.pos == size)
6722        return _PyUnicodeWriter_Finish(&writer);
6723
6724    s += writer.pos;
6725    kind = writer.kind;
6726    while (s < e) {
6727        unsigned char c = (unsigned char)*s;
6728        if (c < 128) {
6729            PyUnicode_WRITE(kind, data, writer.pos, c);
6730            writer.pos++;
6731            ++s;
6732            continue;
6733        }
6734
6735        /* byte outsize range 0x00..0x7f: call the error handler */
6736
6737        if (error_handler == _Py_ERROR_UNKNOWN)
6738            error_handler = get_error_handler(errors);
6739
6740        switch (error_handler)
6741        {
6742        case _Py_ERROR_REPLACE:
6743        case _Py_ERROR_SURROGATEESCAPE:
6744            /* Fast-path: the error handler only writes one character,
6745               but we may switch to UCS2 at the first write */
6746            if (_PyUnicodeWriter_PrepareKind(&writer, PyUnicode_2BYTE_KIND) < 0)
6747                goto onError;
6748            kind = writer.kind;
6749            data = writer.data;
6750
6751            if (error_handler == _Py_ERROR_REPLACE)
6752                PyUnicode_WRITE(kind, data, writer.pos, 0xfffd);
6753            else
6754                PyUnicode_WRITE(kind, data, writer.pos, c + 0xdc00);
6755            writer.pos++;
6756            ++s;
6757            break;
6758
6759        case _Py_ERROR_IGNORE:
6760            ++s;
6761            break;
6762
6763        default:
6764            startinpos = s-starts;
6765            endinpos = startinpos + 1;
6766            if (unicode_decode_call_errorhandler_writer(
6767                    errors, &error_handler_obj,
6768                    "ascii", "ordinal not in range(128)",
6769                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6770                    &writer))
6771                goto onError;
6772            kind = writer.kind;
6773            data = writer.data;
6774        }
6775    }
6776    Py_XDECREF(error_handler_obj);
6777    Py_XDECREF(exc);
6778    return _PyUnicodeWriter_Finish(&writer);
6779
6780  onError:
6781    _PyUnicodeWriter_Dealloc(&writer);
6782    Py_XDECREF(error_handler_obj);
6783    Py_XDECREF(exc);
6784    return NULL;
6785}
6786
6787/* Deprecated */
6788PyObject *
6789PyUnicode_EncodeASCII(const Py_UNICODE *p,
6790                      Py_ssize_t size,
6791                      const char *errors)
6792{
6793    PyObject *result;
6794    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6795    if (unicode == NULL)
6796        return NULL;
6797    result = unicode_encode_ucs1(unicode, errors, 128);
6798    Py_DECREF(unicode);
6799    return result;
6800}
6801
6802PyObject *
6803_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6804{
6805    if (!PyUnicode_Check(unicode)) {
6806        PyErr_BadArgument();
6807        return NULL;
6808    }
6809    if (PyUnicode_READY(unicode) == -1)
6810        return NULL;
6811    /* Fast path: if it is an ASCII-only string, construct bytes object
6812       directly. Else defer to above function to raise the exception. */
6813    if (PyUnicode_IS_ASCII(unicode))
6814        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6815                                         PyUnicode_GET_LENGTH(unicode));
6816    return unicode_encode_ucs1(unicode, errors, 128);
6817}
6818
6819PyObject *
6820PyUnicode_AsASCIIString(PyObject *unicode)
6821{
6822    return _PyUnicode_AsASCIIString(unicode, NULL);
6823}
6824
6825#ifdef HAVE_MBCS
6826
6827/* --- MBCS codecs for Windows -------------------------------------------- */
6828
6829#if SIZEOF_INT < SIZEOF_SIZE_T
6830#define NEED_RETRY
6831#endif
6832
6833#ifndef WC_ERR_INVALID_CHARS
6834#  define WC_ERR_INVALID_CHARS 0x0080
6835#endif
6836
6837static char*
6838code_page_name(UINT code_page, PyObject **obj)
6839{
6840    *obj = NULL;
6841    if (code_page == CP_ACP)
6842        return "mbcs";
6843    if (code_page == CP_UTF7)
6844        return "CP_UTF7";
6845    if (code_page == CP_UTF8)
6846        return "CP_UTF8";
6847
6848    *obj = PyBytes_FromFormat("cp%u", code_page);
6849    if (*obj == NULL)
6850        return NULL;
6851    return PyBytes_AS_STRING(*obj);
6852}
6853
6854static DWORD
6855decode_code_page_flags(UINT code_page)
6856{
6857    if (code_page == CP_UTF7) {
6858        /* The CP_UTF7 decoder only supports flags=0 */
6859        return 0;
6860    }
6861    else
6862        return MB_ERR_INVALID_CHARS;
6863}
6864
6865/*
6866 * Decode a byte string from a Windows code page into unicode object in strict
6867 * mode.
6868 *
6869 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6870 * OSError and returns -1 on other error.
6871 */
6872static int
6873decode_code_page_strict(UINT code_page,
6874                        PyObject **v,
6875                        const char *in,
6876                        int insize)
6877{
6878    const DWORD flags = decode_code_page_flags(code_page);
6879    wchar_t *out;
6880    DWORD outsize;
6881
6882    /* First get the size of the result */
6883    assert(insize > 0);
6884    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6885    if (outsize <= 0)
6886        goto error;
6887
6888    if (*v == NULL) {
6889        /* Create unicode object */
6890        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6891        *v = (PyObject*)_PyUnicode_New(outsize);
6892        if (*v == NULL)
6893            return -1;
6894        out = PyUnicode_AS_UNICODE(*v);
6895    }
6896    else {
6897        /* Extend unicode object */
6898        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6899        if (unicode_resize(v, n + outsize) < 0)
6900            return -1;
6901        out = PyUnicode_AS_UNICODE(*v) + n;
6902    }
6903
6904    /* Do the conversion */
6905    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6906    if (outsize <= 0)
6907        goto error;
6908    return insize;
6909
6910error:
6911    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6912        return -2;
6913    PyErr_SetFromWindowsErr(0);
6914    return -1;
6915}
6916
6917/*
6918 * Decode a byte string from a code page into unicode object with an error
6919 * handler.
6920 *
6921 * Returns consumed size if succeed, or raise an OSError or
6922 * UnicodeDecodeError exception and returns -1 on error.
6923 */
6924static int
6925decode_code_page_errors(UINT code_page,
6926                        PyObject **v,
6927                        const char *in, const int size,
6928                        const char *errors, int final)
6929{
6930    const char *startin = in;
6931    const char *endin = in + size;
6932    const DWORD flags = decode_code_page_flags(code_page);
6933    /* Ideally, we should get reason from FormatMessage. This is the Windows
6934       2000 English version of the message. */
6935    const char *reason = "No mapping for the Unicode character exists "
6936                         "in the target code page.";
6937    /* each step cannot decode more than 1 character, but a character can be
6938       represented as a surrogate pair */
6939    wchar_t buffer[2], *startout, *out;
6940    int insize;
6941    Py_ssize_t outsize;
6942    PyObject *errorHandler = NULL;
6943    PyObject *exc = NULL;
6944    PyObject *encoding_obj = NULL;
6945    char *encoding;
6946    DWORD err;
6947    int ret = -1;
6948
6949    assert(size > 0);
6950
6951    encoding = code_page_name(code_page, &encoding_obj);
6952    if (encoding == NULL)
6953        return -1;
6954
6955    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6956        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6957           UnicodeDecodeError. */
6958        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6959        if (exc != NULL) {
6960            PyCodec_StrictErrors(exc);
6961            Py_CLEAR(exc);
6962        }
6963        goto error;
6964    }
6965
6966    if (*v == NULL) {
6967        /* Create unicode object */
6968        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6969            PyErr_NoMemory();
6970            goto error;
6971        }
6972        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6973        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6974        if (*v == NULL)
6975            goto error;
6976        startout = PyUnicode_AS_UNICODE(*v);
6977    }
6978    else {
6979        /* Extend unicode object */
6980        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6981        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6982            PyErr_NoMemory();
6983            goto error;
6984        }
6985        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6986            goto error;
6987        startout = PyUnicode_AS_UNICODE(*v) + n;
6988    }
6989
6990    /* Decode the byte string character per character */
6991    out = startout;
6992    while (in < endin)
6993    {
6994        /* Decode a character */
6995        insize = 1;
6996        do
6997        {
6998            outsize = MultiByteToWideChar(code_page, flags,
6999                                          in, insize,
7000                                          buffer, Py_ARRAY_LENGTH(buffer));
7001            if (outsize > 0)
7002                break;
7003            err = GetLastError();
7004            if (err != ERROR_NO_UNICODE_TRANSLATION
7005                && err != ERROR_INSUFFICIENT_BUFFER)
7006            {
7007                PyErr_SetFromWindowsErr(0);
7008                goto error;
7009            }
7010            insize++;
7011        }
7012        /* 4=maximum length of a UTF-8 sequence */
7013        while (insize <= 4 && (in + insize) <= endin);
7014
7015        if (outsize <= 0) {
7016            Py_ssize_t startinpos, endinpos, outpos;
7017
7018            /* last character in partial decode? */
7019            if (in + insize >= endin && !final)
7020                break;
7021
7022            startinpos = in - startin;
7023            endinpos = startinpos + 1;
7024            outpos = out - PyUnicode_AS_UNICODE(*v);
7025            if (unicode_decode_call_errorhandler_wchar(
7026                    errors, &errorHandler,
7027                    encoding, reason,
7028                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7029                    v, &outpos))
7030            {
7031                goto error;
7032            }
7033            out = PyUnicode_AS_UNICODE(*v) + outpos;
7034        }
7035        else {
7036            in += insize;
7037            memcpy(out, buffer, outsize * sizeof(wchar_t));
7038            out += outsize;
7039        }
7040    }
7041
7042    /* write a NUL character at the end */
7043    *out = 0;
7044
7045    /* Extend unicode object */
7046    outsize = out - startout;
7047    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7048    if (unicode_resize(v, outsize) < 0)
7049        goto error;
7050    /* (in - startin) <= size and size is an int */
7051    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7052
7053error:
7054    Py_XDECREF(encoding_obj);
7055    Py_XDECREF(errorHandler);
7056    Py_XDECREF(exc);
7057    return ret;
7058}
7059
7060static PyObject *
7061decode_code_page_stateful(int code_page,
7062                          const char *s, Py_ssize_t size,
7063                          const char *errors, Py_ssize_t *consumed)
7064{
7065    PyObject *v = NULL;
7066    int chunk_size, final, converted, done;
7067
7068    if (code_page < 0) {
7069        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7070        return NULL;
7071    }
7072
7073    if (consumed)
7074        *consumed = 0;
7075
7076    do
7077    {
7078#ifdef NEED_RETRY
7079        if (size > INT_MAX) {
7080            chunk_size = INT_MAX;
7081            final = 0;
7082            done = 0;
7083        }
7084        else
7085#endif
7086        {
7087            chunk_size = (int)size;
7088            final = (consumed == NULL);
7089            done = 1;
7090        }
7091
7092        if (chunk_size == 0 && done) {
7093            if (v != NULL)
7094                break;
7095            _Py_RETURN_UNICODE_EMPTY();
7096        }
7097
7098        converted = decode_code_page_strict(code_page, &v,
7099                                            s, chunk_size);
7100        if (converted == -2)
7101            converted = decode_code_page_errors(code_page, &v,
7102                                                s, chunk_size,
7103                                                errors, final);
7104        assert(converted != 0 || done);
7105
7106        if (converted < 0) {
7107            Py_XDECREF(v);
7108            return NULL;
7109        }
7110
7111        if (consumed)
7112            *consumed += converted;
7113
7114        s += converted;
7115        size -= converted;
7116    } while (!done);
7117
7118    return unicode_result(v);
7119}
7120
7121PyObject *
7122PyUnicode_DecodeCodePageStateful(int code_page,
7123                                 const char *s,
7124                                 Py_ssize_t size,
7125                                 const char *errors,
7126                                 Py_ssize_t *consumed)
7127{
7128    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7129}
7130
7131PyObject *
7132PyUnicode_DecodeMBCSStateful(const char *s,
7133                             Py_ssize_t size,
7134                             const char *errors,
7135                             Py_ssize_t *consumed)
7136{
7137    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7138}
7139
7140PyObject *
7141PyUnicode_DecodeMBCS(const char *s,
7142                     Py_ssize_t size,
7143                     const char *errors)
7144{
7145    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7146}
7147
7148static DWORD
7149encode_code_page_flags(UINT code_page, const char *errors)
7150{
7151    if (code_page == CP_UTF8) {
7152        return WC_ERR_INVALID_CHARS;
7153    }
7154    else if (code_page == CP_UTF7) {
7155        /* CP_UTF7 only supports flags=0 */
7156        return 0;
7157    }
7158    else {
7159        if (errors != NULL && strcmp(errors, "replace") == 0)
7160            return 0;
7161        else
7162            return WC_NO_BEST_FIT_CHARS;
7163    }
7164}
7165
7166/*
7167 * Encode a Unicode string to a Windows code page into a byte string in strict
7168 * mode.
7169 *
7170 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7171 * an OSError and returns -1 on other error.
7172 */
7173static int
7174encode_code_page_strict(UINT code_page, PyObject **outbytes,
7175                        PyObject *unicode, Py_ssize_t offset, int len,
7176                        const char* errors)
7177{
7178    BOOL usedDefaultChar = FALSE;
7179    BOOL *pusedDefaultChar = &usedDefaultChar;
7180    int outsize;
7181    PyObject *exc = NULL;
7182    wchar_t *p;
7183    Py_ssize_t size;
7184    const DWORD flags = encode_code_page_flags(code_page, NULL);
7185    char *out;
7186    /* Create a substring so that we can get the UTF-16 representation
7187       of just the slice under consideration. */
7188    PyObject *substring;
7189
7190    assert(len > 0);
7191
7192    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7193        pusedDefaultChar = &usedDefaultChar;
7194    else
7195        pusedDefaultChar = NULL;
7196
7197    substring = PyUnicode_Substring(unicode, offset, offset+len);
7198    if (substring == NULL)
7199        return -1;
7200    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7201    if (p == NULL) {
7202        Py_DECREF(substring);
7203        return -1;
7204    }
7205    assert(size <= INT_MAX);
7206
7207    /* First get the size of the result */
7208    outsize = WideCharToMultiByte(code_page, flags,
7209                                  p, (int)size,
7210                                  NULL, 0,
7211                                  NULL, pusedDefaultChar);
7212    if (outsize <= 0)
7213        goto error;
7214    /* If we used a default char, then we failed! */
7215    if (pusedDefaultChar && *pusedDefaultChar) {
7216        Py_DECREF(substring);
7217        return -2;
7218    }
7219
7220    if (*outbytes == NULL) {
7221        /* Create string object */
7222        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7223        if (*outbytes == NULL) {
7224            Py_DECREF(substring);
7225            return -1;
7226        }
7227        out = PyBytes_AS_STRING(*outbytes);
7228    }
7229    else {
7230        /* Extend string object */
7231        const Py_ssize_t n = PyBytes_Size(*outbytes);
7232        if (outsize > PY_SSIZE_T_MAX - n) {
7233            PyErr_NoMemory();
7234            Py_DECREF(substring);
7235            return -1;
7236        }
7237        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7238            Py_DECREF(substring);
7239            return -1;
7240        }
7241        out = PyBytes_AS_STRING(*outbytes) + n;
7242    }
7243
7244    /* Do the conversion */
7245    outsize = WideCharToMultiByte(code_page, flags,
7246                                  p, (int)size,
7247                                  out, outsize,
7248                                  NULL, pusedDefaultChar);
7249    Py_CLEAR(substring);
7250    if (outsize <= 0)
7251        goto error;
7252    if (pusedDefaultChar && *pusedDefaultChar)
7253        return -2;
7254    return 0;
7255
7256error:
7257    Py_XDECREF(substring);
7258    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7259        return -2;
7260    PyErr_SetFromWindowsErr(0);
7261    return -1;
7262}
7263
7264/*
7265 * Encode a Unicode string to a Windows code page into a byte string using a
7266 * error handler.
7267 *
7268 * Returns consumed characters if succeed, or raise an OSError and returns
7269 * -1 on other error.
7270 */
7271static int
7272encode_code_page_errors(UINT code_page, PyObject **outbytes,
7273                        PyObject *unicode, Py_ssize_t unicode_offset,
7274                        Py_ssize_t insize, const char* errors)
7275{
7276    const DWORD flags = encode_code_page_flags(code_page, errors);
7277    Py_ssize_t pos = unicode_offset;
7278    Py_ssize_t endin = unicode_offset + insize;
7279    /* Ideally, we should get reason from FormatMessage. This is the Windows
7280       2000 English version of the message. */
7281    const char *reason = "invalid character";
7282    /* 4=maximum length of a UTF-8 sequence */
7283    char buffer[4];
7284    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7285    Py_ssize_t outsize;
7286    char *out;
7287    PyObject *errorHandler = NULL;
7288    PyObject *exc = NULL;
7289    PyObject *encoding_obj = NULL;
7290    char *encoding;
7291    Py_ssize_t newpos, newoutsize;
7292    PyObject *rep;
7293    int ret = -1;
7294
7295    assert(insize > 0);
7296
7297    encoding = code_page_name(code_page, &encoding_obj);
7298    if (encoding == NULL)
7299        return -1;
7300
7301    if (errors == NULL || strcmp(errors, "strict") == 0) {
7302        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7303           then we raise a UnicodeEncodeError. */
7304        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7305        if (exc != NULL) {
7306            PyCodec_StrictErrors(exc);
7307            Py_DECREF(exc);
7308        }
7309        Py_XDECREF(encoding_obj);
7310        return -1;
7311    }
7312
7313    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7314        pusedDefaultChar = &usedDefaultChar;
7315    else
7316        pusedDefaultChar = NULL;
7317
7318    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7319        PyErr_NoMemory();
7320        goto error;
7321    }
7322    outsize = insize * Py_ARRAY_LENGTH(buffer);
7323
7324    if (*outbytes == NULL) {
7325        /* Create string object */
7326        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7327        if (*outbytes == NULL)
7328            goto error;
7329        out = PyBytes_AS_STRING(*outbytes);
7330    }
7331    else {
7332        /* Extend string object */
7333        Py_ssize_t n = PyBytes_Size(*outbytes);
7334        if (n > PY_SSIZE_T_MAX - outsize) {
7335            PyErr_NoMemory();
7336            goto error;
7337        }
7338        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7339            goto error;
7340        out = PyBytes_AS_STRING(*outbytes) + n;
7341    }
7342
7343    /* Encode the string character per character */
7344    while (pos < endin)
7345    {
7346        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7347        wchar_t chars[2];
7348        int charsize;
7349        if (ch < 0x10000) {
7350            chars[0] = (wchar_t)ch;
7351            charsize = 1;
7352        }
7353        else {
7354            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7355            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7356            charsize = 2;
7357        }
7358
7359        outsize = WideCharToMultiByte(code_page, flags,
7360                                      chars, charsize,
7361                                      buffer, Py_ARRAY_LENGTH(buffer),
7362                                      NULL, pusedDefaultChar);
7363        if (outsize > 0) {
7364            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7365            {
7366                pos++;
7367                memcpy(out, buffer, outsize);
7368                out += outsize;
7369                continue;
7370            }
7371        }
7372        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7373            PyErr_SetFromWindowsErr(0);
7374            goto error;
7375        }
7376
7377        rep = unicode_encode_call_errorhandler(
7378                  errors, &errorHandler, encoding, reason,
7379                  unicode, &exc,
7380                  pos, pos + 1, &newpos);
7381        if (rep == NULL)
7382            goto error;
7383        pos = newpos;
7384
7385        if (PyBytes_Check(rep)) {
7386            outsize = PyBytes_GET_SIZE(rep);
7387            if (outsize != 1) {
7388                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7389                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7390                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7391                    Py_DECREF(rep);
7392                    goto error;
7393                }
7394                out = PyBytes_AS_STRING(*outbytes) + offset;
7395            }
7396            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7397            out += outsize;
7398        }
7399        else {
7400            Py_ssize_t i;
7401            enum PyUnicode_Kind kind;
7402            void *data;
7403
7404            if (PyUnicode_READY(rep) == -1) {
7405                Py_DECREF(rep);
7406                goto error;
7407            }
7408
7409            outsize = PyUnicode_GET_LENGTH(rep);
7410            if (outsize != 1) {
7411                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7412                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7413                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7414                    Py_DECREF(rep);
7415                    goto error;
7416                }
7417                out = PyBytes_AS_STRING(*outbytes) + offset;
7418            }
7419            kind = PyUnicode_KIND(rep);
7420            data = PyUnicode_DATA(rep);
7421            for (i=0; i < outsize; i++) {
7422                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7423                if (ch > 127) {
7424                    raise_encode_exception(&exc,
7425                        encoding, unicode,
7426                        pos, pos + 1,
7427                        "unable to encode error handler result to ASCII");
7428                    Py_DECREF(rep);
7429                    goto error;
7430                }
7431                *out = (unsigned char)ch;
7432                out++;
7433            }
7434        }
7435        Py_DECREF(rep);
7436    }
7437    /* write a NUL byte */
7438    *out = 0;
7439    outsize = out - PyBytes_AS_STRING(*outbytes);
7440    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7441    if (_PyBytes_Resize(outbytes, outsize) < 0)
7442        goto error;
7443    ret = 0;
7444
7445error:
7446    Py_XDECREF(encoding_obj);
7447    Py_XDECREF(errorHandler);
7448    Py_XDECREF(exc);
7449    return ret;
7450}
7451
7452static PyObject *
7453encode_code_page(int code_page,
7454                 PyObject *unicode,
7455                 const char *errors)
7456{
7457    Py_ssize_t len;
7458    PyObject *outbytes = NULL;
7459    Py_ssize_t offset;
7460    int chunk_len, ret, done;
7461
7462    if (!PyUnicode_Check(unicode)) {
7463        PyErr_BadArgument();
7464        return NULL;
7465    }
7466
7467    if (PyUnicode_READY(unicode) == -1)
7468        return NULL;
7469    len = PyUnicode_GET_LENGTH(unicode);
7470
7471    if (code_page < 0) {
7472        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7473        return NULL;
7474    }
7475
7476    if (len == 0)
7477        return PyBytes_FromStringAndSize(NULL, 0);
7478
7479    offset = 0;
7480    do
7481    {
7482#ifdef NEED_RETRY
7483        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7484           chunks. */
7485        if (len > INT_MAX/2) {
7486            chunk_len = INT_MAX/2;
7487            done = 0;
7488        }
7489        else
7490#endif
7491        {
7492            chunk_len = (int)len;
7493            done = 1;
7494        }
7495
7496        ret = encode_code_page_strict(code_page, &outbytes,
7497                                      unicode, offset, chunk_len,
7498                                      errors);
7499        if (ret == -2)
7500            ret = encode_code_page_errors(code_page, &outbytes,
7501                                          unicode, offset,
7502                                          chunk_len, errors);
7503        if (ret < 0) {
7504            Py_XDECREF(outbytes);
7505            return NULL;
7506        }
7507
7508        offset += chunk_len;
7509        len -= chunk_len;
7510    } while (!done);
7511
7512    return outbytes;
7513}
7514
7515PyObject *
7516PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7517                     Py_ssize_t size,
7518                     const char *errors)
7519{
7520    PyObject *unicode, *res;
7521    unicode = PyUnicode_FromUnicode(p, size);
7522    if (unicode == NULL)
7523        return NULL;
7524    res = encode_code_page(CP_ACP, unicode, errors);
7525    Py_DECREF(unicode);
7526    return res;
7527}
7528
7529PyObject *
7530PyUnicode_EncodeCodePage(int code_page,
7531                         PyObject *unicode,
7532                         const char *errors)
7533{
7534    return encode_code_page(code_page, unicode, errors);
7535}
7536
7537PyObject *
7538PyUnicode_AsMBCSString(PyObject *unicode)
7539{
7540    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7541}
7542
7543#undef NEED_RETRY
7544
7545#endif /* HAVE_MBCS */
7546
7547/* --- Character Mapping Codec -------------------------------------------- */
7548
7549static int
7550charmap_decode_string(const char *s,
7551                      Py_ssize_t size,
7552                      PyObject *mapping,
7553                      const char *errors,
7554                      _PyUnicodeWriter *writer)
7555{
7556    const char *starts = s;
7557    const char *e;
7558    Py_ssize_t startinpos, endinpos;
7559    PyObject *errorHandler = NULL, *exc = NULL;
7560    Py_ssize_t maplen;
7561    enum PyUnicode_Kind mapkind;
7562    void *mapdata;
7563    Py_UCS4 x;
7564    unsigned char ch;
7565
7566    if (PyUnicode_READY(mapping) == -1)
7567        return -1;
7568
7569    maplen = PyUnicode_GET_LENGTH(mapping);
7570    mapdata = PyUnicode_DATA(mapping);
7571    mapkind = PyUnicode_KIND(mapping);
7572
7573    e = s + size;
7574
7575    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7576        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7577         * is disabled in encoding aliases, latin1 is preferred because
7578         * its implementation is faster. */
7579        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7580        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7581        Py_UCS4 maxchar = writer->maxchar;
7582
7583        assert (writer->kind == PyUnicode_1BYTE_KIND);
7584        while (s < e) {
7585            ch = *s;
7586            x = mapdata_ucs1[ch];
7587            if (x > maxchar) {
7588                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7589                    goto onError;
7590                maxchar = writer->maxchar;
7591                outdata = (Py_UCS1 *)writer->data;
7592            }
7593            outdata[writer->pos] = x;
7594            writer->pos++;
7595            ++s;
7596        }
7597        return 0;
7598    }
7599
7600    while (s < e) {
7601        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7602            enum PyUnicode_Kind outkind = writer->kind;
7603            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7604            if (outkind == PyUnicode_1BYTE_KIND) {
7605                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7606                Py_UCS4 maxchar = writer->maxchar;
7607                while (s < e) {
7608                    ch = *s;
7609                    x = mapdata_ucs2[ch];
7610                    if (x > maxchar)
7611                        goto Error;
7612                    outdata[writer->pos] = x;
7613                    writer->pos++;
7614                    ++s;
7615                }
7616                break;
7617            }
7618            else if (outkind == PyUnicode_2BYTE_KIND) {
7619                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7620                while (s < e) {
7621                    ch = *s;
7622                    x = mapdata_ucs2[ch];
7623                    if (x == 0xFFFE)
7624                        goto Error;
7625                    outdata[writer->pos] = x;
7626                    writer->pos++;
7627                    ++s;
7628                }
7629                break;
7630            }
7631        }
7632        ch = *s;
7633
7634        if (ch < maplen)
7635            x = PyUnicode_READ(mapkind, mapdata, ch);
7636        else
7637            x = 0xfffe; /* invalid value */
7638Error:
7639        if (x == 0xfffe)
7640        {
7641            /* undefined mapping */
7642            startinpos = s-starts;
7643            endinpos = startinpos+1;
7644            if (unicode_decode_call_errorhandler_writer(
7645                    errors, &errorHandler,
7646                    "charmap", "character maps to <undefined>",
7647                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7648                    writer)) {
7649                goto onError;
7650            }
7651            continue;
7652        }
7653
7654        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7655            goto onError;
7656        ++s;
7657    }
7658    Py_XDECREF(errorHandler);
7659    Py_XDECREF(exc);
7660    return 0;
7661
7662onError:
7663    Py_XDECREF(errorHandler);
7664    Py_XDECREF(exc);
7665    return -1;
7666}
7667
7668static int
7669charmap_decode_mapping(const char *s,
7670                       Py_ssize_t size,
7671                       PyObject *mapping,
7672                       const char *errors,
7673                       _PyUnicodeWriter *writer)
7674{
7675    const char *starts = s;
7676    const char *e;
7677    Py_ssize_t startinpos, endinpos;
7678    PyObject *errorHandler = NULL, *exc = NULL;
7679    unsigned char ch;
7680    PyObject *key, *item = NULL;
7681
7682    e = s + size;
7683
7684    while (s < e) {
7685        ch = *s;
7686
7687        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7688        key = PyLong_FromLong((long)ch);
7689        if (key == NULL)
7690            goto onError;
7691
7692        item = PyObject_GetItem(mapping, key);
7693        Py_DECREF(key);
7694        if (item == NULL) {
7695            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7696                /* No mapping found means: mapping is undefined. */
7697                PyErr_Clear();
7698                goto Undefined;
7699            } else
7700                goto onError;
7701        }
7702
7703        /* Apply mapping */
7704        if (item == Py_None)
7705            goto Undefined;
7706        if (PyLong_Check(item)) {
7707            long value = PyLong_AS_LONG(item);
7708            if (value == 0xFFFE)
7709                goto Undefined;
7710            if (value < 0 || value > MAX_UNICODE) {
7711                PyErr_Format(PyExc_TypeError,
7712                             "character mapping must be in range(0x%lx)",
7713                             (unsigned long)MAX_UNICODE + 1);
7714                goto onError;
7715            }
7716
7717            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7718                goto onError;
7719        }
7720        else if (PyUnicode_Check(item)) {
7721            if (PyUnicode_READY(item) == -1)
7722                goto onError;
7723            if (PyUnicode_GET_LENGTH(item) == 1) {
7724                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7725                if (value == 0xFFFE)
7726                    goto Undefined;
7727                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7728                    goto onError;
7729            }
7730            else {
7731                writer->overallocate = 1;
7732                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7733                    goto onError;
7734            }
7735        }
7736        else {
7737            /* wrong return value */
7738            PyErr_SetString(PyExc_TypeError,
7739                            "character mapping must return integer, None or str");
7740            goto onError;
7741        }
7742        Py_CLEAR(item);
7743        ++s;
7744        continue;
7745
7746Undefined:
7747        /* undefined mapping */
7748        Py_CLEAR(item);
7749        startinpos = s-starts;
7750        endinpos = startinpos+1;
7751        if (unicode_decode_call_errorhandler_writer(
7752                errors, &errorHandler,
7753                "charmap", "character maps to <undefined>",
7754                &starts, &e, &startinpos, &endinpos, &exc, &s,
7755                writer)) {
7756            goto onError;
7757        }
7758    }
7759    Py_XDECREF(errorHandler);
7760    Py_XDECREF(exc);
7761    return 0;
7762
7763onError:
7764    Py_XDECREF(item);
7765    Py_XDECREF(errorHandler);
7766    Py_XDECREF(exc);
7767    return -1;
7768}
7769
7770PyObject *
7771PyUnicode_DecodeCharmap(const char *s,
7772                        Py_ssize_t size,
7773                        PyObject *mapping,
7774                        const char *errors)
7775{
7776    _PyUnicodeWriter writer;
7777
7778    /* Default to Latin-1 */
7779    if (mapping == NULL)
7780        return PyUnicode_DecodeLatin1(s, size, errors);
7781
7782    if (size == 0)
7783        _Py_RETURN_UNICODE_EMPTY();
7784    _PyUnicodeWriter_Init(&writer);
7785    writer.min_length = size;
7786    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7787        goto onError;
7788
7789    if (PyUnicode_CheckExact(mapping)) {
7790        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7791            goto onError;
7792    }
7793    else {
7794        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7795            goto onError;
7796    }
7797    return _PyUnicodeWriter_Finish(&writer);
7798
7799  onError:
7800    _PyUnicodeWriter_Dealloc(&writer);
7801    return NULL;
7802}
7803
7804/* Charmap encoding: the lookup table */
7805
7806struct encoding_map {
7807    PyObject_HEAD
7808    unsigned char level1[32];
7809    int count2, count3;
7810    unsigned char level23[1];
7811};
7812
7813static PyObject*
7814encoding_map_size(PyObject *obj, PyObject* args)
7815{
7816    struct encoding_map *map = (struct encoding_map*)obj;
7817    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7818                           128*map->count3);
7819}
7820
7821static PyMethodDef encoding_map_methods[] = {
7822    {"size", encoding_map_size, METH_NOARGS,
7823     PyDoc_STR("Return the size (in bytes) of this object") },
7824    { 0 }
7825};
7826
7827static void
7828encoding_map_dealloc(PyObject* o)
7829{
7830    PyObject_FREE(o);
7831}
7832
7833static PyTypeObject EncodingMapType = {
7834    PyVarObject_HEAD_INIT(NULL, 0)
7835    "EncodingMap",          /*tp_name*/
7836    sizeof(struct encoding_map),   /*tp_basicsize*/
7837    0,                      /*tp_itemsize*/
7838    /* methods */
7839    encoding_map_dealloc,   /*tp_dealloc*/
7840    0,                      /*tp_print*/
7841    0,                      /*tp_getattr*/
7842    0,                      /*tp_setattr*/
7843    0,                      /*tp_reserved*/
7844    0,                      /*tp_repr*/
7845    0,                      /*tp_as_number*/
7846    0,                      /*tp_as_sequence*/
7847    0,                      /*tp_as_mapping*/
7848    0,                      /*tp_hash*/
7849    0,                      /*tp_call*/
7850    0,                      /*tp_str*/
7851    0,                      /*tp_getattro*/
7852    0,                      /*tp_setattro*/
7853    0,                      /*tp_as_buffer*/
7854    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7855    0,                      /*tp_doc*/
7856    0,                      /*tp_traverse*/
7857    0,                      /*tp_clear*/
7858    0,                      /*tp_richcompare*/
7859    0,                      /*tp_weaklistoffset*/
7860    0,                      /*tp_iter*/
7861    0,                      /*tp_iternext*/
7862    encoding_map_methods,   /*tp_methods*/
7863    0,                      /*tp_members*/
7864    0,                      /*tp_getset*/
7865    0,                      /*tp_base*/
7866    0,                      /*tp_dict*/
7867    0,                      /*tp_descr_get*/
7868    0,                      /*tp_descr_set*/
7869    0,                      /*tp_dictoffset*/
7870    0,                      /*tp_init*/
7871    0,                      /*tp_alloc*/
7872    0,                      /*tp_new*/
7873    0,                      /*tp_free*/
7874    0,                      /*tp_is_gc*/
7875};
7876
7877PyObject*
7878PyUnicode_BuildEncodingMap(PyObject* string)
7879{
7880    PyObject *result;
7881    struct encoding_map *mresult;
7882    int i;
7883    int need_dict = 0;
7884    unsigned char level1[32];
7885    unsigned char level2[512];
7886    unsigned char *mlevel1, *mlevel2, *mlevel3;
7887    int count2 = 0, count3 = 0;
7888    int kind;
7889    void *data;
7890    Py_ssize_t length;
7891    Py_UCS4 ch;
7892
7893    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7894        PyErr_BadArgument();
7895        return NULL;
7896    }
7897    kind = PyUnicode_KIND(string);
7898    data = PyUnicode_DATA(string);
7899    length = PyUnicode_GET_LENGTH(string);
7900    length = Py_MIN(length, 256);
7901    memset(level1, 0xFF, sizeof level1);
7902    memset(level2, 0xFF, sizeof level2);
7903
7904    /* If there isn't a one-to-one mapping of NULL to \0,
7905       or if there are non-BMP characters, we need to use
7906       a mapping dictionary. */
7907    if (PyUnicode_READ(kind, data, 0) != 0)
7908        need_dict = 1;
7909    for (i = 1; i < length; i++) {
7910        int l1, l2;
7911        ch = PyUnicode_READ(kind, data, i);
7912        if (ch == 0 || ch > 0xFFFF) {
7913            need_dict = 1;
7914            break;
7915        }
7916        if (ch == 0xFFFE)
7917            /* unmapped character */
7918            continue;
7919        l1 = ch >> 11;
7920        l2 = ch >> 7;
7921        if (level1[l1] == 0xFF)
7922            level1[l1] = count2++;
7923        if (level2[l2] == 0xFF)
7924            level2[l2] = count3++;
7925    }
7926
7927    if (count2 >= 0xFF || count3 >= 0xFF)
7928        need_dict = 1;
7929
7930    if (need_dict) {
7931        PyObject *result = PyDict_New();
7932        PyObject *key, *value;
7933        if (!result)
7934            return NULL;
7935        for (i = 0; i < length; i++) {
7936            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7937            value = PyLong_FromLong(i);
7938            if (!key || !value)
7939                goto failed1;
7940            if (PyDict_SetItem(result, key, value) == -1)
7941                goto failed1;
7942            Py_DECREF(key);
7943            Py_DECREF(value);
7944        }
7945        return result;
7946      failed1:
7947        Py_XDECREF(key);
7948        Py_XDECREF(value);
7949        Py_DECREF(result);
7950        return NULL;
7951    }
7952
7953    /* Create a three-level trie */
7954    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7955                             16*count2 + 128*count3 - 1);
7956    if (!result)
7957        return PyErr_NoMemory();
7958    PyObject_Init(result, &EncodingMapType);
7959    mresult = (struct encoding_map*)result;
7960    mresult->count2 = count2;
7961    mresult->count3 = count3;
7962    mlevel1 = mresult->level1;
7963    mlevel2 = mresult->level23;
7964    mlevel3 = mresult->level23 + 16*count2;
7965    memcpy(mlevel1, level1, 32);
7966    memset(mlevel2, 0xFF, 16*count2);
7967    memset(mlevel3, 0, 128*count3);
7968    count3 = 0;
7969    for (i = 1; i < length; i++) {
7970        int o1, o2, o3, i2, i3;
7971        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7972        if (ch == 0xFFFE)
7973            /* unmapped character */
7974            continue;
7975        o1 = ch>>11;
7976        o2 = (ch>>7) & 0xF;
7977        i2 = 16*mlevel1[o1] + o2;
7978        if (mlevel2[i2] == 0xFF)
7979            mlevel2[i2] = count3++;
7980        o3 = ch & 0x7F;
7981        i3 = 128*mlevel2[i2] + o3;
7982        mlevel3[i3] = i;
7983    }
7984    return result;
7985}
7986
7987static int
7988encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7989{
7990    struct encoding_map *map = (struct encoding_map*)mapping;
7991    int l1 = c>>11;
7992    int l2 = (c>>7) & 0xF;
7993    int l3 = c & 0x7F;
7994    int i;
7995
7996    if (c > 0xFFFF)
7997        return -1;
7998    if (c == 0)
7999        return 0;
8000    /* level 1*/
8001    i = map->level1[l1];
8002    if (i == 0xFF) {
8003        return -1;
8004    }
8005    /* level 2*/
8006    i = map->level23[16*i+l2];
8007    if (i == 0xFF) {
8008        return -1;
8009    }
8010    /* level 3 */
8011    i = map->level23[16*map->count2 + 128*i + l3];
8012    if (i == 0) {
8013        return -1;
8014    }
8015    return i;
8016}
8017
8018/* Lookup the character ch in the mapping. If the character
8019   can't be found, Py_None is returned (or NULL, if another
8020   error occurred). */
8021static PyObject *
8022charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8023{
8024    PyObject *w = PyLong_FromLong((long)c);
8025    PyObject *x;
8026
8027    if (w == NULL)
8028        return NULL;
8029    x = PyObject_GetItem(mapping, w);
8030    Py_DECREF(w);
8031    if (x == NULL) {
8032        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8033            /* No mapping found means: mapping is undefined. */
8034            PyErr_Clear();
8035            x = Py_None;
8036            Py_INCREF(x);
8037            return x;
8038        } else
8039            return NULL;
8040    }
8041    else if (x == Py_None)
8042        return x;
8043    else if (PyLong_Check(x)) {
8044        long value = PyLong_AS_LONG(x);
8045        if (value < 0 || value > 255) {
8046            PyErr_SetString(PyExc_TypeError,
8047                            "character mapping must be in range(256)");
8048            Py_DECREF(x);
8049            return NULL;
8050        }
8051        return x;
8052    }
8053    else if (PyBytes_Check(x))
8054        return x;
8055    else {
8056        /* wrong return value */
8057        PyErr_Format(PyExc_TypeError,
8058                     "character mapping must return integer, bytes or None, not %.400s",
8059                     x->ob_type->tp_name);
8060        Py_DECREF(x);
8061        return NULL;
8062    }
8063}
8064
8065static int
8066charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8067{
8068    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8069    /* exponentially overallocate to minimize reallocations */
8070    if (requiredsize < 2*outsize)
8071        requiredsize = 2*outsize;
8072    if (_PyBytes_Resize(outobj, requiredsize))
8073        return -1;
8074    return 0;
8075}
8076
/* Outcome of encoding one character with charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* the lookup or a buffer resize failed */
} charmapencode_result;
8080/* lookup the character, put the result in the output string and adjust
8081   various state variables. Resize the output bytes object if not enough
8082   space is available. Return a new reference to the object that
8083   was put in the output buffer, or Py_None, if the mapping was undefined
8084   (in which case no character was written) or NULL, if a
8085   reallocation error occurred. The caller must decref the result */
8086static charmapencode_result
8087charmapencode_output(Py_UCS4 c, PyObject *mapping,
8088                     PyObject **outobj, Py_ssize_t *outpos)
8089{
8090    PyObject *rep;
8091    char *outstart;
8092    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8093
8094    if (Py_TYPE(mapping) == &EncodingMapType) {
8095        int res = encoding_map_lookup(c, mapping);
8096        Py_ssize_t requiredsize = *outpos+1;
8097        if (res == -1)
8098            return enc_FAILED;
8099        if (outsize<requiredsize)
8100            if (charmapencode_resize(outobj, outpos, requiredsize))
8101                return enc_EXCEPTION;
8102        outstart = PyBytes_AS_STRING(*outobj);
8103        outstart[(*outpos)++] = (char)res;
8104        return enc_SUCCESS;
8105    }
8106
8107    rep = charmapencode_lookup(c, mapping);
8108    if (rep==NULL)
8109        return enc_EXCEPTION;
8110    else if (rep==Py_None) {
8111        Py_DECREF(rep);
8112        return enc_FAILED;
8113    } else {
8114        if (PyLong_Check(rep)) {
8115            Py_ssize_t requiredsize = *outpos+1;
8116            if (outsize<requiredsize)
8117                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8118                    Py_DECREF(rep);
8119                    return enc_EXCEPTION;
8120                }
8121            outstart = PyBytes_AS_STRING(*outobj);
8122            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8123        }
8124        else {
8125            const char *repchars = PyBytes_AS_STRING(rep);
8126            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8127            Py_ssize_t requiredsize = *outpos+repsize;
8128            if (outsize<requiredsize)
8129                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8130                    Py_DECREF(rep);
8131                    return enc_EXCEPTION;
8132                }
8133            outstart = PyBytes_AS_STRING(*outobj);
8134            memcpy(outstart + *outpos, repchars, repsize);
8135            *outpos += repsize;
8136        }
8137    }
8138    Py_DECREF(rep);
8139    return enc_SUCCESS;
8140}
8141
8142/* handle an error in PyUnicode_EncodeCharmap
8143   Return 0 on success, -1 on error */
8144static int
8145charmap_encoding_error(
8146    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8147    PyObject **exceptionObject,
8148    _Py_error_handler *error_handler, PyObject **error_handler_obj, const char *errors,
8149    PyObject **res, Py_ssize_t *respos)
8150{
8151    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8152    Py_ssize_t size, repsize;
8153    Py_ssize_t newpos;
8154    enum PyUnicode_Kind kind;
8155    void *data;
8156    Py_ssize_t index;
8157    /* startpos for collecting unencodable chars */
8158    Py_ssize_t collstartpos = *inpos;
8159    Py_ssize_t collendpos = *inpos+1;
8160    Py_ssize_t collpos;
8161    char *encoding = "charmap";
8162    char *reason = "character maps to <undefined>";
8163    charmapencode_result x;
8164    Py_UCS4 ch;
8165    int val;
8166
8167    if (PyUnicode_READY(unicode) == -1)
8168        return -1;
8169    size = PyUnicode_GET_LENGTH(unicode);
8170    /* find all unencodable characters */
8171    while (collendpos < size) {
8172        PyObject *rep;
8173        if (Py_TYPE(mapping) == &EncodingMapType) {
8174            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8175            val = encoding_map_lookup(ch, mapping);
8176            if (val != -1)
8177                break;
8178            ++collendpos;
8179            continue;
8180        }
8181
8182        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8183        rep = charmapencode_lookup(ch, mapping);
8184        if (rep==NULL)
8185            return -1;
8186        else if (rep!=Py_None) {
8187            Py_DECREF(rep);
8188            break;
8189        }
8190        Py_DECREF(rep);
8191        ++collendpos;
8192    }
8193    /* cache callback name lookup
8194     * (if not done yet, i.e. it's the first error) */
8195    if (*error_handler == _Py_ERROR_UNKNOWN)
8196        *error_handler = get_error_handler(errors);
8197
8198    switch (*error_handler) {
8199    case _Py_ERROR_STRICT:
8200        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8201        return -1;
8202
8203    case _Py_ERROR_REPLACE:
8204        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8205            x = charmapencode_output('?', mapping, res, respos);
8206            if (x==enc_EXCEPTION) {
8207                return -1;
8208            }
8209            else if (x==enc_FAILED) {
8210                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8211                return -1;
8212            }
8213        }
8214        /* fall through */
8215    case _Py_ERROR_IGNORE:
8216        *inpos = collendpos;
8217        break;
8218
8219    case _Py_ERROR_XMLCHARREFREPLACE:
8220        /* generate replacement (temporarily (mis)uses p) */
8221        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8222            char buffer[2+29+1+1];
8223            char *cp;
8224            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8225            for (cp = buffer; *cp; ++cp) {
8226                x = charmapencode_output(*cp, mapping, res, respos);
8227                if (x==enc_EXCEPTION)
8228                    return -1;
8229                else if (x==enc_FAILED) {
8230                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8231                    return -1;
8232                }
8233            }
8234        }
8235        *inpos = collendpos;
8236        break;
8237
8238    default:
8239        repunicode = unicode_encode_call_errorhandler(errors, error_handler_obj,
8240                                                      encoding, reason, unicode, exceptionObject,
8241                                                      collstartpos, collendpos, &newpos);
8242        if (repunicode == NULL)
8243            return -1;
8244        if (PyBytes_Check(repunicode)) {
8245            /* Directly copy bytes result to output. */
8246            Py_ssize_t outsize = PyBytes_Size(*res);
8247            Py_ssize_t requiredsize;
8248            repsize = PyBytes_Size(repunicode);
8249            requiredsize = *respos + repsize;
8250            if (requiredsize > outsize)
8251                /* Make room for all additional bytes. */
8252                if (charmapencode_resize(res, respos, requiredsize)) {
8253                    Py_DECREF(repunicode);
8254                    return -1;
8255                }
8256            memcpy(PyBytes_AsString(*res) + *respos,
8257                   PyBytes_AsString(repunicode),  repsize);
8258            *respos += repsize;
8259            *inpos = newpos;
8260            Py_DECREF(repunicode);
8261            break;
8262        }
8263        /* generate replacement  */
8264        if (PyUnicode_READY(repunicode) == -1) {
8265            Py_DECREF(repunicode);
8266            return -1;
8267        }
8268        repsize = PyUnicode_GET_LENGTH(repunicode);
8269        data = PyUnicode_DATA(repunicode);
8270        kind = PyUnicode_KIND(repunicode);
8271        for (index = 0; index < repsize; index++) {
8272            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8273            x = charmapencode_output(repch, mapping, res, respos);
8274            if (x==enc_EXCEPTION) {
8275                Py_DECREF(repunicode);
8276                return -1;
8277            }
8278            else if (x==enc_FAILED) {
8279                Py_DECREF(repunicode);
8280                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8281                return -1;
8282            }
8283        }
8284        *inpos = newpos;
8285        Py_DECREF(repunicode);
8286    }
8287    return 0;
8288}
8289
8290PyObject *
8291_PyUnicode_EncodeCharmap(PyObject *unicode,
8292                         PyObject *mapping,
8293                         const char *errors)
8294{
8295    /* output object */
8296    PyObject *res = NULL;
8297    /* current input position */
8298    Py_ssize_t inpos = 0;
8299    Py_ssize_t size;
8300    /* current output position */
8301    Py_ssize_t respos = 0;
8302    PyObject *error_handler_obj = NULL;
8303    PyObject *exc = NULL;
8304    _Py_error_handler error_handler = _Py_ERROR_UNKNOWN;
8305    void *data;
8306    int kind;
8307
8308    if (PyUnicode_READY(unicode) == -1)
8309        return NULL;
8310    size = PyUnicode_GET_LENGTH(unicode);
8311    data = PyUnicode_DATA(unicode);
8312    kind = PyUnicode_KIND(unicode);
8313
8314    /* Default to Latin-1 */
8315    if (mapping == NULL)
8316        return unicode_encode_ucs1(unicode, errors, 256);
8317
8318    /* allocate enough for a simple encoding without
8319       replacements, if we need more, we'll resize */
8320    res = PyBytes_FromStringAndSize(NULL, size);
8321    if (res == NULL)
8322        goto onError;
8323    if (size == 0)
8324        return res;
8325
8326    while (inpos<size) {
8327        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8328        /* try to encode it */
8329        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8330        if (x==enc_EXCEPTION) /* error */
8331            goto onError;
8332        if (x==enc_FAILED) { /* unencodable character */
8333            if (charmap_encoding_error(unicode, &inpos, mapping,
8334                                       &exc,
8335                                       &error_handler, &error_handler_obj, errors,
8336                                       &res, &respos)) {
8337                goto onError;
8338            }
8339        }
8340        else
8341            /* done with this character => adjust input position */
8342            ++inpos;
8343    }
8344
8345    /* Resize if we allocated to much */
8346    if (respos<PyBytes_GET_SIZE(res))
8347        if (_PyBytes_Resize(&res, respos) < 0)
8348            goto onError;
8349
8350    Py_XDECREF(exc);
8351    Py_XDECREF(error_handler_obj);
8352    return res;
8353
8354  onError:
8355    Py_XDECREF(res);
8356    Py_XDECREF(exc);
8357    Py_XDECREF(error_handler_obj);
8358    return NULL;
8359}
8360
8361/* Deprecated */
8362PyObject *
8363PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8364                        Py_ssize_t size,
8365                        PyObject *mapping,
8366                        const char *errors)
8367{
8368    PyObject *result;
8369    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8370    if (unicode == NULL)
8371        return NULL;
8372    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8373    Py_DECREF(unicode);
8374    return result;
8375}
8376
8377PyObject *
8378PyUnicode_AsCharmapString(PyObject *unicode,
8379                          PyObject *mapping)
8380{
8381    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8382        PyErr_BadArgument();
8383        return NULL;
8384    }
8385    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8386}
8387
8388/* create or adjust a UnicodeTranslateError */
8389static void
8390make_translate_exception(PyObject **exceptionObject,
8391                         PyObject *unicode,
8392                         Py_ssize_t startpos, Py_ssize_t endpos,
8393                         const char *reason)
8394{
8395    if (*exceptionObject == NULL) {
8396        *exceptionObject = _PyUnicodeTranslateError_Create(
8397            unicode, startpos, endpos, reason);
8398    }
8399    else {
8400        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8401            goto onError;
8402        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8403            goto onError;
8404        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8405            goto onError;
8406        return;
8407      onError:
8408        Py_CLEAR(*exceptionObject);
8409    }
8410}
8411
8412/* error handling callback helper:
8413   build arguments, call the callback and check the arguments,
8414   put the result into newpos and return the replacement string, which
8415   has to be freed by the caller */
8416static PyObject *
8417unicode_translate_call_errorhandler(const char *errors,
8418                                    PyObject **errorHandler,
8419                                    const char *reason,
8420                                    PyObject *unicode, PyObject **exceptionObject,
8421                                    Py_ssize_t startpos, Py_ssize_t endpos,
8422                                    Py_ssize_t *newpos)
8423{
8424    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8425
8426    Py_ssize_t i_newpos;
8427    PyObject *restuple;
8428    PyObject *resunicode;
8429
8430    if (*errorHandler == NULL) {
8431        *errorHandler = PyCodec_LookupError(errors);
8432        if (*errorHandler == NULL)
8433            return NULL;
8434    }
8435
8436    make_translate_exception(exceptionObject,
8437                             unicode, startpos, endpos, reason);
8438    if (*exceptionObject == NULL)
8439        return NULL;
8440
8441    restuple = PyObject_CallFunctionObjArgs(
8442        *errorHandler, *exceptionObject, NULL);
8443    if (restuple == NULL)
8444        return NULL;
8445    if (!PyTuple_Check(restuple)) {
8446        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8447        Py_DECREF(restuple);
8448        return NULL;
8449    }
8450    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8451                          &resunicode, &i_newpos)) {
8452        Py_DECREF(restuple);
8453        return NULL;
8454    }
8455    if (i_newpos<0)
8456        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8457    else
8458        *newpos = i_newpos;
8459    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8460        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8461        Py_DECREF(restuple);
8462        return NULL;
8463    }
8464    Py_INCREF(resunicode);
8465    Py_DECREF(restuple);
8466    return resunicode;
8467}
8468
8469/* Lookup the character ch in the mapping and put the result in result,
8470   which must be decrefed by the caller.
8471   Return 0 on success, -1 on error */
8472static int
8473charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8474{
8475    PyObject *w = PyLong_FromLong((long)c);
8476    PyObject *x;
8477
8478    if (w == NULL)
8479        return -1;
8480    x = PyObject_GetItem(mapping, w);
8481    Py_DECREF(w);
8482    if (x == NULL) {
8483        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8484            /* No mapping found means: use 1:1 mapping. */
8485            PyErr_Clear();
8486            *result = NULL;
8487            return 0;
8488        } else
8489            return -1;
8490    }
8491    else if (x == Py_None) {
8492        *result = x;
8493        return 0;
8494    }
8495    else if (PyLong_Check(x)) {
8496        long value = PyLong_AS_LONG(x);
8497        if (value < 0 || value > MAX_UNICODE) {
8498            PyErr_Format(PyExc_ValueError,
8499                         "character mapping must be in range(0x%x)",
8500                         MAX_UNICODE+1);
8501            Py_DECREF(x);
8502            return -1;
8503        }
8504        *result = x;
8505        return 0;
8506    }
8507    else if (PyUnicode_Check(x)) {
8508        *result = x;
8509        return 0;
8510    }
8511    else {
8512        /* wrong return value */
8513        PyErr_SetString(PyExc_TypeError,
8514                        "character mapping must return integer, None or str");
8515        Py_DECREF(x);
8516        return -1;
8517    }
8518}
8519
8520/* lookup the character, write the result into the writer.
8521   Return 1 if the result was written into the writer, return 0 if the mapping
8522   was undefined, raise an exception return -1 on error. */
8523static int
8524charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8525                        _PyUnicodeWriter *writer)
8526{
8527    PyObject *item;
8528
8529    if (charmaptranslate_lookup(ch, mapping, &item))
8530        return -1;
8531
8532    if (item == NULL) {
8533        /* not found => default to 1:1 mapping */
8534        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8535            return -1;
8536        }
8537        return 1;
8538    }
8539
8540    if (item == Py_None) {
8541        Py_DECREF(item);
8542        return 0;
8543    }
8544
8545    if (PyLong_Check(item)) {
8546        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8547        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8548           used it */
8549        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8550            Py_DECREF(item);
8551            return -1;
8552        }
8553        Py_DECREF(item);
8554        return 1;
8555    }
8556
8557    if (!PyUnicode_Check(item)) {
8558        Py_DECREF(item);
8559        return -1;
8560    }
8561
8562    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8563        Py_DECREF(item);
8564        return -1;
8565    }
8566
8567    Py_DECREF(item);
8568    return 1;
8569}
8570
8571static int
8572unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8573                              Py_UCS1 *translate)
8574{
8575    PyObject *item = NULL;
8576    int ret = 0;
8577
8578    if (charmaptranslate_lookup(ch, mapping, &item)) {
8579        return -1;
8580    }
8581
8582    if (item == Py_None) {
8583        /* deletion */
8584        translate[ch] = 0xfe;
8585    }
8586    else if (item == NULL) {
8587        /* not found => default to 1:1 mapping */
8588        translate[ch] = ch;
8589        return 1;
8590    }
8591    else if (PyLong_Check(item)) {
8592        long replace = PyLong_AS_LONG(item);
8593        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8594           used it */
8595        if (127 < replace) {
8596            /* invalid character or character outside ASCII:
8597               skip the fast translate */
8598            goto exit;
8599        }
8600        translate[ch] = (Py_UCS1)replace;
8601    }
8602    else if (PyUnicode_Check(item)) {
8603        Py_UCS4 replace;
8604
8605        if (PyUnicode_READY(item) == -1) {
8606            Py_DECREF(item);
8607            return -1;
8608        }
8609        if (PyUnicode_GET_LENGTH(item) != 1)
8610            goto exit;
8611
8612        replace = PyUnicode_READ_CHAR(item, 0);
8613        if (replace > 127)
8614            goto exit;
8615        translate[ch] = (Py_UCS1)replace;
8616    }
8617    else {
8618        /* not None, NULL, long or unicode */
8619        goto exit;
8620    }
8621    ret = 1;
8622
8623  exit:
8624    Py_DECREF(item);
8625    return ret;
8626}
8627
8628/* Fast path for ascii => ascii translation. Return 1 if the whole string
8629   was translated into writer, return 0 if the input string was partially
8630   translated into writer, raise an exception and return -1 on error. */
8631static int
8632unicode_fast_translate(PyObject *input, PyObject *mapping,
8633                       _PyUnicodeWriter *writer, int ignore)
8634{
8635    Py_UCS1 ascii_table[128], ch, ch2;
8636    Py_ssize_t len;
8637    Py_UCS1 *in, *end, *out;
8638    int res = 0;
8639
8640    if (PyUnicode_READY(input) == -1)
8641        return -1;
8642    if (!PyUnicode_IS_ASCII(input))
8643        return 0;
8644    len = PyUnicode_GET_LENGTH(input);
8645
8646    memset(ascii_table, 0xff, 128);
8647
8648    in = PyUnicode_1BYTE_DATA(input);
8649    end = in + len;
8650
8651    assert(PyUnicode_IS_ASCII(writer->buffer));
8652    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8653    out = PyUnicode_1BYTE_DATA(writer->buffer);
8654
8655    for (; in < end; in++) {
8656        ch = *in;
8657        ch2 = ascii_table[ch];
8658        if (ch2 == 0xff) {
8659            int translate = unicode_fast_translate_lookup(mapping, ch,
8660                                                          ascii_table);
8661            if (translate < 0)
8662                return -1;
8663            if (translate == 0)
8664                goto exit;
8665            ch2 = ascii_table[ch];
8666        }
8667        if (ch2 == 0xfe) {
8668            if (ignore)
8669                continue;
8670            goto exit;
8671        }
8672        assert(ch2 < 128);
8673        *out = ch2;
8674        out++;
8675    }
8676    res = 1;
8677
8678exit:
8679    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8680    return res;
8681}
8682
8683PyObject *
8684_PyUnicode_TranslateCharmap(PyObject *input,
8685                            PyObject *mapping,
8686                            const char *errors)
8687{
8688    /* input object */
8689    char *data;
8690    Py_ssize_t size, i;
8691    int kind;
8692    /* output buffer */
8693    _PyUnicodeWriter writer;
8694    /* error handler */
8695    char *reason = "character maps to <undefined>";
8696    PyObject *errorHandler = NULL;
8697    PyObject *exc = NULL;
8698    int ignore;
8699    int res;
8700
8701    if (mapping == NULL) {
8702        PyErr_BadArgument();
8703        return NULL;
8704    }
8705
8706    if (PyUnicode_READY(input) == -1)
8707        return NULL;
8708    data = (char*)PyUnicode_DATA(input);
8709    kind = PyUnicode_KIND(input);
8710    size = PyUnicode_GET_LENGTH(input);
8711
8712    if (size == 0) {
8713        Py_INCREF(input);
8714        return input;
8715    }
8716
8717    /* allocate enough for a simple 1:1 translation without
8718       replacements, if we need more, we'll resize */
8719    _PyUnicodeWriter_Init(&writer);
8720    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8721        goto onError;
8722
8723    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8724
8725    res = unicode_fast_translate(input, mapping, &writer, ignore);
8726    if (res < 0) {
8727        _PyUnicodeWriter_Dealloc(&writer);
8728        return NULL;
8729    }
8730    if (res == 1)
8731        return _PyUnicodeWriter_Finish(&writer);
8732
8733    i = writer.pos;
8734    while (i<size) {
8735        /* try to encode it */
8736        int translate;
8737        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8738        Py_ssize_t newpos;
8739        /* startpos for collecting untranslatable chars */
8740        Py_ssize_t collstart;
8741        Py_ssize_t collend;
8742        Py_UCS4 ch;
8743
8744        ch = PyUnicode_READ(kind, data, i);
8745        translate = charmaptranslate_output(ch, mapping, &writer);
8746        if (translate < 0)
8747            goto onError;
8748
8749        if (translate != 0) {
8750            /* it worked => adjust input pointer */
8751            ++i;
8752            continue;
8753        }
8754
8755        /* untranslatable character */
8756        collstart = i;
8757        collend = i+1;
8758
8759        /* find all untranslatable characters */
8760        while (collend < size) {
8761            PyObject *x;
8762            ch = PyUnicode_READ(kind, data, collend);
8763            if (charmaptranslate_lookup(ch, mapping, &x))
8764                goto onError;
8765            Py_XDECREF(x);
8766            if (x != Py_None)
8767                break;
8768            ++collend;
8769        }
8770
8771        if (ignore) {
8772            i = collend;
8773        }
8774        else {
8775            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8776                                                             reason, input, &exc,
8777                                                             collstart, collend, &newpos);
8778            if (repunicode == NULL)
8779                goto onError;
8780            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8781                Py_DECREF(repunicode);
8782                goto onError;
8783            }
8784            Py_DECREF(repunicode);
8785            i = newpos;
8786        }
8787    }
8788    Py_XDECREF(exc);
8789    Py_XDECREF(errorHandler);
8790    return _PyUnicodeWriter_Finish(&writer);
8791
8792  onError:
8793    _PyUnicodeWriter_Dealloc(&writer);
8794    Py_XDECREF(exc);
8795    Py_XDECREF(errorHandler);
8796    return NULL;
8797}
8798
8799/* Deprecated. Use PyUnicode_Translate instead. */
8800PyObject *
8801PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8802                           Py_ssize_t size,
8803                           PyObject *mapping,
8804                           const char *errors)
8805{
8806    PyObject *result;
8807    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8808    if (!unicode)
8809        return NULL;
8810    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8811    Py_DECREF(unicode);
8812    return result;
8813}
8814
8815PyObject *
8816PyUnicode_Translate(PyObject *str,
8817                    PyObject *mapping,
8818                    const char *errors)
8819{
8820    PyObject *result;
8821
8822    str = PyUnicode_FromObject(str);
8823    if (str == NULL)
8824        return NULL;
8825    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8826    Py_DECREF(str);
8827    return result;
8828}
8829
8830static Py_UCS4
8831fix_decimal_and_space_to_ascii(PyObject *self)
8832{
8833    /* No need to call PyUnicode_READY(self) because this function is only
8834       called as a callback from fixup() which does it already. */
8835    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8836    const int kind = PyUnicode_KIND(self);
8837    void *data = PyUnicode_DATA(self);
8838    Py_UCS4 maxchar = 127, ch, fixed;
8839    int modified = 0;
8840    Py_ssize_t i;
8841
8842    for (i = 0; i < len; ++i) {
8843        ch = PyUnicode_READ(kind, data, i);
8844        fixed = 0;
8845        if (ch > 127) {
8846            if (Py_UNICODE_ISSPACE(ch))
8847                fixed = ' ';
8848            else {
8849                const int decimal = Py_UNICODE_TODECIMAL(ch);
8850                if (decimal >= 0)
8851                    fixed = '0' + decimal;
8852            }
8853            if (fixed != 0) {
8854                modified = 1;
8855                maxchar = Py_MAX(maxchar, fixed);
8856                PyUnicode_WRITE(kind, data, i, fixed);
8857            }
8858            else
8859                maxchar = Py_MAX(maxchar, ch);
8860        }
8861    }
8862
8863    return (modified) ? maxchar : 0;
8864}
8865
8866PyObject *
8867_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8868{
8869    if (!PyUnicode_Check(unicode)) {
8870        PyErr_BadInternalCall();
8871        return NULL;
8872    }
8873    if (PyUnicode_READY(unicode) == -1)
8874        return NULL;
8875    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8876        /* If the string is already ASCII, just return the same string */
8877        Py_INCREF(unicode);
8878        return unicode;
8879    }
8880    return fixup(unicode, fix_decimal_and_space_to_ascii);
8881}
8882
8883PyObject *
8884PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8885                                  Py_ssize_t length)
8886{
8887    PyObject *decimal;
8888    Py_ssize_t i;
8889    Py_UCS4 maxchar;
8890    enum PyUnicode_Kind kind;
8891    void *data;
8892
8893    maxchar = 127;
8894    for (i = 0; i < length; i++) {
8895        Py_UCS4 ch = s[i];
8896        if (ch > 127) {
8897            int decimal = Py_UNICODE_TODECIMAL(ch);
8898            if (decimal >= 0)
8899                ch = '0' + decimal;
8900            maxchar = Py_MAX(maxchar, ch);
8901        }
8902    }
8903
8904    /* Copy to a new string */
8905    decimal = PyUnicode_New(length, maxchar);
8906    if (decimal == NULL)
8907        return decimal;
8908    kind = PyUnicode_KIND(decimal);
8909    data = PyUnicode_DATA(decimal);
8910    /* Iterate over code points */
8911    for (i = 0; i < length; i++) {
8912        Py_UCS4 ch = s[i];
8913        if (ch > 127) {
8914            int decimal = Py_UNICODE_TODECIMAL(ch);
8915            if (decimal >= 0)
8916                ch = '0' + decimal;
8917        }
8918        PyUnicode_WRITE(kind, data, i, ch);
8919    }
8920    return unicode_result(decimal);
8921}
8922/* --- Decimal Encoder ---------------------------------------------------- */
8923
8924int
8925PyUnicode_EncodeDecimal(Py_UNICODE *s,
8926                        Py_ssize_t length,
8927                        char *output,
8928                        const char *errors)
8929{
8930    PyObject *unicode;
8931    Py_ssize_t i;
8932    enum PyUnicode_Kind kind;
8933    void *data;
8934
8935    if (output == NULL) {
8936        PyErr_BadArgument();
8937        return -1;
8938    }
8939
8940    unicode = PyUnicode_FromUnicode(s, length);
8941    if (unicode == NULL)
8942        return -1;
8943
8944    if (PyUnicode_READY(unicode) == -1) {
8945        Py_DECREF(unicode);
8946        return -1;
8947    }
8948    kind = PyUnicode_KIND(unicode);
8949    data = PyUnicode_DATA(unicode);
8950
8951    for (i=0; i < length; ) {
8952        PyObject *exc;
8953        Py_UCS4 ch;
8954        int decimal;
8955        Py_ssize_t startpos;
8956
8957        ch = PyUnicode_READ(kind, data, i);
8958
8959        if (Py_UNICODE_ISSPACE(ch)) {
8960            *output++ = ' ';
8961            i++;
8962            continue;
8963        }
8964        decimal = Py_UNICODE_TODECIMAL(ch);
8965        if (decimal >= 0) {
8966            *output++ = '0' + decimal;
8967            i++;
8968            continue;
8969        }
8970        if (0 < ch && ch < 256) {
8971            *output++ = (char)ch;
8972            i++;
8973            continue;
8974        }
8975
8976        startpos = i;
8977        exc = NULL;
8978        raise_encode_exception(&exc, "decimal", unicode,
8979                               startpos, startpos+1,
8980                               "invalid decimal Unicode string");
8981        Py_XDECREF(exc);
8982        Py_DECREF(unicode);
8983        return -1;
8984    }
8985    /* 0-terminate the output string */
8986    *output++ = '\0';
8987    Py_DECREF(unicode);
8988    return 0;
8989}
8990
8991/* --- Helpers ------------------------------------------------------------ */
8992
/* Helper macro to fix up start/end slice values in place.

   end is clamped to len; a negative end or start is taken relative to len
   and then clamped to 0.  start is not clamped against len, so callers
   still have to handle start > end.

   start and end must be modifiable lvalues.  len is evaluated several
   times and must have no side effects.  The body is wrapped in
   do { } while (0) so that the macro behaves as one statement (for
   instance under an unbraced if); use it with a trailing semicolon. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
9007
9008static Py_ssize_t
9009any_find_slice(int direction, PyObject* s1, PyObject* s2,
9010               Py_ssize_t start,
9011               Py_ssize_t end)
9012{
9013    int kind1, kind2;
9014    void *buf1, *buf2;
9015    Py_ssize_t len1, len2, result;
9016
9017    kind1 = PyUnicode_KIND(s1);
9018    kind2 = PyUnicode_KIND(s2);
9019    if (kind1 < kind2)
9020        return -1;
9021
9022    len1 = PyUnicode_GET_LENGTH(s1);
9023    len2 = PyUnicode_GET_LENGTH(s2);
9024    ADJUST_INDICES(start, end, len1);
9025    if (end - start < len2)
9026        return -1;
9027
9028    buf1 = PyUnicode_DATA(s1);
9029    buf2 = PyUnicode_DATA(s2);
9030    if (len2 == 1) {
9031        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
9032        result = findchar((const char *)buf1 + kind1*start,
9033                          kind1, end - start, ch, direction);
9034        if (result == -1)
9035            return -1;
9036        else
9037            return start + result;
9038    }
9039
9040    if (kind2 != kind1) {
9041        buf2 = _PyUnicode_AsKind(s2, kind1);
9042        if (!buf2)
9043            return -2;
9044    }
9045
9046    if (direction > 0) {
9047        switch (kind1) {
9048        case PyUnicode_1BYTE_KIND:
9049            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9050                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9051            else
9052                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9053            break;
9054        case PyUnicode_2BYTE_KIND:
9055            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9056            break;
9057        case PyUnicode_4BYTE_KIND:
9058            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9059            break;
9060        default:
9061            assert(0); result = -2;
9062        }
9063    }
9064    else {
9065        switch (kind1) {
9066        case PyUnicode_1BYTE_KIND:
9067            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9068                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9069            else
9070                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9071            break;
9072        case PyUnicode_2BYTE_KIND:
9073            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9074            break;
9075        case PyUnicode_4BYTE_KIND:
9076            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9077            break;
9078        default:
9079            assert(0); result = -2;
9080        }
9081    }
9082
9083    if (kind2 != kind1)
9084        PyMem_Free(buf2);
9085
9086    return result;
9087}
9088
9089Py_ssize_t
9090_PyUnicode_InsertThousandsGrouping(
9091    PyObject *unicode, Py_ssize_t index,
9092    Py_ssize_t n_buffer,
9093    void *digits, Py_ssize_t n_digits,
9094    Py_ssize_t min_width,
9095    const char *grouping, PyObject *thousands_sep,
9096    Py_UCS4 *maxchar)
9097{
9098    unsigned int kind, thousands_sep_kind;
9099    char *data, *thousands_sep_data;
9100    Py_ssize_t thousands_sep_len;
9101    Py_ssize_t len;
9102
9103    if (unicode != NULL) {
9104        kind = PyUnicode_KIND(unicode);
9105        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9106    }
9107    else {
9108        kind = PyUnicode_1BYTE_KIND;
9109        data = NULL;
9110    }
9111    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9112    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9113    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9114    if (unicode != NULL && thousands_sep_kind != kind) {
9115        if (thousands_sep_kind < kind) {
9116            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9117            if (!thousands_sep_data)
9118                return -1;
9119        }
9120        else {
9121            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9122            if (!data)
9123                return -1;
9124        }
9125    }
9126
9127    switch (kind) {
9128    case PyUnicode_1BYTE_KIND:
9129        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9130            len = asciilib_InsertThousandsGrouping(
9131                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9132                min_width, grouping,
9133                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9134        else
9135            len = ucs1lib_InsertThousandsGrouping(
9136                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9137                min_width, grouping,
9138                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9139        break;
9140    case PyUnicode_2BYTE_KIND:
9141        len = ucs2lib_InsertThousandsGrouping(
9142            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9143            min_width, grouping,
9144            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9145        break;
9146    case PyUnicode_4BYTE_KIND:
9147        len = ucs4lib_InsertThousandsGrouping(
9148            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9149            min_width, grouping,
9150            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9151        break;
9152    default:
9153        assert(0);
9154        return -1;
9155    }
9156    if (unicode != NULL && thousands_sep_kind != kind) {
9157        if (thousands_sep_kind < kind)
9158            PyMem_Free(thousands_sep_data);
9159        else
9160            PyMem_Free(data);
9161    }
9162    if (unicode == NULL) {
9163        *maxchar = 127;
9164        if (len != n_digits) {
9165            *maxchar = Py_MAX(*maxchar,
9166                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9167        }
9168    }
9169    return len;
9170}
9171
9172
9173Py_ssize_t
9174PyUnicode_Count(PyObject *str,
9175                PyObject *substr,
9176                Py_ssize_t start,
9177                Py_ssize_t end)
9178{
9179    Py_ssize_t result;
9180    PyObject* str_obj;
9181    PyObject* sub_obj;
9182    int kind1, kind2;
9183    void *buf1 = NULL, *buf2 = NULL;
9184    Py_ssize_t len1, len2;
9185
9186    str_obj = PyUnicode_FromObject(str);
9187    if (!str_obj)
9188        return -1;
9189    sub_obj = PyUnicode_FromObject(substr);
9190    if (!sub_obj) {
9191        Py_DECREF(str_obj);
9192        return -1;
9193    }
9194    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9195        Py_DECREF(sub_obj);
9196        Py_DECREF(str_obj);
9197        return -1;
9198    }
9199
9200    kind1 = PyUnicode_KIND(str_obj);
9201    kind2 = PyUnicode_KIND(sub_obj);
9202    if (kind1 < kind2) {
9203        Py_DECREF(sub_obj);
9204        Py_DECREF(str_obj);
9205        return 0;
9206    }
9207
9208    len1 = PyUnicode_GET_LENGTH(str_obj);
9209    len2 = PyUnicode_GET_LENGTH(sub_obj);
9210    ADJUST_INDICES(start, end, len1);
9211    if (end - start < len2) {
9212        Py_DECREF(sub_obj);
9213        Py_DECREF(str_obj);
9214        return 0;
9215    }
9216
9217    buf1 = PyUnicode_DATA(str_obj);
9218    buf2 = PyUnicode_DATA(sub_obj);
9219    if (kind2 != kind1) {
9220        buf2 = _PyUnicode_AsKind(sub_obj, kind1);
9221        if (!buf2)
9222            goto onError;
9223    }
9224
9225    switch (kind1) {
9226    case PyUnicode_1BYTE_KIND:
9227        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9228            result = asciilib_count(
9229                ((Py_UCS1*)buf1) + start, end - start,
9230                buf2, len2, PY_SSIZE_T_MAX
9231                );
9232        else
9233            result = ucs1lib_count(
9234                ((Py_UCS1*)buf1) + start, end - start,
9235                buf2, len2, PY_SSIZE_T_MAX
9236                );
9237        break;
9238    case PyUnicode_2BYTE_KIND:
9239        result = ucs2lib_count(
9240            ((Py_UCS2*)buf1) + start, end - start,
9241            buf2, len2, PY_SSIZE_T_MAX
9242            );
9243        break;
9244    case PyUnicode_4BYTE_KIND:
9245        result = ucs4lib_count(
9246            ((Py_UCS4*)buf1) + start, end - start,
9247            buf2, len2, PY_SSIZE_T_MAX
9248            );
9249        break;
9250    default:
9251        assert(0); result = 0;
9252    }
9253
9254    Py_DECREF(sub_obj);
9255    Py_DECREF(str_obj);
9256
9257    if (kind2 != kind1)
9258        PyMem_Free(buf2);
9259
9260    return result;
9261  onError:
9262    Py_DECREF(sub_obj);
9263    Py_DECREF(str_obj);
9264    if (kind2 != kind1 && buf2)
9265        PyMem_Free(buf2);
9266    return -1;
9267}
9268
9269Py_ssize_t
9270PyUnicode_Find(PyObject *str,
9271               PyObject *sub,
9272               Py_ssize_t start,
9273               Py_ssize_t end,
9274               int direction)
9275{
9276    Py_ssize_t result;
9277
9278    str = PyUnicode_FromObject(str);
9279    if (!str)
9280        return -2;
9281    sub = PyUnicode_FromObject(sub);
9282    if (!sub) {
9283        Py_DECREF(str);
9284        return -2;
9285    }
9286    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9287        Py_DECREF(sub);
9288        Py_DECREF(str);
9289        return -2;
9290    }
9291
9292    result = any_find_slice(direction,
9293        str, sub, start, end
9294        );
9295
9296    Py_DECREF(str);
9297    Py_DECREF(sub);
9298
9299    return result;
9300}
9301
9302Py_ssize_t
9303PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9304                   Py_ssize_t start, Py_ssize_t end,
9305                   int direction)
9306{
9307    int kind;
9308    Py_ssize_t result;
9309    if (PyUnicode_READY(str) == -1)
9310        return -2;
9311    if (start < 0 || end < 0) {
9312        PyErr_SetString(PyExc_IndexError, "string index out of range");
9313        return -2;
9314    }
9315    if (end > PyUnicode_GET_LENGTH(str))
9316        end = PyUnicode_GET_LENGTH(str);
9317    if (start >= end)
9318        return -1;
9319    kind = PyUnicode_KIND(str);
9320    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9321                      kind, end-start, ch, direction);
9322    if (result == -1)
9323        return -1;
9324    else
9325        return start + result;
9326}
9327
9328static int
9329tailmatch(PyObject *self,
9330          PyObject *substring,
9331          Py_ssize_t start,
9332          Py_ssize_t end,
9333          int direction)
9334{
9335    int kind_self;
9336    int kind_sub;
9337    void *data_self;
9338    void *data_sub;
9339    Py_ssize_t offset;
9340    Py_ssize_t i;
9341    Py_ssize_t end_sub;
9342
9343    if (PyUnicode_READY(self) == -1 ||
9344        PyUnicode_READY(substring) == -1)
9345        return -1;
9346
9347    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9348    end -= PyUnicode_GET_LENGTH(substring);
9349    if (end < start)
9350        return 0;
9351
9352    if (PyUnicode_GET_LENGTH(substring) == 0)
9353        return 1;
9354
9355    kind_self = PyUnicode_KIND(self);
9356    data_self = PyUnicode_DATA(self);
9357    kind_sub = PyUnicode_KIND(substring);
9358    data_sub = PyUnicode_DATA(substring);
9359    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9360
9361    if (direction > 0)
9362        offset = end;
9363    else
9364        offset = start;
9365
9366    if (PyUnicode_READ(kind_self, data_self, offset) ==
9367        PyUnicode_READ(kind_sub, data_sub, 0) &&
9368        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9369        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9370        /* If both are of the same kind, memcmp is sufficient */
9371        if (kind_self == kind_sub) {
9372            return ! memcmp((char *)data_self +
9373                                (offset * PyUnicode_KIND(substring)),
9374                            data_sub,
9375                            PyUnicode_GET_LENGTH(substring) *
9376                                PyUnicode_KIND(substring));
9377        }
9378        /* otherwise we have to compare each character by first accesing it */
9379        else {
9380            /* We do not need to compare 0 and len(substring)-1 because
9381               the if statement above ensured already that they are equal
9382               when we end up here. */
9383            for (i = 1; i < end_sub; ++i) {
9384                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9385                    PyUnicode_READ(kind_sub, data_sub, i))
9386                    return 0;
9387            }
9388            return 1;
9389        }
9390    }
9391
9392    return 0;
9393}
9394
9395Py_ssize_t
9396PyUnicode_Tailmatch(PyObject *str,
9397                    PyObject *substr,
9398                    Py_ssize_t start,
9399                    Py_ssize_t end,
9400                    int direction)
9401{
9402    Py_ssize_t result;
9403
9404    str = PyUnicode_FromObject(str);
9405    if (str == NULL)
9406        return -1;
9407    substr = PyUnicode_FromObject(substr);
9408    if (substr == NULL) {
9409        Py_DECREF(str);
9410        return -1;
9411    }
9412
9413    result = tailmatch(str, substr,
9414                       start, end, direction);
9415    Py_DECREF(str);
9416    Py_DECREF(substr);
9417    return result;
9418}
9419
9420/* Apply fixfct filter to the Unicode object self and return a
9421   reference to the modified object */
9422
9423static PyObject *
9424fixup(PyObject *self,
9425      Py_UCS4 (*fixfct)(PyObject *s))
9426{
9427    PyObject *u;
9428    Py_UCS4 maxchar_old, maxchar_new = 0;
9429    PyObject *v;
9430
9431    u = _PyUnicode_Copy(self);
9432    if (u == NULL)
9433        return NULL;
9434    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9435
9436    /* fix functions return the new maximum character in a string,
9437       if the kind of the resulting unicode object does not change,
9438       everything is fine.  Otherwise we need to change the string kind
9439       and re-run the fix function. */
9440    maxchar_new = fixfct(u);
9441
9442    if (maxchar_new == 0) {
9443        /* no changes */;
9444        if (PyUnicode_CheckExact(self)) {
9445            Py_DECREF(u);
9446            Py_INCREF(self);
9447            return self;
9448        }
9449        else
9450            return u;
9451    }
9452
9453    maxchar_new = align_maxchar(maxchar_new);
9454
9455    if (maxchar_new == maxchar_old)
9456        return u;
9457
9458    /* In case the maximum character changed, we need to
9459       convert the string to the new category. */
9460    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9461    if (v == NULL) {
9462        Py_DECREF(u);
9463        return NULL;
9464    }
9465    if (maxchar_new > maxchar_old) {
9466        /* If the maxchar increased so that the kind changed, not all
9467           characters are representable anymore and we need to fix the
9468           string again. This only happens in very few cases. */
9469        _PyUnicode_FastCopyCharacters(v, 0,
9470                                      self, 0, PyUnicode_GET_LENGTH(self));
9471        maxchar_old = fixfct(v);
9472        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9473    }
9474    else {
9475        _PyUnicode_FastCopyCharacters(v, 0,
9476                                      u, 0, PyUnicode_GET_LENGTH(self));
9477    }
9478    Py_DECREF(u);
9479    assert(_PyUnicode_CheckConsistency(v, 1));
9480    return v;
9481}
9482
9483static PyObject *
9484ascii_upper_or_lower(PyObject *self, int lower)
9485{
9486    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9487    char *resdata, *data = PyUnicode_DATA(self);
9488    PyObject *res;
9489
9490    res = PyUnicode_New(len, 127);
9491    if (res == NULL)
9492        return NULL;
9493    resdata = PyUnicode_DATA(res);
9494    if (lower)
9495        _Py_bytes_lower(resdata, data, len);
9496    else
9497        _Py_bytes_upper(resdata, data, len);
9498    return res;
9499}
9500
9501static Py_UCS4
9502handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9503{
9504    Py_ssize_t j;
9505    int final_sigma;
9506    Py_UCS4 c = 0;   /* initialize to prevent gcc warning */
9507    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9508
9509     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9510
9511    where ! is a negation and \p{xxx} is a character with property xxx.
9512    */
9513    for (j = i - 1; j >= 0; j--) {
9514        c = PyUnicode_READ(kind, data, j);
9515        if (!_PyUnicode_IsCaseIgnorable(c))
9516            break;
9517    }
9518    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9519    if (final_sigma) {
9520        for (j = i + 1; j < length; j++) {
9521            c = PyUnicode_READ(kind, data, j);
9522            if (!_PyUnicode_IsCaseIgnorable(c))
9523                break;
9524        }
9525        final_sigma = j == length || !_PyUnicode_IsCased(c);
9526    }
9527    return (final_sigma) ? 0x3C2 : 0x3C3;
9528}
9529
9530static int
9531lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9532           Py_UCS4 c, Py_UCS4 *mapped)
9533{
9534    /* Obscure special case. */
9535    if (c == 0x3A3) {
9536        mapped[0] = handle_capital_sigma(kind, data, length, i);
9537        return 1;
9538    }
9539    return _PyUnicode_ToLowerFull(c, mapped);
9540}
9541
9542static Py_ssize_t
9543do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9544{
9545    Py_ssize_t i, k = 0;
9546    int n_res, j;
9547    Py_UCS4 c, mapped[3];
9548
9549    c = PyUnicode_READ(kind, data, 0);
9550    n_res = _PyUnicode_ToUpperFull(c, mapped);
9551    for (j = 0; j < n_res; j++) {
9552        *maxchar = Py_MAX(*maxchar, mapped[j]);
9553        res[k++] = mapped[j];
9554    }
9555    for (i = 1; i < length; i++) {
9556        c = PyUnicode_READ(kind, data, i);
9557        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9558        for (j = 0; j < n_res; j++) {
9559            *maxchar = Py_MAX(*maxchar, mapped[j]);
9560            res[k++] = mapped[j];
9561        }
9562    }
9563    return k;
9564}
9565
9566static Py_ssize_t
9567do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9568    Py_ssize_t i, k = 0;
9569
9570    for (i = 0; i < length; i++) {
9571        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9572        int n_res, j;
9573        if (Py_UNICODE_ISUPPER(c)) {
9574            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9575        }
9576        else if (Py_UNICODE_ISLOWER(c)) {
9577            n_res = _PyUnicode_ToUpperFull(c, mapped);
9578        }
9579        else {
9580            n_res = 1;
9581            mapped[0] = c;
9582        }
9583        for (j = 0; j < n_res; j++) {
9584            *maxchar = Py_MAX(*maxchar, mapped[j]);
9585            res[k++] = mapped[j];
9586        }
9587    }
9588    return k;
9589}
9590
9591static Py_ssize_t
9592do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9593                  Py_UCS4 *maxchar, int lower)
9594{
9595    Py_ssize_t i, k = 0;
9596
9597    for (i = 0; i < length; i++) {
9598        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9599        int n_res, j;
9600        if (lower)
9601            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9602        else
9603            n_res = _PyUnicode_ToUpperFull(c, mapped);
9604        for (j = 0; j < n_res; j++) {
9605            *maxchar = Py_MAX(*maxchar, mapped[j]);
9606            res[k++] = mapped[j];
9607        }
9608    }
9609    return k;
9610}
9611
9612static Py_ssize_t
9613do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9614{
9615    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9616}
9617
9618static Py_ssize_t
9619do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9620{
9621    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9622}
9623
9624static Py_ssize_t
9625do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9626{
9627    Py_ssize_t i, k = 0;
9628
9629    for (i = 0; i < length; i++) {
9630        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9631        Py_UCS4 mapped[3];
9632        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9633        for (j = 0; j < n_res; j++) {
9634            *maxchar = Py_MAX(*maxchar, mapped[j]);
9635            res[k++] = mapped[j];
9636        }
9637    }
9638    return k;
9639}
9640
9641static Py_ssize_t
9642do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9643{
9644    Py_ssize_t i, k = 0;
9645    int previous_is_cased;
9646
9647    previous_is_cased = 0;
9648    for (i = 0; i < length; i++) {
9649        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9650        Py_UCS4 mapped[3];
9651        int n_res, j;
9652
9653        if (previous_is_cased)
9654            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9655        else
9656            n_res = _PyUnicode_ToTitleFull(c, mapped);
9657
9658        for (j = 0; j < n_res; j++) {
9659            *maxchar = Py_MAX(*maxchar, mapped[j]);
9660            res[k++] = mapped[j];
9661        }
9662
9663        previous_is_cased = _PyUnicode_IsCased(c);
9664    }
9665    return k;
9666}
9667
9668static PyObject *
9669case_operation(PyObject *self,
9670               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9671{
9672    PyObject *res = NULL;
9673    Py_ssize_t length, newlength = 0;
9674    int kind, outkind;
9675    void *data, *outdata;
9676    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9677
9678    assert(PyUnicode_IS_READY(self));
9679
9680    kind = PyUnicode_KIND(self);
9681    data = PyUnicode_DATA(self);
9682    length = PyUnicode_GET_LENGTH(self);
9683    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9684        PyErr_SetString(PyExc_OverflowError, "string is too long");
9685        return NULL;
9686    }
9687    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9688    if (tmp == NULL)
9689        return PyErr_NoMemory();
9690    newlength = perform(kind, data, length, tmp, &maxchar);
9691    res = PyUnicode_New(newlength, maxchar);
9692    if (res == NULL)
9693        goto leave;
9694    tmpend = tmp + newlength;
9695    outdata = PyUnicode_DATA(res);
9696    outkind = PyUnicode_KIND(res);
9697    switch (outkind) {
9698    case PyUnicode_1BYTE_KIND:
9699        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9700        break;
9701    case PyUnicode_2BYTE_KIND:
9702        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9703        break;
9704    case PyUnicode_4BYTE_KIND:
9705        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9706        break;
9707    default:
9708        assert(0);
9709        break;
9710    }
9711  leave:
9712    PyMem_FREE(tmp);
9713    return res;
9714}
9715
9716PyObject *
9717PyUnicode_Join(PyObject *separator, PyObject *seq)
9718{
9719    PyObject *sep = NULL;
9720    Py_ssize_t seplen;
9721    PyObject *res = NULL; /* the result */
9722    PyObject *fseq;          /* PySequence_Fast(seq) */
9723    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9724    PyObject **items;
9725    PyObject *item;
9726    Py_ssize_t sz, i, res_offset;
9727    Py_UCS4 maxchar;
9728    Py_UCS4 item_maxchar;
9729    int use_memcpy;
9730    unsigned char *res_data = NULL, *sep_data = NULL;
9731    PyObject *last_obj;
9732    unsigned int kind = 0;
9733
9734    fseq = PySequence_Fast(seq, "can only join an iterable");
9735    if (fseq == NULL) {
9736        return NULL;
9737    }
9738
9739    /* NOTE: the following code can't call back into Python code,
9740     * so we are sure that fseq won't be mutated.
9741     */
9742
9743    seqlen = PySequence_Fast_GET_SIZE(fseq);
9744    /* If empty sequence, return u"". */
9745    if (seqlen == 0) {
9746        Py_DECREF(fseq);
9747        _Py_RETURN_UNICODE_EMPTY();
9748    }
9749
9750    /* If singleton sequence with an exact Unicode, return that. */
9751    last_obj = NULL;
9752    items = PySequence_Fast_ITEMS(fseq);
9753    if (seqlen == 1) {
9754        if (PyUnicode_CheckExact(items[0])) {
9755            res = items[0];
9756            Py_INCREF(res);
9757            Py_DECREF(fseq);
9758            return res;
9759        }
9760        seplen = 0;
9761        maxchar = 0;
9762    }
9763    else {
9764        /* Set up sep and seplen */
9765        if (separator == NULL) {
9766            /* fall back to a blank space separator */
9767            sep = PyUnicode_FromOrdinal(' ');
9768            if (!sep)
9769                goto onError;
9770            seplen = 1;
9771            maxchar = 32;
9772        }
9773        else {
9774            if (!PyUnicode_Check(separator)) {
9775                PyErr_Format(PyExc_TypeError,
9776                             "separator: expected str instance,"
9777                             " %.80s found",
9778                             Py_TYPE(separator)->tp_name);
9779                goto onError;
9780            }
9781            if (PyUnicode_READY(separator))
9782                goto onError;
9783            sep = separator;
9784            seplen = PyUnicode_GET_LENGTH(separator);
9785            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9786            /* inc refcount to keep this code path symmetric with the
9787               above case of a blank separator */
9788            Py_INCREF(sep);
9789        }
9790        last_obj = sep;
9791    }
9792
9793    /* There are at least two things to join, or else we have a subclass
9794     * of str in the sequence.
9795     * Do a pre-pass to figure out the total amount of space we'll
9796     * need (sz), and see whether all argument are strings.
9797     */
9798    sz = 0;
9799#ifdef Py_DEBUG
9800    use_memcpy = 0;
9801#else
9802    use_memcpy = 1;
9803#endif
9804    for (i = 0; i < seqlen; i++) {
9805        const Py_ssize_t old_sz = sz;
9806        item = items[i];
9807        if (!PyUnicode_Check(item)) {
9808            PyErr_Format(PyExc_TypeError,
9809                         "sequence item %zd: expected str instance,"
9810                         " %.80s found",
9811                         i, Py_TYPE(item)->tp_name);
9812            goto onError;
9813        }
9814        if (PyUnicode_READY(item) == -1)
9815            goto onError;
9816        sz += PyUnicode_GET_LENGTH(item);
9817        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9818        maxchar = Py_MAX(maxchar, item_maxchar);
9819        if (i != 0)
9820            sz += seplen;
9821        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9822            PyErr_SetString(PyExc_OverflowError,
9823                            "join() result is too long for a Python string");
9824            goto onError;
9825        }
9826        if (use_memcpy && last_obj != NULL) {
9827            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9828                use_memcpy = 0;
9829        }
9830        last_obj = item;
9831    }
9832
9833    res = PyUnicode_New(sz, maxchar);
9834    if (res == NULL)
9835        goto onError;
9836
9837    /* Catenate everything. */
9838#ifdef Py_DEBUG
9839    use_memcpy = 0;
9840#else
9841    if (use_memcpy) {
9842        res_data = PyUnicode_1BYTE_DATA(res);
9843        kind = PyUnicode_KIND(res);
9844        if (seplen != 0)
9845            sep_data = PyUnicode_1BYTE_DATA(sep);
9846    }
9847#endif
9848    if (use_memcpy) {
9849        for (i = 0; i < seqlen; ++i) {
9850            Py_ssize_t itemlen;
9851            item = items[i];
9852
9853            /* Copy item, and maybe the separator. */
9854            if (i && seplen != 0) {
9855                Py_MEMCPY(res_data,
9856                          sep_data,
9857                          kind * seplen);
9858                res_data += kind * seplen;
9859            }
9860
9861            itemlen = PyUnicode_GET_LENGTH(item);
9862            if (itemlen != 0) {
9863                Py_MEMCPY(res_data,
9864                          PyUnicode_DATA(item),
9865                          kind * itemlen);
9866                res_data += kind * itemlen;
9867            }
9868        }
9869        assert(res_data == PyUnicode_1BYTE_DATA(res)
9870                           + kind * PyUnicode_GET_LENGTH(res));
9871    }
9872    else {
9873        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9874            Py_ssize_t itemlen;
9875            item = items[i];
9876
9877            /* Copy item, and maybe the separator. */
9878            if (i && seplen != 0) {
9879                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9880                res_offset += seplen;
9881            }
9882
9883            itemlen = PyUnicode_GET_LENGTH(item);
9884            if (itemlen != 0) {
9885                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9886                res_offset += itemlen;
9887            }
9888        }
9889        assert(res_offset == PyUnicode_GET_LENGTH(res));
9890    }
9891
9892    Py_DECREF(fseq);
9893    Py_XDECREF(sep);
9894    assert(_PyUnicode_CheckConsistency(res, 1));
9895    return res;
9896
9897  onError:
9898    Py_DECREF(fseq);
9899    Py_XDECREF(sep);
9900    Py_XDECREF(res);
9901    return NULL;
9902}
9903
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 byte
   kind); the wchar kind is not supported.

   Every argument is parenthesized in the expansion so that expression
   arguments are not mis-parsed.  Note that `value` and `length` may still
   be evaluated more than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9927
9928void
9929_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9930                    Py_UCS4 fill_char)
9931{
9932    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9933    const void *data = PyUnicode_DATA(unicode);
9934    assert(PyUnicode_IS_READY(unicode));
9935    assert(unicode_modifiable(unicode));
9936    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9937    assert(start >= 0);
9938    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9939    FILL(kind, data, fill_char, start, length);
9940}
9941
9942Py_ssize_t
9943PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9944               Py_UCS4 fill_char)
9945{
9946    Py_ssize_t maxlen;
9947
9948    if (!PyUnicode_Check(unicode)) {
9949        PyErr_BadInternalCall();
9950        return -1;
9951    }
9952    if (PyUnicode_READY(unicode) == -1)
9953        return -1;
9954    if (unicode_check_modifiable(unicode))
9955        return -1;
9956
9957    if (start < 0) {
9958        PyErr_SetString(PyExc_IndexError, "string index out of range");
9959        return -1;
9960    }
9961    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9962        PyErr_SetString(PyExc_ValueError,
9963                         "fill character is bigger than "
9964                         "the string maximum character");
9965        return -1;
9966    }
9967
9968    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9969    length = Py_MIN(maxlen, length);
9970    if (length <= 0)
9971        return 0;
9972
9973    _PyUnicode_FastFill(unicode, start, length, fill_char);
9974    return length;
9975}
9976
9977static PyObject *
9978pad(PyObject *self,
9979    Py_ssize_t left,
9980    Py_ssize_t right,
9981    Py_UCS4 fill)
9982{
9983    PyObject *u;
9984    Py_UCS4 maxchar;
9985    int kind;
9986    void *data;
9987
9988    if (left < 0)
9989        left = 0;
9990    if (right < 0)
9991        right = 0;
9992
9993    if (left == 0 && right == 0)
9994        return unicode_result_unchanged(self);
9995
9996    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9997        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9998        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9999        return NULL;
10000    }
10001    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10002    maxchar = Py_MAX(maxchar, fill);
10003    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10004    if (!u)
10005        return NULL;
10006
10007    kind = PyUnicode_KIND(u);
10008    data = PyUnicode_DATA(u);
10009    if (left)
10010        FILL(kind, data, fill, 0, left);
10011    if (right)
10012        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10013    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10014    assert(_PyUnicode_CheckConsistency(u, 1));
10015    return u;
10016}
10017
10018PyObject *
10019PyUnicode_Splitlines(PyObject *string, int keepends)
10020{
10021    PyObject *list;
10022
10023    string = PyUnicode_FromObject(string);
10024    if (string == NULL)
10025        return NULL;
10026    if (PyUnicode_READY(string) == -1) {
10027        Py_DECREF(string);
10028        return NULL;
10029    }
10030
10031    switch (PyUnicode_KIND(string)) {
10032    case PyUnicode_1BYTE_KIND:
10033        if (PyUnicode_IS_ASCII(string))
10034            list = asciilib_splitlines(
10035                string, PyUnicode_1BYTE_DATA(string),
10036                PyUnicode_GET_LENGTH(string), keepends);
10037        else
10038            list = ucs1lib_splitlines(
10039                string, PyUnicode_1BYTE_DATA(string),
10040                PyUnicode_GET_LENGTH(string), keepends);
10041        break;
10042    case PyUnicode_2BYTE_KIND:
10043        list = ucs2lib_splitlines(
10044            string, PyUnicode_2BYTE_DATA(string),
10045            PyUnicode_GET_LENGTH(string), keepends);
10046        break;
10047    case PyUnicode_4BYTE_KIND:
10048        list = ucs4lib_splitlines(
10049            string, PyUnicode_4BYTE_DATA(string),
10050            PyUnicode_GET_LENGTH(string), keepends);
10051        break;
10052    default:
10053        assert(0);
10054        list = 0;
10055    }
10056    Py_DECREF(string);
10057    return list;
10058}
10059
10060static PyObject *
10061split(PyObject *self,
10062      PyObject *substring,
10063      Py_ssize_t maxcount)
10064{
10065    int kind1, kind2;
10066    void *buf1, *buf2;
10067    Py_ssize_t len1, len2;
10068    PyObject* out;
10069
10070    if (maxcount < 0)
10071        maxcount = PY_SSIZE_T_MAX;
10072
10073    if (PyUnicode_READY(self) == -1)
10074        return NULL;
10075
10076    if (substring == NULL)
10077        switch (PyUnicode_KIND(self)) {
10078        case PyUnicode_1BYTE_KIND:
10079            if (PyUnicode_IS_ASCII(self))
10080                return asciilib_split_whitespace(
10081                    self,  PyUnicode_1BYTE_DATA(self),
10082                    PyUnicode_GET_LENGTH(self), maxcount
10083                    );
10084            else
10085                return ucs1lib_split_whitespace(
10086                    self,  PyUnicode_1BYTE_DATA(self),
10087                    PyUnicode_GET_LENGTH(self), maxcount
10088                    );
10089        case PyUnicode_2BYTE_KIND:
10090            return ucs2lib_split_whitespace(
10091                self,  PyUnicode_2BYTE_DATA(self),
10092                PyUnicode_GET_LENGTH(self), maxcount
10093                );
10094        case PyUnicode_4BYTE_KIND:
10095            return ucs4lib_split_whitespace(
10096                self,  PyUnicode_4BYTE_DATA(self),
10097                PyUnicode_GET_LENGTH(self), maxcount
10098                );
10099        default:
10100            assert(0);
10101            return NULL;
10102        }
10103
10104    if (PyUnicode_READY(substring) == -1)
10105        return NULL;
10106
10107    kind1 = PyUnicode_KIND(self);
10108    kind2 = PyUnicode_KIND(substring);
10109    len1 = PyUnicode_GET_LENGTH(self);
10110    len2 = PyUnicode_GET_LENGTH(substring);
10111    if (kind1 < kind2 || len1 < len2) {
10112        out = PyList_New(1);
10113        if (out == NULL)
10114            return NULL;
10115        Py_INCREF(self);
10116        PyList_SET_ITEM(out, 0, self);
10117        return out;
10118    }
10119    buf1 = PyUnicode_DATA(self);
10120    buf2 = PyUnicode_DATA(substring);
10121    if (kind2 != kind1) {
10122        buf2 = _PyUnicode_AsKind(substring, kind1);
10123        if (!buf2)
10124            return NULL;
10125    }
10126
10127    switch (kind1) {
10128    case PyUnicode_1BYTE_KIND:
10129        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10130            out = asciilib_split(
10131                self,  buf1, len1, buf2, len2, maxcount);
10132        else
10133            out = ucs1lib_split(
10134                self,  buf1, len1, buf2, len2, maxcount);
10135        break;
10136    case PyUnicode_2BYTE_KIND:
10137        out = ucs2lib_split(
10138            self,  buf1, len1, buf2, len2, maxcount);
10139        break;
10140    case PyUnicode_4BYTE_KIND:
10141        out = ucs4lib_split(
10142            self,  buf1, len1, buf2, len2, maxcount);
10143        break;
10144    default:
10145        out = NULL;
10146    }
10147    if (kind2 != kind1)
10148        PyMem_Free(buf2);
10149    return out;
10150}
10151
10152static PyObject *
10153rsplit(PyObject *self,
10154       PyObject *substring,
10155       Py_ssize_t maxcount)
10156{
10157    int kind1, kind2;
10158    void *buf1, *buf2;
10159    Py_ssize_t len1, len2;
10160    PyObject* out;
10161
10162    if (maxcount < 0)
10163        maxcount = PY_SSIZE_T_MAX;
10164
10165    if (PyUnicode_READY(self) == -1)
10166        return NULL;
10167
10168    if (substring == NULL)
10169        switch (PyUnicode_KIND(self)) {
10170        case PyUnicode_1BYTE_KIND:
10171            if (PyUnicode_IS_ASCII(self))
10172                return asciilib_rsplit_whitespace(
10173                    self,  PyUnicode_1BYTE_DATA(self),
10174                    PyUnicode_GET_LENGTH(self), maxcount
10175                    );
10176            else
10177                return ucs1lib_rsplit_whitespace(
10178                    self,  PyUnicode_1BYTE_DATA(self),
10179                    PyUnicode_GET_LENGTH(self), maxcount
10180                    );
10181        case PyUnicode_2BYTE_KIND:
10182            return ucs2lib_rsplit_whitespace(
10183                self,  PyUnicode_2BYTE_DATA(self),
10184                PyUnicode_GET_LENGTH(self), maxcount
10185                );
10186        case PyUnicode_4BYTE_KIND:
10187            return ucs4lib_rsplit_whitespace(
10188                self,  PyUnicode_4BYTE_DATA(self),
10189                PyUnicode_GET_LENGTH(self), maxcount
10190                );
10191        default:
10192            assert(0);
10193            return NULL;
10194        }
10195
10196    if (PyUnicode_READY(substring) == -1)
10197        return NULL;
10198
10199    kind1 = PyUnicode_KIND(self);
10200    kind2 = PyUnicode_KIND(substring);
10201    len1 = PyUnicode_GET_LENGTH(self);
10202    len2 = PyUnicode_GET_LENGTH(substring);
10203    if (kind1 < kind2 || len1 < len2) {
10204        out = PyList_New(1);
10205        if (out == NULL)
10206            return NULL;
10207        Py_INCREF(self);
10208        PyList_SET_ITEM(out, 0, self);
10209        return out;
10210    }
10211    buf1 = PyUnicode_DATA(self);
10212    buf2 = PyUnicode_DATA(substring);
10213    if (kind2 != kind1) {
10214        buf2 = _PyUnicode_AsKind(substring, kind1);
10215        if (!buf2)
10216            return NULL;
10217    }
10218
10219    switch (kind1) {
10220    case PyUnicode_1BYTE_KIND:
10221        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10222            out = asciilib_rsplit(
10223                self,  buf1, len1, buf2, len2, maxcount);
10224        else
10225            out = ucs1lib_rsplit(
10226                self,  buf1, len1, buf2, len2, maxcount);
10227        break;
10228    case PyUnicode_2BYTE_KIND:
10229        out = ucs2lib_rsplit(
10230            self,  buf1, len1, buf2, len2, maxcount);
10231        break;
10232    case PyUnicode_4BYTE_KIND:
10233        out = ucs4lib_rsplit(
10234            self,  buf1, len1, buf2, len2, maxcount);
10235        break;
10236    default:
10237        out = NULL;
10238    }
10239    if (kind2 != kind1)
10240        PyMem_Free(buf2);
10241    return out;
10242}
10243
10244static Py_ssize_t
10245anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10246            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10247{
10248    switch (kind) {
10249    case PyUnicode_1BYTE_KIND:
10250        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10251            return asciilib_find(buf1, len1, buf2, len2, offset);
10252        else
10253            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10254    case PyUnicode_2BYTE_KIND:
10255        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10256    case PyUnicode_4BYTE_KIND:
10257        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10258    }
10259    assert(0);
10260    return -1;
10261}
10262
10263static Py_ssize_t
10264anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10265             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10266{
10267    switch (kind) {
10268    case PyUnicode_1BYTE_KIND:
10269        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10270            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10271        else
10272            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10273    case PyUnicode_2BYTE_KIND:
10274        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10275    case PyUnicode_4BYTE_KIND:
10276        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10277    }
10278    assert(0);
10279    return 0;
10280}
10281
10282static void
10283replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10284                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10285{
10286    int kind = PyUnicode_KIND(u);
10287    void *data = PyUnicode_DATA(u);
10288    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10289    if (kind == PyUnicode_1BYTE_KIND) {
10290        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10291                                      (Py_UCS1 *)data + len,
10292                                      u1, u2, maxcount);
10293    }
10294    else if (kind == PyUnicode_2BYTE_KIND) {
10295        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10296                                      (Py_UCS2 *)data + len,
10297                                      u1, u2, maxcount);
10298    }
10299    else {
10300        assert(kind == PyUnicode_4BYTE_KIND);
10301        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10302                                      (Py_UCS4 *)data + len,
10303                                      u1, u2, maxcount);
10304    }
10305}
10306
10307static PyObject *
10308replace(PyObject *self, PyObject *str1,
10309        PyObject *str2, Py_ssize_t maxcount)
10310{
10311    PyObject *u;
10312    char *sbuf = PyUnicode_DATA(self);
10313    char *buf1 = PyUnicode_DATA(str1);
10314    char *buf2 = PyUnicode_DATA(str2);
10315    int srelease = 0, release1 = 0, release2 = 0;
10316    int skind = PyUnicode_KIND(self);
10317    int kind1 = PyUnicode_KIND(str1);
10318    int kind2 = PyUnicode_KIND(str2);
10319    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10320    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10321    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10322    int mayshrink;
10323    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10324
10325    if (maxcount < 0)
10326        maxcount = PY_SSIZE_T_MAX;
10327    else if (maxcount == 0 || slen == 0)
10328        goto nothing;
10329
10330    if (str1 == str2)
10331        goto nothing;
10332
10333    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10334    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10335    if (maxchar < maxchar_str1)
10336        /* substring too wide to be present */
10337        goto nothing;
10338    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10339    /* Replacing str1 with str2 may cause a maxchar reduction in the
10340       result string. */
10341    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10342    maxchar = Py_MAX(maxchar, maxchar_str2);
10343
10344    if (len1 == len2) {
10345        /* same length */
10346        if (len1 == 0)
10347            goto nothing;
10348        if (len1 == 1) {
10349            /* replace characters */
10350            Py_UCS4 u1, u2;
10351            Py_ssize_t pos;
10352
10353            u1 = PyUnicode_READ(kind1, buf1, 0);
10354            pos = findchar(sbuf, skind, slen, u1, 1);
10355            if (pos < 0)
10356                goto nothing;
10357            u2 = PyUnicode_READ(kind2, buf2, 0);
10358            u = PyUnicode_New(slen, maxchar);
10359            if (!u)
10360                goto error;
10361
10362            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10363            replace_1char_inplace(u, pos, u1, u2, maxcount);
10364        }
10365        else {
10366            int rkind = skind;
10367            char *res;
10368            Py_ssize_t i;
10369
10370            if (kind1 < rkind) {
10371                /* widen substring */
10372                buf1 = _PyUnicode_AsKind(str1, rkind);
10373                if (!buf1) goto error;
10374                release1 = 1;
10375            }
10376            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10377            if (i < 0)
10378                goto nothing;
10379            if (rkind > kind2) {
10380                /* widen replacement */
10381                buf2 = _PyUnicode_AsKind(str2, rkind);
10382                if (!buf2) goto error;
10383                release2 = 1;
10384            }
10385            else if (rkind < kind2) {
10386                /* widen self and buf1 */
10387                rkind = kind2;
10388                if (release1) PyMem_Free(buf1);
10389                release1 = 0;
10390                sbuf = _PyUnicode_AsKind(self, rkind);
10391                if (!sbuf) goto error;
10392                srelease = 1;
10393                buf1 = _PyUnicode_AsKind(str1, rkind);
10394                if (!buf1) goto error;
10395                release1 = 1;
10396            }
10397            u = PyUnicode_New(slen, maxchar);
10398            if (!u)
10399                goto error;
10400            assert(PyUnicode_KIND(u) == rkind);
10401            res = PyUnicode_DATA(u);
10402
10403            memcpy(res, sbuf, rkind * slen);
10404            /* change everything in-place, starting with this one */
10405            memcpy(res + rkind * i,
10406                   buf2,
10407                   rkind * len2);
10408            i += len1;
10409
10410            while ( --maxcount > 0) {
10411                i = anylib_find(rkind, self,
10412                                sbuf+rkind*i, slen-i,
10413                                str1, buf1, len1, i);
10414                if (i == -1)
10415                    break;
10416                memcpy(res + rkind * i,
10417                       buf2,
10418                       rkind * len2);
10419                i += len1;
10420            }
10421        }
10422    }
10423    else {
10424        Py_ssize_t n, i, j, ires;
10425        Py_ssize_t new_size;
10426        int rkind = skind;
10427        char *res;
10428
10429        if (kind1 < rkind) {
10430            /* widen substring */
10431            buf1 = _PyUnicode_AsKind(str1, rkind);
10432            if (!buf1) goto error;
10433            release1 = 1;
10434        }
10435        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10436        if (n == 0)
10437            goto nothing;
10438        if (kind2 < rkind) {
10439            /* widen replacement */
10440            buf2 = _PyUnicode_AsKind(str2, rkind);
10441            if (!buf2) goto error;
10442            release2 = 1;
10443        }
10444        else if (kind2 > rkind) {
10445            /* widen self and buf1 */
10446            rkind = kind2;
10447            sbuf = _PyUnicode_AsKind(self, rkind);
10448            if (!sbuf) goto error;
10449            srelease = 1;
10450            if (release1) PyMem_Free(buf1);
10451            release1 = 0;
10452            buf1 = _PyUnicode_AsKind(str1, rkind);
10453            if (!buf1) goto error;
10454            release1 = 1;
10455        }
10456        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10457           PyUnicode_GET_LENGTH(str1))); */
10458        if (len1 < len2 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10459                PyErr_SetString(PyExc_OverflowError,
10460                                "replace string is too long");
10461                goto error;
10462        }
10463        new_size = slen + n * (len2 - len1);
10464        if (new_size == 0) {
10465            _Py_INCREF_UNICODE_EMPTY();
10466            if (!unicode_empty)
10467                goto error;
10468            u = unicode_empty;
10469            goto done;
10470        }
10471        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10472            PyErr_SetString(PyExc_OverflowError,
10473                            "replace string is too long");
10474            goto error;
10475        }
10476        u = PyUnicode_New(new_size, maxchar);
10477        if (!u)
10478            goto error;
10479        assert(PyUnicode_KIND(u) == rkind);
10480        res = PyUnicode_DATA(u);
10481        ires = i = 0;
10482        if (len1 > 0) {
10483            while (n-- > 0) {
10484                /* look for next match */
10485                j = anylib_find(rkind, self,
10486                                sbuf + rkind * i, slen-i,
10487                                str1, buf1, len1, i);
10488                if (j == -1)
10489                    break;
10490                else if (j > i) {
10491                    /* copy unchanged part [i:j] */
10492                    memcpy(res + rkind * ires,
10493                           sbuf + rkind * i,
10494                           rkind * (j-i));
10495                    ires += j - i;
10496                }
10497                /* copy substitution string */
10498                if (len2 > 0) {
10499                    memcpy(res + rkind * ires,
10500                           buf2,
10501                           rkind * len2);
10502                    ires += len2;
10503                }
10504                i = j + len1;
10505            }
10506            if (i < slen)
10507                /* copy tail [i:] */
10508                memcpy(res + rkind * ires,
10509                       sbuf + rkind * i,
10510                       rkind * (slen-i));
10511        }
10512        else {
10513            /* interleave */
10514            while (n > 0) {
10515                memcpy(res + rkind * ires,
10516                       buf2,
10517                       rkind * len2);
10518                ires += len2;
10519                if (--n <= 0)
10520                    break;
10521                memcpy(res + rkind * ires,
10522                       sbuf + rkind * i,
10523                       rkind);
10524                ires++;
10525                i++;
10526            }
10527            memcpy(res + rkind * ires,
10528                   sbuf + rkind * i,
10529                   rkind * (slen-i));
10530        }
10531    }
10532
10533    if (mayshrink) {
10534        unicode_adjust_maxchar(&u);
10535        if (u == NULL)
10536            goto error;
10537    }
10538
10539  done:
10540    if (srelease)
10541        PyMem_FREE(sbuf);
10542    if (release1)
10543        PyMem_FREE(buf1);
10544    if (release2)
10545        PyMem_FREE(buf2);
10546    assert(_PyUnicode_CheckConsistency(u, 1));
10547    return u;
10548
10549  nothing:
10550    /* nothing to replace; return original string (when possible) */
10551    if (srelease)
10552        PyMem_FREE(sbuf);
10553    if (release1)
10554        PyMem_FREE(buf1);
10555    if (release2)
10556        PyMem_FREE(buf2);
10557    return unicode_result_unchanged(self);
10558
10559  error:
10560    if (srelease && sbuf)
10561        PyMem_FREE(sbuf);
10562    if (release1 && buf1)
10563        PyMem_FREE(buf1);
10564    if (release2 && buf2)
10565        PyMem_FREE(buf2);
10566    return NULL;
10567}
10568
10569/* --- Unicode Object Methods --------------------------------------------- */
10570
10571PyDoc_STRVAR(title__doc__,
10572             "S.title() -> str\n\
10573\n\
10574Return a titlecased version of S, i.e. words start with title case\n\
10575characters, all remaining cased characters have lower case.");
10576
10577static PyObject*
10578unicode_title(PyObject *self)
10579{
10580    if (PyUnicode_READY(self) == -1)
10581        return NULL;
10582    return case_operation(self, do_title);
10583}
10584
10585PyDoc_STRVAR(capitalize__doc__,
10586             "S.capitalize() -> str\n\
10587\n\
10588Return a capitalized version of S, i.e. make the first character\n\
10589have upper case and the rest lower case.");
10590
10591static PyObject*
10592unicode_capitalize(PyObject *self)
10593{
10594    if (PyUnicode_READY(self) == -1)
10595        return NULL;
10596    if (PyUnicode_GET_LENGTH(self) == 0)
10597        return unicode_result_unchanged(self);
10598    return case_operation(self, do_capitalize);
10599}
10600
10601PyDoc_STRVAR(casefold__doc__,
10602             "S.casefold() -> str\n\
10603\n\
10604Return a version of S suitable for caseless comparisons.");
10605
10606static PyObject *
10607unicode_casefold(PyObject *self)
10608{
10609    if (PyUnicode_READY(self) == -1)
10610        return NULL;
10611    if (PyUnicode_IS_ASCII(self))
10612        return ascii_upper_or_lower(self, 1);
10613    return case_operation(self, do_casefold);
10614}
10615
10616
10617/* Argument converter.  Coerces to a single unicode character */
10618
10619static int
10620convert_uc(PyObject *obj, void *addr)
10621{
10622    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10623    PyObject *uniobj;
10624
10625    uniobj = PyUnicode_FromObject(obj);
10626    if (uniobj == NULL) {
10627        PyErr_SetString(PyExc_TypeError,
10628                        "The fill character cannot be converted to Unicode");
10629        return 0;
10630    }
10631    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10632        PyErr_SetString(PyExc_TypeError,
10633                        "The fill character must be exactly one character long");
10634        Py_DECREF(uniobj);
10635        return 0;
10636    }
10637    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10638    Py_DECREF(uniobj);
10639    return 1;
10640}
10641
10642PyDoc_STRVAR(center__doc__,
10643             "S.center(width[, fillchar]) -> str\n\
10644\n\
10645Return S centered in a string of length width. Padding is\n\
10646done using the specified fill character (default is a space)");
10647
10648static PyObject *
10649unicode_center(PyObject *self, PyObject *args)
10650{
10651    Py_ssize_t marg, left;
10652    Py_ssize_t width;
10653    Py_UCS4 fillchar = ' ';
10654
10655    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10656        return NULL;
10657
10658    if (PyUnicode_READY(self) == -1)
10659        return NULL;
10660
10661    if (PyUnicode_GET_LENGTH(self) >= width)
10662        return unicode_result_unchanged(self);
10663
10664    marg = width - PyUnicode_GET_LENGTH(self);
10665    left = marg / 2 + (marg & width & 1);
10666
10667    return pad(self, left, marg - left, fillchar);
10668}
10669
10670/* This function assumes that str1 and str2 are readied by the caller. */
10671
10672static int
10673unicode_compare(PyObject *str1, PyObject *str2)
10674{
10675#define COMPARE(TYPE1, TYPE2) \
10676    do { \
10677        TYPE1* p1 = (TYPE1 *)data1; \
10678        TYPE2* p2 = (TYPE2 *)data2; \
10679        TYPE1* end = p1 + len; \
10680        Py_UCS4 c1, c2; \
10681        for (; p1 != end; p1++, p2++) { \
10682            c1 = *p1; \
10683            c2 = *p2; \
10684            if (c1 != c2) \
10685                return (c1 < c2) ? -1 : 1; \
10686        } \
10687    } \
10688    while (0)
10689
10690    int kind1, kind2;
10691    void *data1, *data2;
10692    Py_ssize_t len1, len2, len;
10693
10694    kind1 = PyUnicode_KIND(str1);
10695    kind2 = PyUnicode_KIND(str2);
10696    data1 = PyUnicode_DATA(str1);
10697    data2 = PyUnicode_DATA(str2);
10698    len1 = PyUnicode_GET_LENGTH(str1);
10699    len2 = PyUnicode_GET_LENGTH(str2);
10700    len = Py_MIN(len1, len2);
10701
10702    switch(kind1) {
10703    case PyUnicode_1BYTE_KIND:
10704    {
10705        switch(kind2) {
10706        case PyUnicode_1BYTE_KIND:
10707        {
10708            int cmp = memcmp(data1, data2, len);
10709            /* normalize result of memcmp() into the range [-1; 1] */
10710            if (cmp < 0)
10711                return -1;
10712            if (cmp > 0)
10713                return 1;
10714            break;
10715        }
10716        case PyUnicode_2BYTE_KIND:
10717            COMPARE(Py_UCS1, Py_UCS2);
10718            break;
10719        case PyUnicode_4BYTE_KIND:
10720            COMPARE(Py_UCS1, Py_UCS4);
10721            break;
10722        default:
10723            assert(0);
10724        }
10725        break;
10726    }
10727    case PyUnicode_2BYTE_KIND:
10728    {
10729        switch(kind2) {
10730        case PyUnicode_1BYTE_KIND:
10731            COMPARE(Py_UCS2, Py_UCS1);
10732            break;
10733        case PyUnicode_2BYTE_KIND:
10734        {
10735            COMPARE(Py_UCS2, Py_UCS2);
10736            break;
10737        }
10738        case PyUnicode_4BYTE_KIND:
10739            COMPARE(Py_UCS2, Py_UCS4);
10740            break;
10741        default:
10742            assert(0);
10743        }
10744        break;
10745    }
10746    case PyUnicode_4BYTE_KIND:
10747    {
10748        switch(kind2) {
10749        case PyUnicode_1BYTE_KIND:
10750            COMPARE(Py_UCS4, Py_UCS1);
10751            break;
10752        case PyUnicode_2BYTE_KIND:
10753            COMPARE(Py_UCS4, Py_UCS2);
10754            break;
10755        case PyUnicode_4BYTE_KIND:
10756        {
10757#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10758            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10759            /* normalize result of wmemcmp() into the range [-1; 1] */
10760            if (cmp < 0)
10761                return -1;
10762            if (cmp > 0)
10763                return 1;
10764#else
10765            COMPARE(Py_UCS4, Py_UCS4);
10766#endif
10767            break;
10768        }
10769        default:
10770            assert(0);
10771        }
10772        break;
10773    }
10774    default:
10775        assert(0);
10776    }
10777
10778    if (len1 == len2)
10779        return 0;
10780    if (len1 < len2)
10781        return -1;
10782    else
10783        return 1;
10784
10785#undef COMPARE
10786}
10787
10788Py_LOCAL(int)
10789unicode_compare_eq(PyObject *str1, PyObject *str2)
10790{
10791    int kind;
10792    void *data1, *data2;
10793    Py_ssize_t len;
10794    int cmp;
10795
10796    len = PyUnicode_GET_LENGTH(str1);
10797    if (PyUnicode_GET_LENGTH(str2) != len)
10798        return 0;
10799    kind = PyUnicode_KIND(str1);
10800    if (PyUnicode_KIND(str2) != kind)
10801        return 0;
10802    data1 = PyUnicode_DATA(str1);
10803    data2 = PyUnicode_DATA(str2);
10804
10805    cmp = memcmp(data1, data2, len * kind);
10806    return (cmp == 0);
10807}
10808
10809
10810int
10811PyUnicode_Compare(PyObject *left, PyObject *right)
10812{
10813    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10814        if (PyUnicode_READY(left) == -1 ||
10815            PyUnicode_READY(right) == -1)
10816            return -1;
10817
10818        /* a string is equal to itself */
10819        if (left == right)
10820            return 0;
10821
10822        return unicode_compare(left, right);
10823    }
10824    PyErr_Format(PyExc_TypeError,
10825                 "Can't compare %.100s and %.100s",
10826                 left->ob_type->tp_name,
10827                 right->ob_type->tp_name);
10828    return -1;
10829}
10830
10831int
10832_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10833{
10834    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10835    if (right_str == NULL)
10836        return -1;
10837    return PyUnicode_Compare(left, right_str);
10838}
10839
10840int
10841PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10842{
10843    Py_ssize_t i;
10844    int kind;
10845    Py_UCS4 chr;
10846
10847    assert(_PyUnicode_CHECK(uni));
10848    if (PyUnicode_READY(uni) == -1)
10849        return -1;
10850    kind = PyUnicode_KIND(uni);
10851    if (kind == PyUnicode_1BYTE_KIND) {
10852        const void *data = PyUnicode_1BYTE_DATA(uni);
10853        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10854        size_t len, len2 = strlen(str);
10855        int cmp;
10856
10857        len = Py_MIN(len1, len2);
10858        cmp = memcmp(data, str, len);
10859        if (cmp != 0) {
10860            if (cmp < 0)
10861                return -1;
10862            else
10863                return 1;
10864        }
10865        if (len1 > len2)
10866            return 1; /* uni is longer */
10867        if (len1 < len2)
10868            return -1; /* str is longer */
10869        return 0;
10870    }
10871    else {
10872        void *data = PyUnicode_DATA(uni);
10873        /* Compare Unicode string and source character set string */
10874        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10875            if (chr != (unsigned char)str[i])
10876                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10877        /* This check keeps Python strings that end in '\0' from comparing equal
10878         to C strings identical up to that point. */
10879        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10880            return 1; /* uni is longer */
10881        if (str[i])
10882            return -1; /* str is longer */
10883        return 0;
10884    }
10885}
10886
10887
10888#define TEST_COND(cond)                         \
10889    ((cond) ? Py_True : Py_False)
10890
10891PyObject *
10892PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10893{
10894    int result;
10895    PyObject *v;
10896
10897    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10898        Py_RETURN_NOTIMPLEMENTED;
10899
10900    if (PyUnicode_READY(left) == -1 ||
10901        PyUnicode_READY(right) == -1)
10902        return NULL;
10903
10904    if (left == right) {
10905        switch (op) {
10906        case Py_EQ:
10907        case Py_LE:
10908        case Py_GE:
10909            /* a string is equal to itself */
10910            v = Py_True;
10911            break;
10912        case Py_NE:
10913        case Py_LT:
10914        case Py_GT:
10915            v = Py_False;
10916            break;
10917        default:
10918            PyErr_BadArgument();
10919            return NULL;
10920        }
10921    }
10922    else if (op == Py_EQ || op == Py_NE) {
10923        result = unicode_compare_eq(left, right);
10924        result ^= (op == Py_NE);
10925        v = TEST_COND(result);
10926    }
10927    else {
10928        result = unicode_compare(left, right);
10929
10930        /* Convert the return value to a Boolean */
10931        switch (op) {
10932        case Py_LE:
10933            v = TEST_COND(result <= 0);
10934            break;
10935        case Py_GE:
10936            v = TEST_COND(result >= 0);
10937            break;
10938        case Py_LT:
10939            v = TEST_COND(result == -1);
10940            break;
10941        case Py_GT:
10942            v = TEST_COND(result == 1);
10943            break;
10944        default:
10945            PyErr_BadArgument();
10946            return NULL;
10947        }
10948    }
10949    Py_INCREF(v);
10950    return v;
10951}
10952
10953int
10954_PyUnicode_EQ(PyObject *aa, PyObject *bb)
10955{
10956    return unicode_eq(aa, bb);
10957}
10958
10959int
10960PyUnicode_Contains(PyObject *container, PyObject *element)
10961{
10962    PyObject *str, *sub;
10963    int kind1, kind2;
10964    void *buf1, *buf2;
10965    Py_ssize_t len1, len2;
10966    int result;
10967
10968    /* Coerce the two arguments */
10969    sub = PyUnicode_FromObject(element);
10970    if (!sub) {
10971        PyErr_Format(PyExc_TypeError,
10972                     "'in <string>' requires string as left operand, not %s",
10973                     element->ob_type->tp_name);
10974        return -1;
10975    }
10976
10977    str = PyUnicode_FromObject(container);
10978    if (!str) {
10979        Py_DECREF(sub);
10980        return -1;
10981    }
10982
10983    kind1 = PyUnicode_KIND(str);
10984    kind2 = PyUnicode_KIND(sub);
10985    if (kind1 < kind2) {
10986        Py_DECREF(sub);
10987        Py_DECREF(str);
10988        return 0;
10989    }
10990    len1 = PyUnicode_GET_LENGTH(str);
10991    len2 = PyUnicode_GET_LENGTH(sub);
10992    if (len1 < len2) {
10993        Py_DECREF(sub);
10994        Py_DECREF(str);
10995        return 0;
10996    }
10997    buf1 = PyUnicode_DATA(str);
10998    buf2 = PyUnicode_DATA(sub);
10999    if (len2 == 1) {
11000        Py_UCS4 ch = PyUnicode_READ(kind2, buf2, 0);
11001        result = findchar((const char *)buf1, kind1, len1, ch, 1) != -1;
11002        Py_DECREF(sub);
11003        Py_DECREF(str);
11004        return result;
11005    }
11006    if (kind2 != kind1) {
11007        buf2 = _PyUnicode_AsKind(sub, kind1);
11008        if (!buf2) {
11009            Py_DECREF(sub);
11010            Py_DECREF(str);
11011            return -1;
11012        }
11013    }
11014
11015    switch (kind1) {
11016    case PyUnicode_1BYTE_KIND:
11017        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
11018        break;
11019    case PyUnicode_2BYTE_KIND:
11020        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
11021        break;
11022    case PyUnicode_4BYTE_KIND:
11023        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11024        break;
11025    default:
11026        result = -1;
11027        assert(0);
11028    }
11029
11030    Py_DECREF(str);
11031    Py_DECREF(sub);
11032
11033    if (kind2 != kind1)
11034        PyMem_Free(buf2);
11035
11036    return result;
11037}
11038
11039/* Concat to string or Unicode object giving a new Unicode object. */
11040
11041PyObject *
11042PyUnicode_Concat(PyObject *left, PyObject *right)
11043{
11044    PyObject *u = NULL, *v = NULL, *w;
11045    Py_UCS4 maxchar, maxchar2;
11046    Py_ssize_t u_len, v_len, new_len;
11047
11048    /* Coerce the two arguments */
11049    u = PyUnicode_FromObject(left);
11050    if (u == NULL)
11051        goto onError;
11052    v = PyUnicode_FromObject(right);
11053    if (v == NULL)
11054        goto onError;
11055
11056    /* Shortcuts */
11057    if (v == unicode_empty) {
11058        Py_DECREF(v);
11059        return u;
11060    }
11061    if (u == unicode_empty) {
11062        Py_DECREF(u);
11063        return v;
11064    }
11065
11066    u_len = PyUnicode_GET_LENGTH(u);
11067    v_len = PyUnicode_GET_LENGTH(v);
11068    if (u_len > PY_SSIZE_T_MAX - v_len) {
11069        PyErr_SetString(PyExc_OverflowError,
11070                        "strings are too large to concat");
11071        goto onError;
11072    }
11073    new_len = u_len + v_len;
11074
11075    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11076    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11077    maxchar = Py_MAX(maxchar, maxchar2);
11078
11079    /* Concat the two Unicode strings */
11080    w = PyUnicode_New(new_len, maxchar);
11081    if (w == NULL)
11082        goto onError;
11083    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11084    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11085    Py_DECREF(u);
11086    Py_DECREF(v);
11087    assert(_PyUnicode_CheckConsistency(w, 1));
11088    return w;
11089
11090  onError:
11091    Py_XDECREF(u);
11092    Py_XDECREF(v);
11093    return NULL;
11094}
11095
11096void
11097PyUnicode_Append(PyObject **p_left, PyObject *right)
11098{
11099    PyObject *left, *res;
11100    Py_UCS4 maxchar, maxchar2;
11101    Py_ssize_t left_len, right_len, new_len;
11102
11103    if (p_left == NULL) {
11104        if (!PyErr_Occurred())
11105            PyErr_BadInternalCall();
11106        return;
11107    }
11108    left = *p_left;
11109    if (right == NULL || left == NULL
11110        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11111        if (!PyErr_Occurred())
11112            PyErr_BadInternalCall();
11113        goto error;
11114    }
11115
11116    if (PyUnicode_READY(left) == -1)
11117        goto error;
11118    if (PyUnicode_READY(right) == -1)
11119        goto error;
11120
11121    /* Shortcuts */
11122    if (left == unicode_empty) {
11123        Py_DECREF(left);
11124        Py_INCREF(right);
11125        *p_left = right;
11126        return;
11127    }
11128    if (right == unicode_empty)
11129        return;
11130
11131    left_len = PyUnicode_GET_LENGTH(left);
11132    right_len = PyUnicode_GET_LENGTH(right);
11133    if (left_len > PY_SSIZE_T_MAX - right_len) {
11134        PyErr_SetString(PyExc_OverflowError,
11135                        "strings are too large to concat");
11136        goto error;
11137    }
11138    new_len = left_len + right_len;
11139
11140    if (unicode_modifiable(left)
11141        && PyUnicode_CheckExact(right)
11142        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11143        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11144           to change the structure size, but characters are stored just after
11145           the structure, and so it requires to move all characters which is
11146           not so different than duplicating the string. */
11147        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11148    {
11149        /* append inplace */
11150        if (unicode_resize(p_left, new_len) != 0)
11151            goto error;
11152
11153        /* copy 'right' into the newly allocated area of 'left' */
11154        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11155    }
11156    else {
11157        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11158        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11159        maxchar = Py_MAX(maxchar, maxchar2);
11160
11161        /* Concat the two Unicode strings */
11162        res = PyUnicode_New(new_len, maxchar);
11163        if (res == NULL)
11164            goto error;
11165        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11166        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11167        Py_DECREF(left);
11168        *p_left = res;
11169    }
11170    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11171    return;
11172
11173error:
11174    Py_CLEAR(*p_left);
11175}
11176
11177void
11178PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11179{
11180    PyUnicode_Append(pleft, right);
11181    Py_XDECREF(right);
11182}
11183
11184PyDoc_STRVAR(count__doc__,
11185             "S.count(sub[, start[, end]]) -> int\n\
11186\n\
11187Return the number of non-overlapping occurrences of substring sub in\n\
11188string S[start:end].  Optional arguments start and end are\n\
11189interpreted as in slice notation.");
11190
11191static PyObject *
11192unicode_count(PyObject *self, PyObject *args)
11193{
11194    PyObject *substring = NULL;   /* initialize to fix a compiler warning */
11195    Py_ssize_t start = 0;
11196    Py_ssize_t end = PY_SSIZE_T_MAX;
11197    PyObject *result;
11198    int kind1, kind2;
11199    void *buf1, *buf2;
11200    Py_ssize_t len1, len2, iresult;
11201
11202    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11203                                            &start, &end))
11204        return NULL;
11205
11206    kind1 = PyUnicode_KIND(self);
11207    kind2 = PyUnicode_KIND(substring);
11208    if (kind1 < kind2) {
11209        Py_DECREF(substring);
11210        return PyLong_FromLong(0);
11211    }
11212    len1 = PyUnicode_GET_LENGTH(self);
11213    len2 = PyUnicode_GET_LENGTH(substring);
11214    ADJUST_INDICES(start, end, len1);
11215    if (end - start < len2) {
11216        Py_DECREF(substring);
11217        return PyLong_FromLong(0);
11218    }
11219    buf1 = PyUnicode_DATA(self);
11220    buf2 = PyUnicode_DATA(substring);
11221    if (kind2 != kind1) {
11222        buf2 = _PyUnicode_AsKind(substring, kind1);
11223        if (!buf2) {
11224            Py_DECREF(substring);
11225            return NULL;
11226        }
11227    }
11228    switch (kind1) {
11229    case PyUnicode_1BYTE_KIND:
11230        iresult = ucs1lib_count(
11231            ((Py_UCS1*)buf1) + start, end - start,
11232            buf2, len2, PY_SSIZE_T_MAX
11233            );
11234        break;
11235    case PyUnicode_2BYTE_KIND:
11236        iresult = ucs2lib_count(
11237            ((Py_UCS2*)buf1) + start, end - start,
11238            buf2, len2, PY_SSIZE_T_MAX
11239            );
11240        break;
11241    case PyUnicode_4BYTE_KIND:
11242        iresult = ucs4lib_count(
11243            ((Py_UCS4*)buf1) + start, end - start,
11244            buf2, len2, PY_SSIZE_T_MAX
11245            );
11246        break;
11247    default:
11248        assert(0); iresult = 0;
11249    }
11250
11251    result = PyLong_FromSsize_t(iresult);
11252
11253    if (kind2 != kind1)
11254        PyMem_Free(buf2);
11255
11256    Py_DECREF(substring);
11257
11258    return result;
11259}
11260
11261PyDoc_STRVAR(encode__doc__,
11262             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11263\n\
11264Encode S using the codec registered for encoding. Default encoding\n\
11265is 'utf-8'. errors may be given to set a different error\n\
11266handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11267a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11268'xmlcharrefreplace' as well as any other name registered with\n\
11269codecs.register_error that can handle UnicodeEncodeErrors.");
11270
11271static PyObject *
11272unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11273{
11274    static char *kwlist[] = {"encoding", "errors", 0};
11275    char *encoding = NULL;
11276    char *errors = NULL;
11277
11278    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11279                                     kwlist, &encoding, &errors))
11280        return NULL;
11281    return PyUnicode_AsEncodedString(self, encoding, errors);
11282}
11283
11284PyDoc_STRVAR(expandtabs__doc__,
11285             "S.expandtabs(tabsize=8) -> str\n\
11286\n\
11287Return a copy of S where all tab characters are expanded using spaces.\n\
11288If tabsize is not given, a tab size of 8 characters is assumed.");
11289
11290static PyObject*
11291unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11292{
11293    Py_ssize_t i, j, line_pos, src_len, incr;
11294    Py_UCS4 ch;
11295    PyObject *u;
11296    void *src_data, *dest_data;
11297    static char *kwlist[] = {"tabsize", 0};
11298    int tabsize = 8;
11299    int kind;
11300    int found;
11301
11302    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11303                                     kwlist, &tabsize))
11304        return NULL;
11305
11306    if (PyUnicode_READY(self) == -1)
11307        return NULL;
11308
11309    /* First pass: determine size of output string */
11310    src_len = PyUnicode_GET_LENGTH(self);
11311    i = j = line_pos = 0;
11312    kind = PyUnicode_KIND(self);
11313    src_data = PyUnicode_DATA(self);
11314    found = 0;
11315    for (; i < src_len; i++) {
11316        ch = PyUnicode_READ(kind, src_data, i);
11317        if (ch == '\t') {
11318            found = 1;
11319            if (tabsize > 0) {
11320                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11321                if (j > PY_SSIZE_T_MAX - incr)
11322                    goto overflow;
11323                line_pos += incr;
11324                j += incr;
11325            }
11326        }
11327        else {
11328            if (j > PY_SSIZE_T_MAX - 1)
11329                goto overflow;
11330            line_pos++;
11331            j++;
11332            if (ch == '\n' || ch == '\r')
11333                line_pos = 0;
11334        }
11335    }
11336    if (!found)
11337        return unicode_result_unchanged(self);
11338
11339    /* Second pass: create output string and fill it */
11340    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11341    if (!u)
11342        return NULL;
11343    dest_data = PyUnicode_DATA(u);
11344
11345    i = j = line_pos = 0;
11346
11347    for (; i < src_len; i++) {
11348        ch = PyUnicode_READ(kind, src_data, i);
11349        if (ch == '\t') {
11350            if (tabsize > 0) {
11351                incr = tabsize - (line_pos % tabsize);
11352                line_pos += incr;
11353                FILL(kind, dest_data, ' ', j, incr);
11354                j += incr;
11355            }
11356        }
11357        else {
11358            line_pos++;
11359            PyUnicode_WRITE(kind, dest_data, j, ch);
11360            j++;
11361            if (ch == '\n' || ch == '\r')
11362                line_pos = 0;
11363        }
11364    }
11365    assert (j == PyUnicode_GET_LENGTH(u));
11366    return unicode_result(u);
11367
11368  overflow:
11369    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11370    return NULL;
11371}
11372
11373PyDoc_STRVAR(find__doc__,
11374             "S.find(sub[, start[, end]]) -> int\n\
11375\n\
11376Return the lowest index in S where substring sub is found,\n\
11377such that sub is contained within S[start:end].  Optional\n\
11378arguments start and end are interpreted as in slice notation.\n\
11379\n\
11380Return -1 on failure.");
11381
11382static PyObject *
11383unicode_find(PyObject *self, PyObject *args)
11384{
11385    /* initialize variables to prevent gcc warning */
11386    PyObject *substring = NULL;
11387    Py_ssize_t start = 0;
11388    Py_ssize_t end = 0;
11389    Py_ssize_t result;
11390
11391    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11392                                            &start, &end))
11393        return NULL;
11394
11395    if (PyUnicode_READY(self) == -1) {
11396        Py_DECREF(substring);
11397        return NULL;
11398    }
11399    if (PyUnicode_READY(substring) == -1) {
11400        Py_DECREF(substring);
11401        return NULL;
11402    }
11403
11404    result = any_find_slice(1, self, substring, start, end);
11405
11406    Py_DECREF(substring);
11407
11408    if (result == -2)
11409        return NULL;
11410
11411    return PyLong_FromSsize_t(result);
11412}
11413
11414static PyObject *
11415unicode_getitem(PyObject *self, Py_ssize_t index)
11416{
11417    void *data;
11418    enum PyUnicode_Kind kind;
11419    Py_UCS4 ch;
11420
11421    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11422        PyErr_BadArgument();
11423        return NULL;
11424    }
11425    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11426        PyErr_SetString(PyExc_IndexError, "string index out of range");
11427        return NULL;
11428    }
11429    kind = PyUnicode_KIND(self);
11430    data = PyUnicode_DATA(self);
11431    ch = PyUnicode_READ(kind, data, index);
11432    return unicode_char(ch);
11433}
11434
11435/* Believe it or not, this produces the same value for ASCII strings
11436   as bytes_hash(). */
11437static Py_hash_t
11438unicode_hash(PyObject *self)
11439{
11440    Py_ssize_t len;
11441    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11442
11443#ifdef Py_DEBUG
11444    assert(_Py_HashSecret_Initialized);
11445#endif
11446    if (_PyUnicode_HASH(self) != -1)
11447        return _PyUnicode_HASH(self);
11448    if (PyUnicode_READY(self) == -1)
11449        return -1;
11450    len = PyUnicode_GET_LENGTH(self);
11451    /*
11452      We make the hash of the empty string be 0, rather than using
11453      (prefix ^ suffix), since this slightly obfuscates the hash secret
11454    */
11455    if (len == 0) {
11456        _PyUnicode_HASH(self) = 0;
11457        return 0;
11458    }
11459    x = _Py_HashBytes(PyUnicode_DATA(self),
11460                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11461    _PyUnicode_HASH(self) = x;
11462    return x;
11463}
11464
11465PyDoc_STRVAR(index__doc__,
11466             "S.index(sub[, start[, end]]) -> int\n\
11467\n\
11468Like S.find() but raise ValueError when the substring is not found.");
11469
11470static PyObject *
11471unicode_index(PyObject *self, PyObject *args)
11472{
11473    /* initialize variables to prevent gcc warning */
11474    Py_ssize_t result;
11475    PyObject *substring = NULL;
11476    Py_ssize_t start = 0;
11477    Py_ssize_t end = 0;
11478
11479    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11480                                            &start, &end))
11481        return NULL;
11482
11483    if (PyUnicode_READY(self) == -1) {
11484        Py_DECREF(substring);
11485        return NULL;
11486    }
11487    if (PyUnicode_READY(substring) == -1) {
11488        Py_DECREF(substring);
11489        return NULL;
11490    }
11491
11492    result = any_find_slice(1, self, substring, start, end);
11493
11494    Py_DECREF(substring);
11495
11496    if (result == -2)
11497        return NULL;
11498
11499    if (result < 0) {
11500        PyErr_SetString(PyExc_ValueError, "substring not found");
11501        return NULL;
11502    }
11503
11504    return PyLong_FromSsize_t(result);
11505}
11506
11507PyDoc_STRVAR(islower__doc__,
11508             "S.islower() -> bool\n\
11509\n\
11510Return True if all cased characters in S are lowercase and there is\n\
11511at least one cased character in S, False otherwise.");
11512
11513static PyObject*
11514unicode_islower(PyObject *self)
11515{
11516    Py_ssize_t i, length;
11517    int kind;
11518    void *data;
11519    int cased;
11520
11521    if (PyUnicode_READY(self) == -1)
11522        return NULL;
11523    length = PyUnicode_GET_LENGTH(self);
11524    kind = PyUnicode_KIND(self);
11525    data = PyUnicode_DATA(self);
11526
11527    /* Shortcut for single character strings */
11528    if (length == 1)
11529        return PyBool_FromLong(
11530            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11531
11532    /* Special case for empty strings */
11533    if (length == 0)
11534        return PyBool_FromLong(0);
11535
11536    cased = 0;
11537    for (i = 0; i < length; i++) {
11538        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11539
11540        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11541            return PyBool_FromLong(0);
11542        else if (!cased && Py_UNICODE_ISLOWER(ch))
11543            cased = 1;
11544    }
11545    return PyBool_FromLong(cased);
11546}
11547
11548PyDoc_STRVAR(isupper__doc__,
11549             "S.isupper() -> bool\n\
11550\n\
11551Return True if all cased characters in S are uppercase and there is\n\
11552at least one cased character in S, False otherwise.");
11553
11554static PyObject*
11555unicode_isupper(PyObject *self)
11556{
11557    Py_ssize_t i, length;
11558    int kind;
11559    void *data;
11560    int cased;
11561
11562    if (PyUnicode_READY(self) == -1)
11563        return NULL;
11564    length = PyUnicode_GET_LENGTH(self);
11565    kind = PyUnicode_KIND(self);
11566    data = PyUnicode_DATA(self);
11567
11568    /* Shortcut for single character strings */
11569    if (length == 1)
11570        return PyBool_FromLong(
11571            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11572
11573    /* Special case for empty strings */
11574    if (length == 0)
11575        return PyBool_FromLong(0);
11576
11577    cased = 0;
11578    for (i = 0; i < length; i++) {
11579        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11580
11581        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11582            return PyBool_FromLong(0);
11583        else if (!cased && Py_UNICODE_ISUPPER(ch))
11584            cased = 1;
11585    }
11586    return PyBool_FromLong(cased);
11587}
11588
11589PyDoc_STRVAR(istitle__doc__,
11590             "S.istitle() -> bool\n\
11591\n\
11592Return True if S is a titlecased string and there is at least one\n\
11593character in S, i.e. upper- and titlecase characters may only\n\
11594follow uncased characters and lowercase characters only cased ones.\n\
11595Return False otherwise.");
11596
11597static PyObject*
11598unicode_istitle(PyObject *self)
11599{
11600    Py_ssize_t i, length;
11601    int kind;
11602    void *data;
11603    int cased, previous_is_cased;
11604
11605    if (PyUnicode_READY(self) == -1)
11606        return NULL;
11607    length = PyUnicode_GET_LENGTH(self);
11608    kind = PyUnicode_KIND(self);
11609    data = PyUnicode_DATA(self);
11610
11611    /* Shortcut for single character strings */
11612    if (length == 1) {
11613        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11614        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11615                               (Py_UNICODE_ISUPPER(ch) != 0));
11616    }
11617
11618    /* Special case for empty strings */
11619    if (length == 0)
11620        return PyBool_FromLong(0);
11621
11622    cased = 0;
11623    previous_is_cased = 0;
11624    for (i = 0; i < length; i++) {
11625        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11626
11627        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11628            if (previous_is_cased)
11629                return PyBool_FromLong(0);
11630            previous_is_cased = 1;
11631            cased = 1;
11632        }
11633        else if (Py_UNICODE_ISLOWER(ch)) {
11634            if (!previous_is_cased)
11635                return PyBool_FromLong(0);
11636            previous_is_cased = 1;
11637            cased = 1;
11638        }
11639        else
11640            previous_is_cased = 0;
11641    }
11642    return PyBool_FromLong(cased);
11643}
11644
11645PyDoc_STRVAR(isspace__doc__,
11646             "S.isspace() -> bool\n\
11647\n\
11648Return True if all characters in S are whitespace\n\
11649and there is at least one character in S, False otherwise.");
11650
11651static PyObject*
11652unicode_isspace(PyObject *self)
11653{
11654    Py_ssize_t i, length;
11655    int kind;
11656    void *data;
11657
11658    if (PyUnicode_READY(self) == -1)
11659        return NULL;
11660    length = PyUnicode_GET_LENGTH(self);
11661    kind = PyUnicode_KIND(self);
11662    data = PyUnicode_DATA(self);
11663
11664    /* Shortcut for single character strings */
11665    if (length == 1)
11666        return PyBool_FromLong(
11667            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11668
11669    /* Special case for empty strings */
11670    if (length == 0)
11671        return PyBool_FromLong(0);
11672
11673    for (i = 0; i < length; i++) {
11674        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11675        if (!Py_UNICODE_ISSPACE(ch))
11676            return PyBool_FromLong(0);
11677    }
11678    return PyBool_FromLong(1);
11679}
11680
11681PyDoc_STRVAR(isalpha__doc__,
11682             "S.isalpha() -> bool\n\
11683\n\
11684Return True if all characters in S are alphabetic\n\
11685and there is at least one character in S, False otherwise.");
11686
11687static PyObject*
11688unicode_isalpha(PyObject *self)
11689{
11690    Py_ssize_t i, length;
11691    int kind;
11692    void *data;
11693
11694    if (PyUnicode_READY(self) == -1)
11695        return NULL;
11696    length = PyUnicode_GET_LENGTH(self);
11697    kind = PyUnicode_KIND(self);
11698    data = PyUnicode_DATA(self);
11699
11700    /* Shortcut for single character strings */
11701    if (length == 1)
11702        return PyBool_FromLong(
11703            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11704
11705    /* Special case for empty strings */
11706    if (length == 0)
11707        return PyBool_FromLong(0);
11708
11709    for (i = 0; i < length; i++) {
11710        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11711            return PyBool_FromLong(0);
11712    }
11713    return PyBool_FromLong(1);
11714}
11715
11716PyDoc_STRVAR(isalnum__doc__,
11717             "S.isalnum() -> bool\n\
11718\n\
11719Return True if all characters in S are alphanumeric\n\
11720and there is at least one character in S, False otherwise.");
11721
11722static PyObject*
11723unicode_isalnum(PyObject *self)
11724{
11725    int kind;
11726    void *data;
11727    Py_ssize_t len, i;
11728
11729    if (PyUnicode_READY(self) == -1)
11730        return NULL;
11731
11732    kind = PyUnicode_KIND(self);
11733    data = PyUnicode_DATA(self);
11734    len = PyUnicode_GET_LENGTH(self);
11735
11736    /* Shortcut for single character strings */
11737    if (len == 1) {
11738        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11739        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11740    }
11741
11742    /* Special case for empty strings */
11743    if (len == 0)
11744        return PyBool_FromLong(0);
11745
11746    for (i = 0; i < len; i++) {
11747        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11748        if (!Py_UNICODE_ISALNUM(ch))
11749            return PyBool_FromLong(0);
11750    }
11751    return PyBool_FromLong(1);
11752}
11753
11754PyDoc_STRVAR(isdecimal__doc__,
11755             "S.isdecimal() -> bool\n\
11756\n\
11757Return True if there are only decimal characters in S,\n\
11758False otherwise.");
11759
11760static PyObject*
11761unicode_isdecimal(PyObject *self)
11762{
11763    Py_ssize_t i, length;
11764    int kind;
11765    void *data;
11766
11767    if (PyUnicode_READY(self) == -1)
11768        return NULL;
11769    length = PyUnicode_GET_LENGTH(self);
11770    kind = PyUnicode_KIND(self);
11771    data = PyUnicode_DATA(self);
11772
11773    /* Shortcut for single character strings */
11774    if (length == 1)
11775        return PyBool_FromLong(
11776            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11777
11778    /* Special case for empty strings */
11779    if (length == 0)
11780        return PyBool_FromLong(0);
11781
11782    for (i = 0; i < length; i++) {
11783        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11784            return PyBool_FromLong(0);
11785    }
11786    return PyBool_FromLong(1);
11787}
11788
11789PyDoc_STRVAR(isdigit__doc__,
11790             "S.isdigit() -> bool\n\
11791\n\
11792Return True if all characters in S are digits\n\
11793and there is at least one character in S, False otherwise.");
11794
11795static PyObject*
11796unicode_isdigit(PyObject *self)
11797{
11798    Py_ssize_t i, length;
11799    int kind;
11800    void *data;
11801
11802    if (PyUnicode_READY(self) == -1)
11803        return NULL;
11804    length = PyUnicode_GET_LENGTH(self);
11805    kind = PyUnicode_KIND(self);
11806    data = PyUnicode_DATA(self);
11807
11808    /* Shortcut for single character strings */
11809    if (length == 1) {
11810        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11811        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11812    }
11813
11814    /* Special case for empty strings */
11815    if (length == 0)
11816        return PyBool_FromLong(0);
11817
11818    for (i = 0; i < length; i++) {
11819        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11820            return PyBool_FromLong(0);
11821    }
11822    return PyBool_FromLong(1);
11823}
11824
11825PyDoc_STRVAR(isnumeric__doc__,
11826             "S.isnumeric() -> bool\n\
11827\n\
11828Return True if there are only numeric characters in S,\n\
11829False otherwise.");
11830
11831static PyObject*
11832unicode_isnumeric(PyObject *self)
11833{
11834    Py_ssize_t i, length;
11835    int kind;
11836    void *data;
11837
11838    if (PyUnicode_READY(self) == -1)
11839        return NULL;
11840    length = PyUnicode_GET_LENGTH(self);
11841    kind = PyUnicode_KIND(self);
11842    data = PyUnicode_DATA(self);
11843
11844    /* Shortcut for single character strings */
11845    if (length == 1)
11846        return PyBool_FromLong(
11847            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11848
11849    /* Special case for empty strings */
11850    if (length == 0)
11851        return PyBool_FromLong(0);
11852
11853    for (i = 0; i < length; i++) {
11854        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11855            return PyBool_FromLong(0);
11856    }
11857    return PyBool_FromLong(1);
11858}
11859
11860int
11861PyUnicode_IsIdentifier(PyObject *self)
11862{
11863    int kind;
11864    void *data;
11865    Py_ssize_t i;
11866    Py_UCS4 first;
11867
11868    if (PyUnicode_READY(self) == -1) {
11869        Py_FatalError("identifier not ready");
11870        return 0;
11871    }
11872
11873    /* Special case for empty strings */
11874    if (PyUnicode_GET_LENGTH(self) == 0)
11875        return 0;
11876    kind = PyUnicode_KIND(self);
11877    data = PyUnicode_DATA(self);
11878
11879    /* PEP 3131 says that the first character must be in
11880       XID_Start and subsequent characters in XID_Continue,
11881       and for the ASCII range, the 2.x rules apply (i.e
11882       start with letters and underscore, continue with
11883       letters, digits, underscore). However, given the current
11884       definition of XID_Start and XID_Continue, it is sufficient
11885       to check just for these, except that _ must be allowed
11886       as starting an identifier.  */
11887    first = PyUnicode_READ(kind, data, 0);
11888    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11889        return 0;
11890
11891    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11892        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11893            return 0;
11894    return 1;
11895}
11896
11897PyDoc_STRVAR(isidentifier__doc__,
11898             "S.isidentifier() -> bool\n\
11899\n\
11900Return True if S is a valid identifier according\n\
11901to the language definition.\n\
11902\n\
11903Use keyword.iskeyword() to test for reserved identifiers\n\
11904such as \"def\" and \"class\".\n");
11905
11906static PyObject*
11907unicode_isidentifier(PyObject *self)
11908{
11909    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11910}
11911
11912PyDoc_STRVAR(isprintable__doc__,
11913             "S.isprintable() -> bool\n\
11914\n\
11915Return True if all characters in S are considered\n\
11916printable in repr() or S is empty, False otherwise.");
11917
11918static PyObject*
11919unicode_isprintable(PyObject *self)
11920{
11921    Py_ssize_t i, length;
11922    int kind;
11923    void *data;
11924
11925    if (PyUnicode_READY(self) == -1)
11926        return NULL;
11927    length = PyUnicode_GET_LENGTH(self);
11928    kind = PyUnicode_KIND(self);
11929    data = PyUnicode_DATA(self);
11930
11931    /* Shortcut for single character strings */
11932    if (length == 1)
11933        return PyBool_FromLong(
11934            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11935
11936    for (i = 0; i < length; i++) {
11937        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11938            Py_RETURN_FALSE;
11939        }
11940    }
11941    Py_RETURN_TRUE;
11942}
11943
11944PyDoc_STRVAR(join__doc__,
11945             "S.join(iterable) -> str\n\
11946\n\
11947Return a string which is the concatenation of the strings in the\n\
11948iterable.  The separator between elements is S.");
11949
11950static PyObject*
11951unicode_join(PyObject *self, PyObject *data)
11952{
11953    return PyUnicode_Join(self, data);
11954}
11955
11956static Py_ssize_t
11957unicode_length(PyObject *self)
11958{
11959    if (PyUnicode_READY(self) == -1)
11960        return -1;
11961    return PyUnicode_GET_LENGTH(self);
11962}
11963
11964PyDoc_STRVAR(ljust__doc__,
11965             "S.ljust(width[, fillchar]) -> str\n\
11966\n\
11967Return S left-justified in a Unicode string of length width. Padding is\n\
11968done using the specified fill character (default is a space).");
11969
11970static PyObject *
11971unicode_ljust(PyObject *self, PyObject *args)
11972{
11973    Py_ssize_t width;
11974    Py_UCS4 fillchar = ' ';
11975
11976    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11977        return NULL;
11978
11979    if (PyUnicode_READY(self) == -1)
11980        return NULL;
11981
11982    if (PyUnicode_GET_LENGTH(self) >= width)
11983        return unicode_result_unchanged(self);
11984
11985    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11986}
11987
11988PyDoc_STRVAR(lower__doc__,
11989             "S.lower() -> str\n\
11990\n\
11991Return a copy of the string S converted to lowercase.");
11992
11993static PyObject*
11994unicode_lower(PyObject *self)
11995{
11996    if (PyUnicode_READY(self) == -1)
11997        return NULL;
11998    if (PyUnicode_IS_ASCII(self))
11999        return ascii_upper_or_lower(self, 1);
12000    return case_operation(self, do_lower);
12001}
12002
/* Strip modes used by the strip helpers below.  The values double as
   indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument-parsing format strings, indexed by the strip modes above */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   "|O:" (3 characters) skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
12011
12012/* externally visible for str.strip(unicode) */
12013PyObject *
12014_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
12015{
12016    void *data;
12017    int kind;
12018    Py_ssize_t i, j, len;
12019    BLOOM_MASK sepmask;
12020    Py_ssize_t seplen;
12021
12022    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
12023        return NULL;
12024
12025    kind = PyUnicode_KIND(self);
12026    data = PyUnicode_DATA(self);
12027    len = PyUnicode_GET_LENGTH(self);
12028    seplen = PyUnicode_GET_LENGTH(sepobj);
12029    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12030                              PyUnicode_DATA(sepobj),
12031                              seplen);
12032
12033    i = 0;
12034    if (striptype != RIGHTSTRIP) {
12035        while (i < len) {
12036            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12037            if (!BLOOM(sepmask, ch))
12038                break;
12039            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12040                break;
12041            i++;
12042        }
12043    }
12044
12045    j = len;
12046    if (striptype != LEFTSTRIP) {
12047        j--;
12048        while (j >= i) {
12049            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12050            if (!BLOOM(sepmask, ch))
12051                break;
12052            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12053                break;
12054            j--;
12055        }
12056
12057        j++;
12058    }
12059
12060    return PyUnicode_Substring(self, i, j);
12061}
12062
12063PyObject*
12064PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12065{
12066    unsigned char *data;
12067    int kind;
12068    Py_ssize_t length;
12069
12070    if (PyUnicode_READY(self) == -1)
12071        return NULL;
12072
12073    length = PyUnicode_GET_LENGTH(self);
12074    end = Py_MIN(end, length);
12075
12076    if (start == 0 && end == length)
12077        return unicode_result_unchanged(self);
12078
12079    if (start < 0 || end < 0) {
12080        PyErr_SetString(PyExc_IndexError, "string index out of range");
12081        return NULL;
12082    }
12083    if (start >= length || end < start)
12084        _Py_RETURN_UNICODE_EMPTY();
12085
12086    length = end - start;
12087    if (PyUnicode_IS_ASCII(self)) {
12088        data = PyUnicode_1BYTE_DATA(self);
12089        return _PyUnicode_FromASCII((char*)(data + start), length);
12090    }
12091    else {
12092        kind = PyUnicode_KIND(self);
12093        data = PyUnicode_1BYTE_DATA(self);
12094        return PyUnicode_FromKindAndData(kind,
12095                                         data + kind * start,
12096                                         length);
12097    }
12098}
12099
12100static PyObject *
12101do_strip(PyObject *self, int striptype)
12102{
12103    Py_ssize_t len, i, j;
12104
12105    if (PyUnicode_READY(self) == -1)
12106        return NULL;
12107
12108    len = PyUnicode_GET_LENGTH(self);
12109
12110    if (PyUnicode_IS_ASCII(self)) {
12111        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12112
12113        i = 0;
12114        if (striptype != RIGHTSTRIP) {
12115            while (i < len) {
12116                Py_UCS1 ch = data[i];
12117                if (!_Py_ascii_whitespace[ch])
12118                    break;
12119                i++;
12120            }
12121        }
12122
12123        j = len;
12124        if (striptype != LEFTSTRIP) {
12125            j--;
12126            while (j >= i) {
12127                Py_UCS1 ch = data[j];
12128                if (!_Py_ascii_whitespace[ch])
12129                    break;
12130                j--;
12131            }
12132            j++;
12133        }
12134    }
12135    else {
12136        int kind = PyUnicode_KIND(self);
12137        void *data = PyUnicode_DATA(self);
12138
12139        i = 0;
12140        if (striptype != RIGHTSTRIP) {
12141            while (i < len) {
12142                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12143                if (!Py_UNICODE_ISSPACE(ch))
12144                    break;
12145                i++;
12146            }
12147        }
12148
12149        j = len;
12150        if (striptype != LEFTSTRIP) {
12151            j--;
12152            while (j >= i) {
12153                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12154                if (!Py_UNICODE_ISSPACE(ch))
12155                    break;
12156                j--;
12157            }
12158            j++;
12159        }
12160    }
12161
12162    return PyUnicode_Substring(self, i, j);
12163}
12164
12165
12166static PyObject *
12167do_argstrip(PyObject *self, int striptype, PyObject *args)
12168{
12169    PyObject *sep = NULL;
12170
12171    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12172        return NULL;
12173
12174    if (sep != NULL && sep != Py_None) {
12175        if (PyUnicode_Check(sep))
12176            return _PyUnicode_XStrip(self, striptype, sep);
12177        else {
12178            PyErr_Format(PyExc_TypeError,
12179                         "%s arg must be None or str",
12180                         STRIPNAME(striptype));
12181            return NULL;
12182        }
12183    }
12184
12185    return do_strip(self, striptype);
12186}
12187
12188
12189PyDoc_STRVAR(strip__doc__,
12190             "S.strip([chars]) -> str\n\
12191\n\
12192Return a copy of the string S with leading and trailing\n\
12193whitespace removed.\n\
12194If chars is given and not None, remove characters in chars instead.");
12195
12196static PyObject *
12197unicode_strip(PyObject *self, PyObject *args)
12198{
12199    if (PyTuple_GET_SIZE(args) == 0)
12200        return do_strip(self, BOTHSTRIP); /* Common case */
12201    else
12202        return do_argstrip(self, BOTHSTRIP, args);
12203}
12204
12205
12206PyDoc_STRVAR(lstrip__doc__,
12207             "S.lstrip([chars]) -> str\n\
12208\n\
12209Return a copy of the string S with leading whitespace removed.\n\
12210If chars is given and not None, remove characters in chars instead.");
12211
12212static PyObject *
12213unicode_lstrip(PyObject *self, PyObject *args)
12214{
12215    if (PyTuple_GET_SIZE(args) == 0)
12216        return do_strip(self, LEFTSTRIP); /* Common case */
12217    else
12218        return do_argstrip(self, LEFTSTRIP, args);
12219}
12220
12221
12222PyDoc_STRVAR(rstrip__doc__,
12223             "S.rstrip([chars]) -> str\n\
12224\n\
12225Return a copy of the string S with trailing whitespace removed.\n\
12226If chars is given and not None, remove characters in chars instead.");
12227
12228static PyObject *
12229unicode_rstrip(PyObject *self, PyObject *args)
12230{
12231    if (PyTuple_GET_SIZE(args) == 0)
12232        return do_strip(self, RIGHTSTRIP); /* Common case */
12233    else
12234        return do_argstrip(self, RIGHTSTRIP, args);
12235}
12236
12237
12238static PyObject*
12239unicode_repeat(PyObject *str, Py_ssize_t len)
12240{
12241    PyObject *u;
12242    Py_ssize_t nchars, n;
12243
12244    if (len < 1)
12245        _Py_RETURN_UNICODE_EMPTY();
12246
12247    /* no repeat, return original string */
12248    if (len == 1)
12249        return unicode_result_unchanged(str);
12250
12251    if (PyUnicode_READY(str) == -1)
12252        return NULL;
12253
12254    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12255        PyErr_SetString(PyExc_OverflowError,
12256                        "repeated string is too long");
12257        return NULL;
12258    }
12259    nchars = len * PyUnicode_GET_LENGTH(str);
12260
12261    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12262    if (!u)
12263        return NULL;
12264    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12265
12266    if (PyUnicode_GET_LENGTH(str) == 1) {
12267        const int kind = PyUnicode_KIND(str);
12268        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12269        if (kind == PyUnicode_1BYTE_KIND) {
12270            void *to = PyUnicode_DATA(u);
12271            memset(to, (unsigned char)fill_char, len);
12272        }
12273        else if (kind == PyUnicode_2BYTE_KIND) {
12274            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12275            for (n = 0; n < len; ++n)
12276                ucs2[n] = fill_char;
12277        } else {
12278            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12279            assert(kind == PyUnicode_4BYTE_KIND);
12280            for (n = 0; n < len; ++n)
12281                ucs4[n] = fill_char;
12282        }
12283    }
12284    else {
12285        /* number of characters copied this far */
12286        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12287        const Py_ssize_t char_size = PyUnicode_KIND(str);
12288        char *to = (char *) PyUnicode_DATA(u);
12289        Py_MEMCPY(to, PyUnicode_DATA(str),
12290                  PyUnicode_GET_LENGTH(str) * char_size);
12291        while (done < nchars) {
12292            n = (done <= nchars-done) ? done : nchars-done;
12293            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12294            done += n;
12295        }
12296    }
12297
12298    assert(_PyUnicode_CheckConsistency(u, 1));
12299    return u;
12300}
12301
12302PyObject *
12303PyUnicode_Replace(PyObject *obj,
12304                  PyObject *subobj,
12305                  PyObject *replobj,
12306                  Py_ssize_t maxcount)
12307{
12308    PyObject *self;
12309    PyObject *str1;
12310    PyObject *str2;
12311    PyObject *result;
12312
12313    self = PyUnicode_FromObject(obj);
12314    if (self == NULL)
12315        return NULL;
12316    str1 = PyUnicode_FromObject(subobj);
12317    if (str1 == NULL) {
12318        Py_DECREF(self);
12319        return NULL;
12320    }
12321    str2 = PyUnicode_FromObject(replobj);
12322    if (str2 == NULL) {
12323        Py_DECREF(self);
12324        Py_DECREF(str1);
12325        return NULL;
12326    }
12327    if (PyUnicode_READY(self) == -1 ||
12328        PyUnicode_READY(str1) == -1 ||
12329        PyUnicode_READY(str2) == -1)
12330        result = NULL;
12331    else
12332        result = replace(self, str1, str2, maxcount);
12333    Py_DECREF(self);
12334    Py_DECREF(str1);
12335    Py_DECREF(str2);
12336    return result;
12337}
12338
12339PyDoc_STRVAR(replace__doc__,
12340             "S.replace(old, new[, count]) -> str\n\
12341\n\
12342Return a copy of S with all occurrences of substring\n\
12343old replaced by new.  If the optional argument count is\n\
12344given, only the first count occurrences are replaced.");
12345
12346static PyObject*
12347unicode_replace(PyObject *self, PyObject *args)
12348{
12349    PyObject *str1;
12350    PyObject *str2;
12351    Py_ssize_t maxcount = -1;
12352    PyObject *result;
12353
12354    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12355        return NULL;
12356    if (PyUnicode_READY(self) == -1)
12357        return NULL;
12358    str1 = PyUnicode_FromObject(str1);
12359    if (str1 == NULL)
12360        return NULL;
12361    str2 = PyUnicode_FromObject(str2);
12362    if (str2 == NULL) {
12363        Py_DECREF(str1);
12364        return NULL;
12365    }
12366    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12367        result = NULL;
12368    else
12369        result = replace(self, str1, str2, maxcount);
12370
12371    Py_DECREF(str1);
12372    Py_DECREF(str2);
12373    return result;
12374}
12375
12376static PyObject *
12377unicode_repr(PyObject *unicode)
12378{
12379    PyObject *repr;
12380    Py_ssize_t isize;
12381    Py_ssize_t osize, squote, dquote, i, o;
12382    Py_UCS4 max, quote;
12383    int ikind, okind, unchanged;
12384    void *idata, *odata;
12385
12386    if (PyUnicode_READY(unicode) == -1)
12387        return NULL;
12388
12389    isize = PyUnicode_GET_LENGTH(unicode);
12390    idata = PyUnicode_DATA(unicode);
12391
12392    /* Compute length of output, quote characters, and
12393       maximum character */
12394    osize = 0;
12395    max = 127;
12396    squote = dquote = 0;
12397    ikind = PyUnicode_KIND(unicode);
12398    for (i = 0; i < isize; i++) {
12399        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12400        Py_ssize_t incr = 1;
12401        switch (ch) {
12402        case '\'': squote++; break;
12403        case '"':  dquote++; break;
12404        case '\\': case '\t': case '\r': case '\n':
12405            incr = 2;
12406            break;
12407        default:
12408            /* Fast-path ASCII */
12409            if (ch < ' ' || ch == 0x7f)
12410                incr = 4; /* \xHH */
12411            else if (ch < 0x7f)
12412                ;
12413            else if (Py_UNICODE_ISPRINTABLE(ch))
12414                max = ch > max ? ch : max;
12415            else if (ch < 0x100)
12416                incr = 4; /* \xHH */
12417            else if (ch < 0x10000)
12418                incr = 6; /* \uHHHH */
12419            else
12420                incr = 10; /* \uHHHHHHHH */
12421        }
12422        if (osize > PY_SSIZE_T_MAX - incr) {
12423            PyErr_SetString(PyExc_OverflowError,
12424                            "string is too long to generate repr");
12425            return NULL;
12426        }
12427        osize += incr;
12428    }
12429
12430    quote = '\'';
12431    unchanged = (osize == isize);
12432    if (squote) {
12433        unchanged = 0;
12434        if (dquote)
12435            /* Both squote and dquote present. Use squote,
12436               and escape them */
12437            osize += squote;
12438        else
12439            quote = '"';
12440    }
12441    osize += 2;   /* quotes */
12442
12443    repr = PyUnicode_New(osize, max);
12444    if (repr == NULL)
12445        return NULL;
12446    okind = PyUnicode_KIND(repr);
12447    odata = PyUnicode_DATA(repr);
12448
12449    PyUnicode_WRITE(okind, odata, 0, quote);
12450    PyUnicode_WRITE(okind, odata, osize-1, quote);
12451    if (unchanged) {
12452        _PyUnicode_FastCopyCharacters(repr, 1,
12453                                      unicode, 0,
12454                                      isize);
12455    }
12456    else {
12457        for (i = 0, o = 1; i < isize; i++) {
12458            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12459
12460            /* Escape quotes and backslashes */
12461            if ((ch == quote) || (ch == '\\')) {
12462                PyUnicode_WRITE(okind, odata, o++, '\\');
12463                PyUnicode_WRITE(okind, odata, o++, ch);
12464                continue;
12465            }
12466
12467            /* Map special whitespace to '\t', \n', '\r' */
12468            if (ch == '\t') {
12469                PyUnicode_WRITE(okind, odata, o++, '\\');
12470                PyUnicode_WRITE(okind, odata, o++, 't');
12471            }
12472            else if (ch == '\n') {
12473                PyUnicode_WRITE(okind, odata, o++, '\\');
12474                PyUnicode_WRITE(okind, odata, o++, 'n');
12475            }
12476            else if (ch == '\r') {
12477                PyUnicode_WRITE(okind, odata, o++, '\\');
12478                PyUnicode_WRITE(okind, odata, o++, 'r');
12479            }
12480
12481            /* Map non-printable US ASCII to '\xhh' */
12482            else if (ch < ' ' || ch == 0x7F) {
12483                PyUnicode_WRITE(okind, odata, o++, '\\');
12484                PyUnicode_WRITE(okind, odata, o++, 'x');
12485                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12486                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12487            }
12488
12489            /* Copy ASCII characters as-is */
12490            else if (ch < 0x7F) {
12491                PyUnicode_WRITE(okind, odata, o++, ch);
12492            }
12493
12494            /* Non-ASCII characters */
12495            else {
12496                /* Map Unicode whitespace and control characters
12497                   (categories Z* and C* except ASCII space)
12498                */
12499                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12500                    PyUnicode_WRITE(okind, odata, o++, '\\');
12501                    /* Map 8-bit characters to '\xhh' */
12502                    if (ch <= 0xff) {
12503                        PyUnicode_WRITE(okind, odata, o++, 'x');
12504                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12505                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12506                    }
12507                    /* Map 16-bit characters to '\uxxxx' */
12508                    else if (ch <= 0xffff) {
12509                        PyUnicode_WRITE(okind, odata, o++, 'u');
12510                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12511                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12512                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12513                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12514                    }
12515                    /* Map 21-bit characters to '\U00xxxxxx' */
12516                    else {
12517                        PyUnicode_WRITE(okind, odata, o++, 'U');
12518                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12519                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12520                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12521                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12522                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12523                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12524                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12525                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12526                    }
12527                }
12528                /* Copy characters as-is */
12529                else {
12530                    PyUnicode_WRITE(okind, odata, o++, ch);
12531                }
12532            }
12533        }
12534    }
12535    /* Closing quote already added at the beginning */
12536    assert(_PyUnicode_CheckConsistency(repr, 1));
12537    return repr;
12538}
12539
12540PyDoc_STRVAR(rfind__doc__,
12541             "S.rfind(sub[, start[, end]]) -> int\n\
12542\n\
12543Return the highest index in S where substring sub is found,\n\
12544such that sub is contained within S[start:end].  Optional\n\
12545arguments start and end are interpreted as in slice notation.\n\
12546\n\
12547Return -1 on failure.");
12548
12549static PyObject *
12550unicode_rfind(PyObject *self, PyObject *args)
12551{
12552    /* initialize variables to prevent gcc warning */
12553    PyObject *substring = NULL;
12554    Py_ssize_t start = 0;
12555    Py_ssize_t end = 0;
12556    Py_ssize_t result;
12557
12558    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12559                                            &start, &end))
12560        return NULL;
12561
12562    if (PyUnicode_READY(self) == -1) {
12563        Py_DECREF(substring);
12564        return NULL;
12565    }
12566    if (PyUnicode_READY(substring) == -1) {
12567        Py_DECREF(substring);
12568        return NULL;
12569    }
12570
12571    result = any_find_slice(-1, self, substring, start, end);
12572
12573    Py_DECREF(substring);
12574
12575    if (result == -2)
12576        return NULL;
12577
12578    return PyLong_FromSsize_t(result);
12579}
12580
12581PyDoc_STRVAR(rindex__doc__,
12582             "S.rindex(sub[, start[, end]]) -> int\n\
12583\n\
12584Like S.rfind() but raise ValueError when the substring is not found.");
12585
12586static PyObject *
12587unicode_rindex(PyObject *self, PyObject *args)
12588{
12589    /* initialize variables to prevent gcc warning */
12590    PyObject *substring = NULL;
12591    Py_ssize_t start = 0;
12592    Py_ssize_t end = 0;
12593    Py_ssize_t result;
12594
12595    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12596                                            &start, &end))
12597        return NULL;
12598
12599    if (PyUnicode_READY(self) == -1) {
12600        Py_DECREF(substring);
12601        return NULL;
12602    }
12603    if (PyUnicode_READY(substring) == -1) {
12604        Py_DECREF(substring);
12605        return NULL;
12606    }
12607
12608    result = any_find_slice(-1, self, substring, start, end);
12609
12610    Py_DECREF(substring);
12611
12612    if (result == -2)
12613        return NULL;
12614
12615    if (result < 0) {
12616        PyErr_SetString(PyExc_ValueError, "substring not found");
12617        return NULL;
12618    }
12619
12620    return PyLong_FromSsize_t(result);
12621}
12622
12623PyDoc_STRVAR(rjust__doc__,
12624             "S.rjust(width[, fillchar]) -> str\n\
12625\n\
12626Return S right-justified in a string of length width. Padding is\n\
12627done using the specified fill character (default is a space).");
12628
12629static PyObject *
12630unicode_rjust(PyObject *self, PyObject *args)
12631{
12632    Py_ssize_t width;
12633    Py_UCS4 fillchar = ' ';
12634
12635    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12636        return NULL;
12637
12638    if (PyUnicode_READY(self) == -1)
12639        return NULL;
12640
12641    if (PyUnicode_GET_LENGTH(self) >= width)
12642        return unicode_result_unchanged(self);
12643
12644    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12645}
12646
12647PyObject *
12648PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12649{
12650    PyObject *result;
12651
12652    s = PyUnicode_FromObject(s);
12653    if (s == NULL)
12654        return NULL;
12655    if (sep != NULL) {
12656        sep = PyUnicode_FromObject(sep);
12657        if (sep == NULL) {
12658            Py_DECREF(s);
12659            return NULL;
12660        }
12661    }
12662
12663    result = split(s, sep, maxsplit);
12664
12665    Py_DECREF(s);
12666    Py_XDECREF(sep);
12667    return result;
12668}
12669
12670PyDoc_STRVAR(split__doc__,
12671             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12672\n\
12673Return a list of the words in S, using sep as the\n\
12674delimiter string.  If maxsplit is given, at most maxsplit\n\
12675splits are done. If sep is not specified or is None, any\n\
12676whitespace string is a separator and empty strings are\n\
12677removed from the result.");
12678
12679static PyObject*
12680unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12681{
12682    static char *kwlist[] = {"sep", "maxsplit", 0};
12683    PyObject *substring = Py_None;
12684    Py_ssize_t maxcount = -1;
12685
12686    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12687                                     kwlist, &substring, &maxcount))
12688        return NULL;
12689
12690    if (substring == Py_None)
12691        return split(self, NULL, maxcount);
12692    else if (PyUnicode_Check(substring))
12693        return split(self, substring, maxcount);
12694    else
12695        return PyUnicode_Split(self, substring, maxcount);
12696}
12697
12698PyObject *
12699PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12700{
12701    PyObject* str_obj;
12702    PyObject* sep_obj;
12703    PyObject* out;
12704    int kind1, kind2;
12705    void *buf1, *buf2;
12706    Py_ssize_t len1, len2;
12707
12708    str_obj = PyUnicode_FromObject(str_in);
12709    if (!str_obj)
12710        return NULL;
12711    sep_obj = PyUnicode_FromObject(sep_in);
12712    if (!sep_obj) {
12713        Py_DECREF(str_obj);
12714        return NULL;
12715    }
12716    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12717        Py_DECREF(sep_obj);
12718        Py_DECREF(str_obj);
12719        return NULL;
12720    }
12721
12722    kind1 = PyUnicode_KIND(str_obj);
12723    kind2 = PyUnicode_KIND(sep_obj);
12724    len1 = PyUnicode_GET_LENGTH(str_obj);
12725    len2 = PyUnicode_GET_LENGTH(sep_obj);
12726    if (kind1 < kind2 || len1 < len2) {
12727        _Py_INCREF_UNICODE_EMPTY();
12728        if (!unicode_empty)
12729            out = NULL;
12730        else {
12731            out = PyTuple_Pack(3, str_obj, unicode_empty, unicode_empty);
12732            Py_DECREF(unicode_empty);
12733        }
12734        Py_DECREF(sep_obj);
12735        Py_DECREF(str_obj);
12736        return out;
12737    }
12738    buf1 = PyUnicode_DATA(str_obj);
12739    buf2 = PyUnicode_DATA(sep_obj);
12740    if (kind2 != kind1) {
12741        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12742        if (!buf2)
12743            goto onError;
12744    }
12745
12746    switch (kind1) {
12747    case PyUnicode_1BYTE_KIND:
12748        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12749            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12750        else
12751            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12752        break;
12753    case PyUnicode_2BYTE_KIND:
12754        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12755        break;
12756    case PyUnicode_4BYTE_KIND:
12757        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12758        break;
12759    default:
12760        assert(0);
12761        out = 0;
12762    }
12763
12764    Py_DECREF(sep_obj);
12765    Py_DECREF(str_obj);
12766    if (kind2 != kind1)
12767        PyMem_Free(buf2);
12768
12769    return out;
12770  onError:
12771    Py_DECREF(sep_obj);
12772    Py_DECREF(str_obj);
12773    if (kind2 != kind1 && buf2)
12774        PyMem_Free(buf2);
12775    return NULL;
12776}
12777
12778
12779PyObject *
12780PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12781{
12782    PyObject* str_obj;
12783    PyObject* sep_obj;
12784    PyObject* out;
12785    int kind1, kind2;
12786    void *buf1, *buf2;
12787    Py_ssize_t len1, len2;
12788
12789    str_obj = PyUnicode_FromObject(str_in);
12790    if (!str_obj)
12791        return NULL;
12792    sep_obj = PyUnicode_FromObject(sep_in);
12793    if (!sep_obj) {
12794        Py_DECREF(str_obj);
12795        return NULL;
12796    }
12797
12798    kind1 = PyUnicode_KIND(str_obj);
12799    kind2 = PyUnicode_KIND(sep_obj);
12800    len1 = PyUnicode_GET_LENGTH(str_obj);
12801    len2 = PyUnicode_GET_LENGTH(sep_obj);
12802    if (kind1 < kind2 || len1 < len2) {
12803        _Py_INCREF_UNICODE_EMPTY();
12804        if (!unicode_empty)
12805            out = NULL;
12806        else {
12807            out = PyTuple_Pack(3, unicode_empty, unicode_empty, str_obj);
12808            Py_DECREF(unicode_empty);
12809        }
12810        Py_DECREF(sep_obj);
12811        Py_DECREF(str_obj);
12812        return out;
12813    }
12814    buf1 = PyUnicode_DATA(str_obj);
12815    buf2 = PyUnicode_DATA(sep_obj);
12816    if (kind2 != kind1) {
12817        buf2 = _PyUnicode_AsKind(sep_obj, kind1);
12818        if (!buf2)
12819            goto onError;
12820    }
12821
12822    switch (kind1) {
12823    case PyUnicode_1BYTE_KIND:
12824        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12825            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12826        else
12827            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12828        break;
12829    case PyUnicode_2BYTE_KIND:
12830        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12831        break;
12832    case PyUnicode_4BYTE_KIND:
12833        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12834        break;
12835    default:
12836        assert(0);
12837        out = 0;
12838    }
12839
12840    Py_DECREF(sep_obj);
12841    Py_DECREF(str_obj);
12842    if (kind2 != kind1)
12843        PyMem_Free(buf2);
12844
12845    return out;
12846  onError:
12847    Py_DECREF(sep_obj);
12848    Py_DECREF(str_obj);
12849    if (kind2 != kind1 && buf2)
12850        PyMem_Free(buf2);
12851    return NULL;
12852}
12853
12854PyDoc_STRVAR(partition__doc__,
12855             "S.partition(sep) -> (head, sep, tail)\n\
12856\n\
12857Search for the separator sep in S, and return the part before it,\n\
12858the separator itself, and the part after it.  If the separator is not\n\
12859found, return S and two empty strings.");
12860
12861static PyObject*
12862unicode_partition(PyObject *self, PyObject *separator)
12863{
12864    return PyUnicode_Partition(self, separator);
12865}
12866
12867PyDoc_STRVAR(rpartition__doc__,
12868             "S.rpartition(sep) -> (head, sep, tail)\n\
12869\n\
12870Search for the separator sep in S, starting at the end of S, and return\n\
12871the part before it, the separator itself, and the part after it.  If the\n\
12872separator is not found, return two empty strings and S.");
12873
12874static PyObject*
12875unicode_rpartition(PyObject *self, PyObject *separator)
12876{
12877    return PyUnicode_RPartition(self, separator);
12878}
12879
12880PyObject *
12881PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12882{
12883    PyObject *result;
12884
12885    s = PyUnicode_FromObject(s);
12886    if (s == NULL)
12887        return NULL;
12888    if (sep != NULL) {
12889        sep = PyUnicode_FromObject(sep);
12890        if (sep == NULL) {
12891            Py_DECREF(s);
12892            return NULL;
12893        }
12894    }
12895
12896    result = rsplit(s, sep, maxsplit);
12897
12898    Py_DECREF(s);
12899    Py_XDECREF(sep);
12900    return result;
12901}
12902
12903PyDoc_STRVAR(rsplit__doc__,
12904             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12905\n\
12906Return a list of the words in S, using sep as the\n\
12907delimiter string, starting at the end of the string and\n\
12908working to the front.  If maxsplit is given, at most maxsplit\n\
12909splits are done. If sep is not specified, any whitespace string\n\
12910is a separator.");
12911
12912static PyObject*
12913unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12914{
12915    static char *kwlist[] = {"sep", "maxsplit", 0};
12916    PyObject *substring = Py_None;
12917    Py_ssize_t maxcount = -1;
12918
12919    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12920                                     kwlist, &substring, &maxcount))
12921        return NULL;
12922
12923    if (substring == Py_None)
12924        return rsplit(self, NULL, maxcount);
12925    else if (PyUnicode_Check(substring))
12926        return rsplit(self, substring, maxcount);
12927    else
12928        return PyUnicode_RSplit(self, substring, maxcount);
12929}
12930
12931PyDoc_STRVAR(splitlines__doc__,
12932             "S.splitlines([keepends]) -> list of strings\n\
12933\n\
12934Return a list of the lines in S, breaking at line boundaries.\n\
12935Line breaks are not included in the resulting list unless keepends\n\
12936is given and true.");
12937
12938static PyObject*
12939unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12940{
12941    static char *kwlist[] = {"keepends", 0};
12942    int keepends = 0;
12943
12944    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12945                                     kwlist, &keepends))
12946        return NULL;
12947
12948    return PyUnicode_Splitlines(self, keepends);
12949}
12950
12951static
12952PyObject *unicode_str(PyObject *self)
12953{
12954    return unicode_result_unchanged(self);
12955}
12956
12957PyDoc_STRVAR(swapcase__doc__,
12958             "S.swapcase() -> str\n\
12959\n\
12960Return a copy of S with uppercase characters converted to lowercase\n\
12961and vice versa.");
12962
12963static PyObject*
12964unicode_swapcase(PyObject *self)
12965{
12966    if (PyUnicode_READY(self) == -1)
12967        return NULL;
12968    return case_operation(self, do_swapcase);
12969}
12970
12971/*[clinic input]
12972
12973@staticmethod
12974str.maketrans as unicode_maketrans
12975
12976  x: object
12977
12978  y: unicode=NULL
12979
12980  z: unicode=NULL
12981
12982  /
12983
12984Return a translation table usable for str.translate().
12985
12986If there is only one argument, it must be a dictionary mapping Unicode
12987ordinals (integers) or characters to Unicode ordinals, strings or None.
12988Character keys will be then converted to ordinals.
12989If there are two arguments, they must be strings of equal length, and
12990in the resulting dictionary, each character in x will be mapped to the
12991character at the same position in y. If there is a third argument, it
12992must be a string, whose characters will be mapped to None in the result.
12993[clinic start generated code]*/
12994
12995static PyObject *
12996unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12997/*[clinic end generated code: output=a925c89452bd5881 input=7bfbf529a293c6c5]*/
12998{
12999    PyObject *new = NULL, *key, *value;
13000    Py_ssize_t i = 0;
13001    int res;
13002
13003    new = PyDict_New();
13004    if (!new)
13005        return NULL;
13006    if (y != NULL) {
13007        int x_kind, y_kind, z_kind;
13008        void *x_data, *y_data, *z_data;
13009
13010        /* x must be a string too, of equal length */
13011        if (!PyUnicode_Check(x)) {
13012            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13013                            "be a string if there is a second argument");
13014            goto err;
13015        }
13016        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13017            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13018                            "arguments must have equal length");
13019            goto err;
13020        }
13021        /* create entries for translating chars in x to those in y */
13022        x_kind = PyUnicode_KIND(x);
13023        y_kind = PyUnicode_KIND(y);
13024        x_data = PyUnicode_DATA(x);
13025        y_data = PyUnicode_DATA(y);
13026        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13027            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13028            if (!key)
13029                goto err;
13030            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13031            if (!value) {
13032                Py_DECREF(key);
13033                goto err;
13034            }
13035            res = PyDict_SetItem(new, key, value);
13036            Py_DECREF(key);
13037            Py_DECREF(value);
13038            if (res < 0)
13039                goto err;
13040        }
13041        /* create entries for deleting chars in z */
13042        if (z != NULL) {
13043            z_kind = PyUnicode_KIND(z);
13044            z_data = PyUnicode_DATA(z);
13045            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13046                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13047                if (!key)
13048                    goto err;
13049                res = PyDict_SetItem(new, key, Py_None);
13050                Py_DECREF(key);
13051                if (res < 0)
13052                    goto err;
13053            }
13054        }
13055    } else {
13056        int kind;
13057        void *data;
13058
13059        /* x must be a dict */
13060        if (!PyDict_CheckExact(x)) {
13061            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13062                            "to maketrans it must be a dict");
13063            goto err;
13064        }
13065        /* copy entries into the new dict, converting string keys to int keys */
13066        while (PyDict_Next(x, &i, &key, &value)) {
13067            if (PyUnicode_Check(key)) {
13068                /* convert string keys to integer keys */
13069                PyObject *newkey;
13070                if (PyUnicode_GET_LENGTH(key) != 1) {
13071                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13072                                    "table must be of length 1");
13073                    goto err;
13074                }
13075                kind = PyUnicode_KIND(key);
13076                data = PyUnicode_DATA(key);
13077                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13078                if (!newkey)
13079                    goto err;
13080                res = PyDict_SetItem(new, newkey, value);
13081                Py_DECREF(newkey);
13082                if (res < 0)
13083                    goto err;
13084            } else if (PyLong_Check(key)) {
13085                /* just keep integer keys */
13086                if (PyDict_SetItem(new, key, value) < 0)
13087                    goto err;
13088            } else {
13089                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13090                                "be strings or integers");
13091                goto err;
13092            }
13093        }
13094    }
13095    return new;
13096  err:
13097    Py_DECREF(new);
13098    return NULL;
13099}
13100
13101PyDoc_STRVAR(translate__doc__,
13102             "S.translate(table) -> str\n\
13103\n\
13104Return a copy of the string S in which each character has been mapped\n\
13105through the given translation table. The table must implement\n\
13106lookup/indexing via __getitem__, for instance a dictionary or list,\n\
13107mapping Unicode ordinals to Unicode ordinals, strings, or None. If\n\
13108this operation raises LookupError, the character is left untouched.\n\
13109Characters mapped to None are deleted.");
13110
13111static PyObject*
13112unicode_translate(PyObject *self, PyObject *table)
13113{
13114    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13115}
13116
13117PyDoc_STRVAR(upper__doc__,
13118             "S.upper() -> str\n\
13119\n\
13120Return a copy of S converted to uppercase.");
13121
13122static PyObject*
13123unicode_upper(PyObject *self)
13124{
13125    if (PyUnicode_READY(self) == -1)
13126        return NULL;
13127    if (PyUnicode_IS_ASCII(self))
13128        return ascii_upper_or_lower(self, 0);
13129    return case_operation(self, do_upper);
13130}
13131
13132PyDoc_STRVAR(zfill__doc__,
13133             "S.zfill(width) -> str\n\
13134\n\
13135Pad a numeric string S with zeros on the left, to fill a field\n\
13136of the specified width. The string S is never truncated.");
13137
13138static PyObject *
13139unicode_zfill(PyObject *self, PyObject *args)
13140{
13141    Py_ssize_t fill;
13142    PyObject *u;
13143    Py_ssize_t width;
13144    int kind;
13145    void *data;
13146    Py_UCS4 chr;
13147
13148    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13149        return NULL;
13150
13151    if (PyUnicode_READY(self) == -1)
13152        return NULL;
13153
13154    if (PyUnicode_GET_LENGTH(self) >= width)
13155        return unicode_result_unchanged(self);
13156
13157    fill = width - PyUnicode_GET_LENGTH(self);
13158
13159    u = pad(self, fill, 0, '0');
13160
13161    if (u == NULL)
13162        return NULL;
13163
13164    kind = PyUnicode_KIND(u);
13165    data = PyUnicode_DATA(u);
13166    chr = PyUnicode_READ(kind, data, fill);
13167
13168    if (chr == '+' || chr == '-') {
13169        /* move sign to beginning of string */
13170        PyUnicode_WRITE(kind, data, 0, chr);
13171        PyUnicode_WRITE(kind, data, fill, '0');
13172    }
13173
13174    assert(_PyUnicode_CheckConsistency(u, 1));
13175    return u;
13176}
13177
#if 0
/* Compiled out by the surrounding "#if 0".  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *ascii = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return ascii;
}
#endif
13185
13186PyDoc_STRVAR(startswith__doc__,
13187             "S.startswith(prefix[, start[, end]]) -> bool\n\
13188\n\
13189Return True if S starts with the specified prefix, False otherwise.\n\
13190With optional start, test S beginning at that position.\n\
13191With optional end, stop comparing S at that position.\n\
13192prefix can also be a tuple of strings to try.");
13193
13194static PyObject *
13195unicode_startswith(PyObject *self,
13196                   PyObject *args)
13197{
13198    PyObject *subobj;
13199    PyObject *substring;
13200    Py_ssize_t start = 0;
13201    Py_ssize_t end = PY_SSIZE_T_MAX;
13202    int result;
13203
13204    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13205        return NULL;
13206    if (PyTuple_Check(subobj)) {
13207        Py_ssize_t i;
13208        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13209            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13210            if (substring == NULL)
13211                return NULL;
13212            result = tailmatch(self, substring, start, end, -1);
13213            Py_DECREF(substring);
13214            if (result == -1)
13215                return NULL;
13216            if (result) {
13217                Py_RETURN_TRUE;
13218            }
13219        }
13220        /* nothing matched */
13221        Py_RETURN_FALSE;
13222    }
13223    substring = PyUnicode_FromObject(subobj);
13224    if (substring == NULL) {
13225        if (PyErr_ExceptionMatches(PyExc_TypeError))
13226            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13227                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13228        return NULL;
13229    }
13230    result = tailmatch(self, substring, start, end, -1);
13231    Py_DECREF(substring);
13232    if (result == -1)
13233        return NULL;
13234    return PyBool_FromLong(result);
13235}
13236
13237
13238PyDoc_STRVAR(endswith__doc__,
13239             "S.endswith(suffix[, start[, end]]) -> bool\n\
13240\n\
13241Return True if S ends with the specified suffix, False otherwise.\n\
13242With optional start, test S beginning at that position.\n\
13243With optional end, stop comparing S at that position.\n\
13244suffix can also be a tuple of strings to try.");
13245
13246static PyObject *
13247unicode_endswith(PyObject *self,
13248                 PyObject *args)
13249{
13250    PyObject *subobj;
13251    PyObject *substring;
13252    Py_ssize_t start = 0;
13253    Py_ssize_t end = PY_SSIZE_T_MAX;
13254    int result;
13255
13256    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13257        return NULL;
13258    if (PyTuple_Check(subobj)) {
13259        Py_ssize_t i;
13260        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13261            substring = PyUnicode_FromObject(
13262                PyTuple_GET_ITEM(subobj, i));
13263            if (substring == NULL)
13264                return NULL;
13265            result = tailmatch(self, substring, start, end, +1);
13266            Py_DECREF(substring);
13267            if (result == -1)
13268                return NULL;
13269            if (result) {
13270                Py_RETURN_TRUE;
13271            }
13272        }
13273        Py_RETURN_FALSE;
13274    }
13275    substring = PyUnicode_FromObject(subobj);
13276    if (substring == NULL) {
13277        if (PyErr_ExceptionMatches(PyExc_TypeError))
13278            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13279                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13280        return NULL;
13281    }
13282    result = tailmatch(self, substring, start, end, +1);
13283    Py_DECREF(substring);
13284    if (result == -1)
13285        return NULL;
13286    return PyBool_FromLong(result);
13287}
13288
13289Py_LOCAL_INLINE(void)
13290_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13291{
13292    if (!writer->readonly)
13293        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13294    else {
13295        /* Copy-on-write mode: set buffer size to 0 so
13296         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13297         * next write. */
13298        writer->size = 0;
13299    }
13300    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13301    writer->data = PyUnicode_DATA(writer->buffer);
13302    writer->kind = PyUnicode_KIND(writer->buffer);
13303}
13304
13305void
13306_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13307{
13308    memset(writer, 0, sizeof(*writer));
13309#ifdef Py_DEBUG
13310    writer->kind = 5;    /* invalid kind */
13311#endif
13312    writer->min_char = 127;
13313}
13314
13315int
13316_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13317                                 Py_ssize_t length, Py_UCS4 maxchar)
13318{
13319#ifdef MS_WINDOWS
13320   /* On Windows, overallocate by 50% is the best factor */
13321#  define OVERALLOCATE_FACTOR 2
13322#else
13323   /* On Linux, overallocate by 25% is the best factor */
13324#  define OVERALLOCATE_FACTOR 4
13325#endif
13326    Py_ssize_t newlen;
13327    PyObject *newbuffer;
13328
13329    /* ensure that the _PyUnicodeWriter_Prepare macro was used */
13330    assert((maxchar > writer->maxchar && length >= 0)
13331           || length > 0);
13332
13333    if (length > PY_SSIZE_T_MAX - writer->pos) {
13334        PyErr_NoMemory();
13335        return -1;
13336    }
13337    newlen = writer->pos + length;
13338
13339    maxchar = Py_MAX(maxchar, writer->min_char);
13340
13341    if (writer->buffer == NULL) {
13342        assert(!writer->readonly);
13343        if (writer->overallocate
13344            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13345            /* overallocate to limit the number of realloc() */
13346            newlen += newlen / OVERALLOCATE_FACTOR;
13347        }
13348        if (newlen < writer->min_length)
13349            newlen = writer->min_length;
13350
13351        writer->buffer = PyUnicode_New(newlen, maxchar);
13352        if (writer->buffer == NULL)
13353            return -1;
13354    }
13355    else if (newlen > writer->size) {
13356        if (writer->overallocate
13357            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13358            /* overallocate to limit the number of realloc() */
13359            newlen += newlen / OVERALLOCATE_FACTOR;
13360        }
13361        if (newlen < writer->min_length)
13362            newlen = writer->min_length;
13363
13364        if (maxchar > writer->maxchar || writer->readonly) {
13365            /* resize + widen */
13366            newbuffer = PyUnicode_New(newlen, maxchar);
13367            if (newbuffer == NULL)
13368                return -1;
13369            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13370                                          writer->buffer, 0, writer->pos);
13371            Py_DECREF(writer->buffer);
13372            writer->readonly = 0;
13373        }
13374        else {
13375            newbuffer = resize_compact(writer->buffer, newlen);
13376            if (newbuffer == NULL)
13377                return -1;
13378        }
13379        writer->buffer = newbuffer;
13380    }
13381    else if (maxchar > writer->maxchar) {
13382        assert(!writer->readonly);
13383        newbuffer = PyUnicode_New(writer->size, maxchar);
13384        if (newbuffer == NULL)
13385            return -1;
13386        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13387                                      writer->buffer, 0, writer->pos);
13388        Py_DECREF(writer->buffer);
13389        writer->buffer = newbuffer;
13390    }
13391    _PyUnicodeWriter_Update(writer);
13392    return 0;
13393
13394#undef OVERALLOCATE_FACTOR
13395}
13396
13397int
13398_PyUnicodeWriter_PrepareKindInternal(_PyUnicodeWriter *writer,
13399                                     enum PyUnicode_Kind kind)
13400{
13401    Py_UCS4 maxchar;
13402
13403    /* ensure that the _PyUnicodeWriter_PrepareKind macro was used */
13404    assert(writer->kind < kind);
13405
13406    switch (kind)
13407    {
13408    case PyUnicode_1BYTE_KIND: maxchar = 0xff; break;
13409    case PyUnicode_2BYTE_KIND: maxchar = 0xffff; break;
13410    case PyUnicode_4BYTE_KIND: maxchar = 0x10ffff; break;
13411    default:
13412        assert(0 && "invalid kind");
13413        return -1;
13414    }
13415
13416    return _PyUnicodeWriter_PrepareInternal(writer, 0, maxchar);
13417}
13418
13419Py_LOCAL_INLINE(int)
13420_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13421{
13422    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13423        return -1;
13424    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13425    writer->pos++;
13426    return 0;
13427}
13428
13429int
13430_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13431{
13432    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13433}
13434
13435int
13436_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13437{
13438    Py_UCS4 maxchar;
13439    Py_ssize_t len;
13440
13441    if (PyUnicode_READY(str) == -1)
13442        return -1;
13443    len = PyUnicode_GET_LENGTH(str);
13444    if (len == 0)
13445        return 0;
13446    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13447    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13448        if (writer->buffer == NULL && !writer->overallocate) {
13449            assert(_PyUnicode_CheckConsistency(str, 1));
13450            writer->readonly = 1;
13451            Py_INCREF(str);
13452            writer->buffer = str;
13453            _PyUnicodeWriter_Update(writer);
13454            writer->pos += len;
13455            return 0;
13456        }
13457        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13458            return -1;
13459    }
13460    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13461                                  str, 0, len);
13462    writer->pos += len;
13463    return 0;
13464}
13465
13466int
13467_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13468                                Py_ssize_t start, Py_ssize_t end)
13469{
13470    Py_UCS4 maxchar;
13471    Py_ssize_t len;
13472
13473    if (PyUnicode_READY(str) == -1)
13474        return -1;
13475
13476    assert(0 <= start);
13477    assert(end <= PyUnicode_GET_LENGTH(str));
13478    assert(start <= end);
13479
13480    if (end == 0)
13481        return 0;
13482
13483    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13484        return _PyUnicodeWriter_WriteStr(writer, str);
13485
13486    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13487        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13488    else
13489        maxchar = writer->maxchar;
13490    len = end - start;
13491
13492    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13493        return -1;
13494
13495    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13496                                  str, start, len);
13497    writer->pos += len;
13498    return 0;
13499}
13500
13501int
13502_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13503                                  const char *ascii, Py_ssize_t len)
13504{
13505    if (len == -1)
13506        len = strlen(ascii);
13507
13508    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13509
13510    if (writer->buffer == NULL && !writer->overallocate) {
13511        PyObject *str;
13512
13513        str = _PyUnicode_FromASCII(ascii, len);
13514        if (str == NULL)
13515            return -1;
13516
13517        writer->readonly = 1;
13518        writer->buffer = str;
13519        _PyUnicodeWriter_Update(writer);
13520        writer->pos += len;
13521        return 0;
13522    }
13523
13524    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13525        return -1;
13526
13527    switch (writer->kind)
13528    {
13529    case PyUnicode_1BYTE_KIND:
13530    {
13531        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13532        Py_UCS1 *data = writer->data;
13533
13534        Py_MEMCPY(data + writer->pos, str, len);
13535        break;
13536    }
13537    case PyUnicode_2BYTE_KIND:
13538    {
13539        _PyUnicode_CONVERT_BYTES(
13540            Py_UCS1, Py_UCS2,
13541            ascii, ascii + len,
13542            (Py_UCS2 *)writer->data + writer->pos);
13543        break;
13544    }
13545    case PyUnicode_4BYTE_KIND:
13546    {
13547        _PyUnicode_CONVERT_BYTES(
13548            Py_UCS1, Py_UCS4,
13549            ascii, ascii + len,
13550            (Py_UCS4 *)writer->data + writer->pos);
13551        break;
13552    }
13553    default:
13554        assert(0);
13555    }
13556
13557    writer->pos += len;
13558    return 0;
13559}
13560
13561int
13562_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13563                                   const char *str, Py_ssize_t len)
13564{
13565    Py_UCS4 maxchar;
13566
13567    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13568    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13569        return -1;
13570    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13571    writer->pos += len;
13572    return 0;
13573}
13574
13575PyObject *
13576_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13577{
13578    PyObject *str;
13579    if (writer->pos == 0) {
13580        Py_CLEAR(writer->buffer);
13581        _Py_RETURN_UNICODE_EMPTY();
13582    }
13583    if (writer->readonly) {
13584        str = writer->buffer;
13585        writer->buffer = NULL;
13586        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13587        return str;
13588    }
13589    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13590        PyObject *newbuffer;
13591        newbuffer = resize_compact(writer->buffer, writer->pos);
13592        if (newbuffer == NULL) {
13593            Py_CLEAR(writer->buffer);
13594            return NULL;
13595        }
13596        writer->buffer = newbuffer;
13597    }
13598    str = writer->buffer;
13599    writer->buffer = NULL;
13600    assert(_PyUnicode_CheckConsistency(str, 1));
13601    return unicode_result_ready(str);
13602}
13603
13604void
13605_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13606{
13607    Py_CLEAR(writer->buffer);
13608}
13609
13610#include "stringlib/unicode_format.h"
13611
13612PyDoc_STRVAR(format__doc__,
13613             "S.format(*args, **kwargs) -> str\n\
13614\n\
13615Return a formatted version of S, using substitutions from args and kwargs.\n\
13616The substitutions are identified by braces ('{' and '}').");
13617
13618PyDoc_STRVAR(format_map__doc__,
13619             "S.format_map(mapping) -> str\n\
13620\n\
13621Return a formatted version of S, using substitutions from mapping.\n\
13622The substitutions are identified by braces ('{' and '}').");
13623
13624static PyObject *
13625unicode__format__(PyObject* self, PyObject* args)
13626{
13627    PyObject *format_spec;
13628    _PyUnicodeWriter writer;
13629    int ret;
13630
13631    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13632        return NULL;
13633
13634    if (PyUnicode_READY(self) == -1)
13635        return NULL;
13636    _PyUnicodeWriter_Init(&writer);
13637    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13638                                          self, format_spec, 0,
13639                                          PyUnicode_GET_LENGTH(format_spec));
13640    if (ret == -1) {
13641        _PyUnicodeWriter_Dealloc(&writer);
13642        return NULL;
13643    }
13644    return _PyUnicodeWriter_Finish(&writer);
13645}
13646
13647PyDoc_STRVAR(p_format__doc__,
13648             "S.__format__(format_spec) -> str\n\
13649\n\
13650Return a formatted version of S as described by format_spec.");
13651
13652static PyObject *
13653unicode__sizeof__(PyObject *v)
13654{
13655    Py_ssize_t size;
13656
13657    /* If it's a compact object, account for base structure +
13658       character data. */
13659    if (PyUnicode_IS_COMPACT_ASCII(v))
13660        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13661    else if (PyUnicode_IS_COMPACT(v))
13662        size = sizeof(PyCompactUnicodeObject) +
13663            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13664    else {
13665        /* If it is a two-block object, account for base object, and
13666           for character block if present. */
13667        size = sizeof(PyUnicodeObject);
13668        if (_PyUnicode_DATA_ANY(v))
13669            size += (PyUnicode_GET_LENGTH(v) + 1) *
13670                PyUnicode_KIND(v);
13671    }
13672    /* If the wstr pointer is present, account for it unless it is shared
13673       with the data pointer. Check if the data is not shared. */
13674    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13675        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13676    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13677        size += PyUnicode_UTF8_LENGTH(v) + 1;
13678
13679    return PyLong_FromSsize_t(size);
13680}
13681
13682PyDoc_STRVAR(sizeof__doc__,
13683             "S.__sizeof__() -> size of S in memory, in bytes");
13684
13685static PyObject *
13686unicode_getnewargs(PyObject *v)
13687{
13688    PyObject *copy = _PyUnicode_Copy(v);
13689    if (!copy)
13690        return NULL;
13691    return Py_BuildValue("(N)", copy);
13692}
13693
13694static PyMethodDef unicode_methods[] = {
13695    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13696    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13697    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13698    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13699    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13700    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13701    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13702    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13703    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13704    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13705    {"expandtabs", (PyCFunction) unicode_expandtabs,
13706     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13707    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13708    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13709    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13710    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13711    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13712    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13713    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13714    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13715    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13716    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13717    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13718    {"splitlines", (PyCFunction) unicode_splitlines,
13719     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13720    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13721    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13722    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13723    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13724    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13725    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13726    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13727    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13728    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13729    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13730    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13731    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13732    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13733    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13734    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13735    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13736    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13737    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13738    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13739    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13740    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13741    UNICODE_MAKETRANS_METHODDEF
13742    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13743#if 0
13744    /* These methods are just used for debugging the implementation. */
13745    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13746#endif
13747
13748    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13749    {NULL, NULL}
13750};
13751
13752static PyObject *
13753unicode_mod(PyObject *v, PyObject *w)
13754{
13755    if (!PyUnicode_Check(v))
13756        Py_RETURN_NOTIMPLEMENTED;
13757    return PyUnicode_Format(v, w);
13758}
13759
13760static PyNumberMethods unicode_as_number = {
13761    0,              /*nb_add*/
13762    0,              /*nb_subtract*/
13763    0,              /*nb_multiply*/
13764    unicode_mod,            /*nb_remainder*/
13765};
13766
13767static PySequenceMethods unicode_as_sequence = {
13768    (lenfunc) unicode_length,       /* sq_length */
13769    PyUnicode_Concat,           /* sq_concat */
13770    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13771    (ssizeargfunc) unicode_getitem,     /* sq_item */
13772    0,                  /* sq_slice */
13773    0,                  /* sq_ass_item */
13774    0,                  /* sq_ass_slice */
13775    PyUnicode_Contains,         /* sq_contains */
13776};
13777
13778static PyObject*
13779unicode_subscript(PyObject* self, PyObject* item)
13780{
13781    if (PyUnicode_READY(self) == -1)
13782        return NULL;
13783
13784    if (PyIndex_Check(item)) {
13785        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13786        if (i == -1 && PyErr_Occurred())
13787            return NULL;
13788        if (i < 0)
13789            i += PyUnicode_GET_LENGTH(self);
13790        return unicode_getitem(self, i);
13791    } else if (PySlice_Check(item)) {
13792        Py_ssize_t start, stop, step, slicelength, cur, i;
13793        PyObject *result;
13794        void *src_data, *dest_data;
13795        int src_kind, dest_kind;
13796        Py_UCS4 ch, max_char, kind_limit;
13797
13798        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13799                                 &start, &stop, &step, &slicelength) < 0) {
13800            return NULL;
13801        }
13802
13803        if (slicelength <= 0) {
13804            _Py_RETURN_UNICODE_EMPTY();
13805        } else if (start == 0 && step == 1 &&
13806                   slicelength == PyUnicode_GET_LENGTH(self)) {
13807            return unicode_result_unchanged(self);
13808        } else if (step == 1) {
13809            return PyUnicode_Substring(self,
13810                                       start, start + slicelength);
13811        }
13812        /* General case */
13813        src_kind = PyUnicode_KIND(self);
13814        src_data = PyUnicode_DATA(self);
13815        if (!PyUnicode_IS_ASCII(self)) {
13816            kind_limit = kind_maxchar_limit(src_kind);
13817            max_char = 0;
13818            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13819                ch = PyUnicode_READ(src_kind, src_data, cur);
13820                if (ch > max_char) {
13821                    max_char = ch;
13822                    if (max_char >= kind_limit)
13823                        break;
13824                }
13825            }
13826        }
13827        else
13828            max_char = 127;
13829        result = PyUnicode_New(slicelength, max_char);
13830        if (result == NULL)
13831            return NULL;
13832        dest_kind = PyUnicode_KIND(result);
13833        dest_data = PyUnicode_DATA(result);
13834
13835        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13836            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13837            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13838        }
13839        assert(_PyUnicode_CheckConsistency(result, 1));
13840        return result;
13841    } else {
13842        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13843        return NULL;
13844    }
13845}
13846
13847static PyMappingMethods unicode_as_mapping = {
13848    (lenfunc)unicode_length,        /* mp_length */
13849    (binaryfunc)unicode_subscript,  /* mp_subscript */
13850    (objobjargproc)0,           /* mp_ass_subscript */
13851};
13852
13853
13854/* Helpers for PyUnicode_Format() */
13855
13856struct unicode_formatter_t {
13857    PyObject *args;
13858    int args_owned;
13859    Py_ssize_t arglen, argidx;
13860    PyObject *dict;
13861
13862    enum PyUnicode_Kind fmtkind;
13863    Py_ssize_t fmtcnt, fmtpos;
13864    void *fmtdata;
13865    PyObject *fmtstr;
13866
13867    _PyUnicodeWriter writer;
13868};
13869
13870struct unicode_format_arg_t {
13871    Py_UCS4 ch;
13872    int flags;
13873    Py_ssize_t width;
13874    int prec;
13875    int sign;
13876};
13877
13878static PyObject *
13879unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13880{
13881    Py_ssize_t argidx = ctx->argidx;
13882
13883    if (argidx < ctx->arglen) {
13884        ctx->argidx++;
13885        if (ctx->arglen < 0)
13886            return ctx->args;
13887        else
13888            return PyTuple_GetItem(ctx->args, argidx);
13889    }
13890    PyErr_SetString(PyExc_TypeError,
13891                    "not enough arguments for format string");
13892    return NULL;
13893}
13894
13895/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13896
13897/* Format a float into the writer if the writer is not NULL, or into *p_output
13898   otherwise.
13899
13900   Return 0 on success, raise an exception and return -1 on error. */
13901static int
13902formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13903            PyObject **p_output,
13904            _PyUnicodeWriter *writer)
13905{
13906    char *p;
13907    double x;
13908    Py_ssize_t len;
13909    int prec;
13910    int dtoa_flags;
13911
13912    x = PyFloat_AsDouble(v);
13913    if (x == -1.0 && PyErr_Occurred())
13914        return -1;
13915
13916    prec = arg->prec;
13917    if (prec < 0)
13918        prec = 6;
13919
13920    if (arg->flags & F_ALT)
13921        dtoa_flags = Py_DTSF_ALT;
13922    else
13923        dtoa_flags = 0;
13924    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13925    if (p == NULL)
13926        return -1;
13927    len = strlen(p);
13928    if (writer) {
13929        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13930            PyMem_Free(p);
13931            return -1;
13932        }
13933    }
13934    else
13935        *p_output = _PyUnicode_FromASCII(p, len);
13936    PyMem_Free(p);
13937    return 0;
13938}
13939
13940/* formatlong() emulates the format codes d, u, o, x and X, and
13941 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13942 * Python's regular ints.
13943 * Return value:  a new PyUnicodeObject*, or NULL if error.
13944 *     The output string is of the form
13945 *         "-"? ("0x" | "0X")? digit+
13946 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13947 *         set in flags.  The case of hex digits will be correct,
13948 *     There will be at least prec digits, zero-filled on the left if
13949 *         necessary to get that many.
13950 * val          object to be converted
13951 * flags        bitmask of format flags; only F_ALT is looked at
13952 * prec         minimum number of digits; 0-fill on left if needed
13953 * type         a character in [duoxX]; u acts the same as d
13954 *
13955 * CAUTION:  o, x and X conversions on regular ints can never
13956 * produce a '-' sign, but can for Python's unbounded ints.
13957 */
13958PyObject *
13959_PyUnicode_FormatLong(PyObject *val, int alt, int prec, int type)
13960{
13961    PyObject *result = NULL;
13962    char *buf;
13963    Py_ssize_t i;
13964    int sign;           /* 1 if '-', else 0 */
13965    int len;            /* number of characters */
13966    Py_ssize_t llen;
13967    int numdigits;      /* len == numnondigits + numdigits */
13968    int numnondigits = 0;
13969
13970    /* Avoid exceeding SSIZE_T_MAX */
13971    if (prec > INT_MAX-3) {
13972        PyErr_SetString(PyExc_OverflowError,
13973                        "precision too large");
13974        return NULL;
13975    }
13976
13977    assert(PyLong_Check(val));
13978
13979    switch (type) {
13980    default:
13981        assert(!"'type' not in [diuoxX]");
13982    case 'd':
13983    case 'i':
13984    case 'u':
13985        /* int and int subclasses should print numerically when a numeric */
13986        /* format code is used (see issue18780) */
13987        result = PyNumber_ToBase(val, 10);
13988        break;
13989    case 'o':
13990        numnondigits = 2;
13991        result = PyNumber_ToBase(val, 8);
13992        break;
13993    case 'x':
13994    case 'X':
13995        numnondigits = 2;
13996        result = PyNumber_ToBase(val, 16);
13997        break;
13998    }
13999    if (!result)
14000        return NULL;
14001
14002    assert(unicode_modifiable(result));
14003    assert(PyUnicode_IS_READY(result));
14004    assert(PyUnicode_IS_ASCII(result));
14005
14006    /* To modify the string in-place, there can only be one reference. */
14007    if (Py_REFCNT(result) != 1) {
14008        Py_DECREF(result);
14009        PyErr_BadInternalCall();
14010        return NULL;
14011    }
14012    buf = PyUnicode_DATA(result);
14013    llen = PyUnicode_GET_LENGTH(result);
14014    if (llen > INT_MAX) {
14015        Py_DECREF(result);
14016        PyErr_SetString(PyExc_ValueError,
14017                        "string too large in _PyUnicode_FormatLong");
14018        return NULL;
14019    }
14020    len = (int)llen;
14021    sign = buf[0] == '-';
14022    numnondigits += sign;
14023    numdigits = len - numnondigits;
14024    assert(numdigits > 0);
14025
14026    /* Get rid of base marker unless F_ALT */
14027    if (((alt) == 0 &&
14028        (type == 'o' || type == 'x' || type == 'X'))) {
14029        assert(buf[sign] == '0');
14030        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14031               buf[sign+1] == 'o');
14032        numnondigits -= 2;
14033        buf += 2;
14034        len -= 2;
14035        if (sign)
14036            buf[0] = '-';
14037        assert(len == numnondigits + numdigits);
14038        assert(numdigits > 0);
14039    }
14040
14041    /* Fill with leading zeroes to meet minimum width. */
14042    if (prec > numdigits) {
14043        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14044                                numnondigits + prec);
14045        char *b1;
14046        if (!r1) {
14047            Py_DECREF(result);
14048            return NULL;
14049        }
14050        b1 = PyBytes_AS_STRING(r1);
14051        for (i = 0; i < numnondigits; ++i)
14052            *b1++ = *buf++;
14053        for (i = 0; i < prec - numdigits; i++)
14054            *b1++ = '0';
14055        for (i = 0; i < numdigits; i++)
14056            *b1++ = *buf++;
14057        *b1 = '\0';
14058        Py_DECREF(result);
14059        result = r1;
14060        buf = PyBytes_AS_STRING(result);
14061        len = numnondigits + prec;
14062    }
14063
14064    /* Fix up case for hex conversions. */
14065    if (type == 'X') {
14066        /* Need to convert all lower case letters to upper case.
14067           and need to convert 0x to 0X (and -0x to -0X). */
14068        for (i = 0; i < len; i++)
14069            if (buf[i] >= 'a' && buf[i] <= 'x')
14070                buf[i] -= 'a'-'A';
14071    }
14072    if (!PyUnicode_Check(result)
14073        || buf != PyUnicode_DATA(result)) {
14074        PyObject *unicode;
14075        unicode = _PyUnicode_FromASCII(buf, len);
14076        Py_DECREF(result);
14077        result = unicode;
14078    }
14079    else if (len != PyUnicode_GET_LENGTH(result)) {
14080        if (PyUnicode_Resize(&result, len) < 0)
14081            Py_CLEAR(result);
14082    }
14083    return result;
14084}
14085
14086/* Format an integer or a float as an integer.
14087 * Return 1 if the number has been formatted into the writer,
14088 *        0 if the number has been formatted into *p_output
14089 *       -1 and raise an exception on error */
14090static int
14091mainformatlong(PyObject *v,
14092               struct unicode_format_arg_t *arg,
14093               PyObject **p_output,
14094               _PyUnicodeWriter *writer)
14095{
14096    PyObject *iobj, *res;
14097    char type = (char)arg->ch;
14098
14099    if (!PyNumber_Check(v))
14100        goto wrongtype;
14101
14102    /* make sure number is a type of integer for o, x, and X */
14103    if (!PyLong_Check(v)) {
14104        if (type == 'o' || type == 'x' || type == 'X') {
14105            iobj = PyNumber_Index(v);
14106            if (iobj == NULL) {
14107                if (PyErr_ExceptionMatches(PyExc_TypeError))
14108                    goto wrongtype;
14109                return -1;
14110            }
14111        }
14112        else {
14113            iobj = PyNumber_Long(v);
14114            if (iobj == NULL ) {
14115                if (PyErr_ExceptionMatches(PyExc_TypeError))
14116                    goto wrongtype;
14117                return -1;
14118            }
14119        }
14120        assert(PyLong_Check(iobj));
14121    }
14122    else {
14123        iobj = v;
14124        Py_INCREF(iobj);
14125    }
14126
14127    if (PyLong_CheckExact(v)
14128        && arg->width == -1 && arg->prec == -1
14129        && !(arg->flags & (F_SIGN | F_BLANK))
14130        && type != 'X')
14131    {
14132        /* Fast path */
14133        int alternate = arg->flags & F_ALT;
14134        int base;
14135
14136        switch(type)
14137        {
14138            default:
14139                assert(0 && "'type' not in [diuoxX]");
14140            case 'd':
14141            case 'i':
14142            case 'u':
14143                base = 10;
14144                break;
14145            case 'o':
14146                base = 8;
14147                break;
14148            case 'x':
14149            case 'X':
14150                base = 16;
14151                break;
14152        }
14153
14154        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14155            Py_DECREF(iobj);
14156            return -1;
14157        }
14158        Py_DECREF(iobj);
14159        return 1;
14160    }
14161
14162    res = _PyUnicode_FormatLong(iobj, arg->flags & F_ALT, arg->prec, type);
14163    Py_DECREF(iobj);
14164    if (res == NULL)
14165        return -1;
14166    *p_output = res;
14167    return 0;
14168
14169wrongtype:
14170    switch(type)
14171    {
14172        case 'o':
14173        case 'x':
14174        case 'X':
14175            PyErr_Format(PyExc_TypeError,
14176                    "%%%c format: an integer is required, "
14177                    "not %.200s",
14178                    type, Py_TYPE(v)->tp_name);
14179            break;
14180        default:
14181            PyErr_Format(PyExc_TypeError,
14182                    "%%%c format: a number is required, "
14183                    "not %.200s",
14184                    type, Py_TYPE(v)->tp_name);
14185            break;
14186    }
14187    return -1;
14188}
14189
14190static Py_UCS4
14191formatchar(PyObject *v)
14192{
14193    /* presume that the buffer is at least 3 characters long */
14194    if (PyUnicode_Check(v)) {
14195        if (PyUnicode_GET_LENGTH(v) == 1) {
14196            return PyUnicode_READ_CHAR(v, 0);
14197        }
14198        goto onError;
14199    }
14200    else {
14201        PyObject *iobj;
14202        long x;
14203        /* make sure number is a type of integer */
14204        if (!PyLong_Check(v)) {
14205            iobj = PyNumber_Index(v);
14206            if (iobj == NULL) {
14207                goto onError;
14208            }
14209            v = iobj;
14210            Py_DECREF(iobj);
14211        }
14212        /* Integer input truncated to a character */
14213        x = PyLong_AsLong(v);
14214        if (x == -1 && PyErr_Occurred())
14215            goto onError;
14216
14217        if (x < 0 || x > MAX_UNICODE) {
14218            PyErr_SetString(PyExc_OverflowError,
14219                            "%c arg not in range(0x110000)");
14220            return (Py_UCS4) -1;
14221        }
14222
14223        return (Py_UCS4) x;
14224    }
14225
14226  onError:
14227    PyErr_SetString(PyExc_TypeError,
14228                    "%c requires int or char");
14229    return (Py_UCS4) -1;
14230}
14231
14232/* Parse options of an argument: flags, width, precision.
14233   Handle also "%(name)" syntax.
14234
14235   Return 0 if the argument has been formatted into arg->str.
14236   Return 1 if the argument has been written into ctx->writer,
14237   Raise an exception and return -1 on error. */
14238static int
14239unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14240                         struct unicode_format_arg_t *arg)
14241{
14242#define FORMAT_READ(ctx) \
14243        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14244
14245    PyObject *v;
14246
14247    if (arg->ch == '(') {
14248        /* Get argument value from a dictionary. Example: "%(name)s". */
14249        Py_ssize_t keystart;
14250        Py_ssize_t keylen;
14251        PyObject *key;
14252        int pcount = 1;
14253
14254        if (ctx->dict == NULL) {
14255            PyErr_SetString(PyExc_TypeError,
14256                            "format requires a mapping");
14257            return -1;
14258        }
14259        ++ctx->fmtpos;
14260        --ctx->fmtcnt;
14261        keystart = ctx->fmtpos;
14262        /* Skip over balanced parentheses */
14263        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14264            arg->ch = FORMAT_READ(ctx);
14265            if (arg->ch == ')')
14266                --pcount;
14267            else if (arg->ch == '(')
14268                ++pcount;
14269            ctx->fmtpos++;
14270        }
14271        keylen = ctx->fmtpos - keystart - 1;
14272        if (ctx->fmtcnt < 0 || pcount > 0) {
14273            PyErr_SetString(PyExc_ValueError,
14274                            "incomplete format key");
14275            return -1;
14276        }
14277        key = PyUnicode_Substring(ctx->fmtstr,
14278                                  keystart, keystart + keylen);
14279        if (key == NULL)
14280            return -1;
14281        if (ctx->args_owned) {
14282            Py_DECREF(ctx->args);
14283            ctx->args_owned = 0;
14284        }
14285        ctx->args = PyObject_GetItem(ctx->dict, key);
14286        Py_DECREF(key);
14287        if (ctx->args == NULL)
14288            return -1;
14289        ctx->args_owned = 1;
14290        ctx->arglen = -1;
14291        ctx->argidx = -2;
14292    }
14293
14294    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14295    while (--ctx->fmtcnt >= 0) {
14296        arg->ch = FORMAT_READ(ctx);
14297        ctx->fmtpos++;
14298        switch (arg->ch) {
14299        case '-': arg->flags |= F_LJUST; continue;
14300        case '+': arg->flags |= F_SIGN; continue;
14301        case ' ': arg->flags |= F_BLANK; continue;
14302        case '#': arg->flags |= F_ALT; continue;
14303        case '0': arg->flags |= F_ZERO; continue;
14304        }
14305        break;
14306    }
14307
14308    /* Parse width. Example: "%10s" => width=10 */
14309    if (arg->ch == '*') {
14310        v = unicode_format_getnextarg(ctx);
14311        if (v == NULL)
14312            return -1;
14313        if (!PyLong_Check(v)) {
14314            PyErr_SetString(PyExc_TypeError,
14315                            "* wants int");
14316            return -1;
14317        }
14318        arg->width = PyLong_AsSsize_t(v);
14319        if (arg->width == -1 && PyErr_Occurred())
14320            return -1;
14321        if (arg->width < 0) {
14322            arg->flags |= F_LJUST;
14323            arg->width = -arg->width;
14324        }
14325        if (--ctx->fmtcnt >= 0) {
14326            arg->ch = FORMAT_READ(ctx);
14327            ctx->fmtpos++;
14328        }
14329    }
14330    else if (arg->ch >= '0' && arg->ch <= '9') {
14331        arg->width = arg->ch - '0';
14332        while (--ctx->fmtcnt >= 0) {
14333            arg->ch = FORMAT_READ(ctx);
14334            ctx->fmtpos++;
14335            if (arg->ch < '0' || arg->ch > '9')
14336                break;
14337            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14338               mixing signed and unsigned comparison. Since arg->ch is between
14339               '0' and '9', casting to int is safe. */
14340            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14341                PyErr_SetString(PyExc_ValueError,
14342                                "width too big");
14343                return -1;
14344            }
14345            arg->width = arg->width*10 + (arg->ch - '0');
14346        }
14347    }
14348
14349    /* Parse precision. Example: "%.3f" => prec=3 */
14350    if (arg->ch == '.') {
14351        arg->prec = 0;
14352        if (--ctx->fmtcnt >= 0) {
14353            arg->ch = FORMAT_READ(ctx);
14354            ctx->fmtpos++;
14355        }
14356        if (arg->ch == '*') {
14357            v = unicode_format_getnextarg(ctx);
14358            if (v == NULL)
14359                return -1;
14360            if (!PyLong_Check(v)) {
14361                PyErr_SetString(PyExc_TypeError,
14362                                "* wants int");
14363                return -1;
14364            }
14365            arg->prec = _PyLong_AsInt(v);
14366            if (arg->prec == -1 && PyErr_Occurred())
14367                return -1;
14368            if (arg->prec < 0)
14369                arg->prec = 0;
14370            if (--ctx->fmtcnt >= 0) {
14371                arg->ch = FORMAT_READ(ctx);
14372                ctx->fmtpos++;
14373            }
14374        }
14375        else if (arg->ch >= '0' && arg->ch <= '9') {
14376            arg->prec = arg->ch - '0';
14377            while (--ctx->fmtcnt >= 0) {
14378                arg->ch = FORMAT_READ(ctx);
14379                ctx->fmtpos++;
14380                if (arg->ch < '0' || arg->ch > '9')
14381                    break;
14382                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14383                    PyErr_SetString(PyExc_ValueError,
14384                                    "precision too big");
14385                    return -1;
14386                }
14387                arg->prec = arg->prec*10 + (arg->ch - '0');
14388            }
14389        }
14390    }
14391
14392    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14393    if (ctx->fmtcnt >= 0) {
14394        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14395            if (--ctx->fmtcnt >= 0) {
14396                arg->ch = FORMAT_READ(ctx);
14397                ctx->fmtpos++;
14398            }
14399        }
14400    }
14401    if (ctx->fmtcnt < 0) {
14402        PyErr_SetString(PyExc_ValueError,
14403                        "incomplete format");
14404        return -1;
14405    }
14406    return 0;
14407
14408#undef FORMAT_READ
14409}
14410
14411/* Format one argument. Supported conversion specifiers:
14412
14413   - "s", "r", "a": any type
14414   - "i", "d", "u": int or float
14415   - "o", "x", "X": int
14416   - "e", "E", "f", "F", "g", "G": float
14417   - "c": int or str (1 character)
14418
14419   When possible, the output is written directly into the Unicode writer
14420   (ctx->writer). A string is created when padding is required.
14421
14422   Return 0 if the argument has been formatted into *p_str,
14423          1 if the argument has been written into ctx->writer,
14424         -1 on error. */
14425static int
14426unicode_format_arg_format(struct unicode_formatter_t *ctx,
14427                          struct unicode_format_arg_t *arg,
14428                          PyObject **p_str)
14429{
14430    PyObject *v;
14431    _PyUnicodeWriter *writer = &ctx->writer;
14432
14433    if (ctx->fmtcnt == 0)
14434        ctx->writer.overallocate = 0;
14435
14436    if (arg->ch == '%') {
14437        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14438            return -1;
14439        return 1;
14440    }
14441
14442    v = unicode_format_getnextarg(ctx);
14443    if (v == NULL)
14444        return -1;
14445
14446
14447    switch (arg->ch) {
14448    case 's':
14449    case 'r':
14450    case 'a':
14451        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14452            /* Fast path */
14453            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14454                return -1;
14455            return 1;
14456        }
14457
14458        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14459            *p_str = v;
14460            Py_INCREF(*p_str);
14461        }
14462        else {
14463            if (arg->ch == 's')
14464                *p_str = PyObject_Str(v);
14465            else if (arg->ch == 'r')
14466                *p_str = PyObject_Repr(v);
14467            else
14468                *p_str = PyObject_ASCII(v);
14469        }
14470        break;
14471
14472    case 'i':
14473    case 'd':
14474    case 'u':
14475    case 'o':
14476    case 'x':
14477    case 'X':
14478    {
14479        int ret = mainformatlong(v, arg, p_str, writer);
14480        if (ret != 0)
14481            return ret;
14482        arg->sign = 1;
14483        break;
14484    }
14485
14486    case 'e':
14487    case 'E':
14488    case 'f':
14489    case 'F':
14490    case 'g':
14491    case 'G':
14492        if (arg->width == -1 && arg->prec == -1
14493            && !(arg->flags & (F_SIGN | F_BLANK)))
14494        {
14495            /* Fast path */
14496            if (formatfloat(v, arg, NULL, writer) == -1)
14497                return -1;
14498            return 1;
14499        }
14500
14501        arg->sign = 1;
14502        if (formatfloat(v, arg, p_str, NULL) == -1)
14503            return -1;
14504        break;
14505
14506    case 'c':
14507    {
14508        Py_UCS4 ch = formatchar(v);
14509        if (ch == (Py_UCS4) -1)
14510            return -1;
14511        if (arg->width == -1 && arg->prec == -1) {
14512            /* Fast path */
14513            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14514                return -1;
14515            return 1;
14516        }
14517        *p_str = PyUnicode_FromOrdinal(ch);
14518        break;
14519    }
14520
14521    default:
14522        PyErr_Format(PyExc_ValueError,
14523                     "unsupported format character '%c' (0x%x) "
14524                     "at index %zd",
14525                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14526                     (int)arg->ch,
14527                     ctx->fmtpos - 1);
14528        return -1;
14529    }
14530    if (*p_str == NULL)
14531        return -1;
14532    assert (PyUnicode_Check(*p_str));
14533    return 0;
14534}
14535
14536static int
14537unicode_format_arg_output(struct unicode_formatter_t *ctx,
14538                          struct unicode_format_arg_t *arg,
14539                          PyObject *str)
14540{
14541    Py_ssize_t len;
14542    enum PyUnicode_Kind kind;
14543    void *pbuf;
14544    Py_ssize_t pindex;
14545    Py_UCS4 signchar;
14546    Py_ssize_t buflen;
14547    Py_UCS4 maxchar;
14548    Py_ssize_t sublen;
14549    _PyUnicodeWriter *writer = &ctx->writer;
14550    Py_UCS4 fill;
14551
14552    fill = ' ';
14553    if (arg->sign && arg->flags & F_ZERO)
14554        fill = '0';
14555
14556    if (PyUnicode_READY(str) == -1)
14557        return -1;
14558
14559    len = PyUnicode_GET_LENGTH(str);
14560    if ((arg->width == -1 || arg->width <= len)
14561        && (arg->prec == -1 || arg->prec >= len)
14562        && !(arg->flags & (F_SIGN | F_BLANK)))
14563    {
14564        /* Fast path */
14565        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14566            return -1;
14567        return 0;
14568    }
14569
14570    /* Truncate the string for "s", "r" and "a" formats
14571       if the precision is set */
14572    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14573        if (arg->prec >= 0 && len > arg->prec)
14574            len = arg->prec;
14575    }
14576
14577    /* Adjust sign and width */
14578    kind = PyUnicode_KIND(str);
14579    pbuf = PyUnicode_DATA(str);
14580    pindex = 0;
14581    signchar = '\0';
14582    if (arg->sign) {
14583        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14584        if (ch == '-' || ch == '+') {
14585            signchar = ch;
14586            len--;
14587            pindex++;
14588        }
14589        else if (arg->flags & F_SIGN)
14590            signchar = '+';
14591        else if (arg->flags & F_BLANK)
14592            signchar = ' ';
14593        else
14594            arg->sign = 0;
14595    }
14596    if (arg->width < len)
14597        arg->width = len;
14598
14599    /* Prepare the writer */
14600    maxchar = writer->maxchar;
14601    if (!(arg->flags & F_LJUST)) {
14602        if (arg->sign) {
14603            if ((arg->width-1) > len)
14604                maxchar = Py_MAX(maxchar, fill);
14605        }
14606        else {
14607            if (arg->width > len)
14608                maxchar = Py_MAX(maxchar, fill);
14609        }
14610    }
14611    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14612        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14613        maxchar = Py_MAX(maxchar, strmaxchar);
14614    }
14615
14616    buflen = arg->width;
14617    if (arg->sign && len == arg->width)
14618        buflen++;
14619    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14620        return -1;
14621
14622    /* Write the sign if needed */
14623    if (arg->sign) {
14624        if (fill != ' ') {
14625            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14626            writer->pos += 1;
14627        }
14628        if (arg->width > len)
14629            arg->width--;
14630    }
14631
14632    /* Write the numeric prefix for "x", "X" and "o" formats
14633       if the alternate form is used.
14634       For example, write "0x" for the "%#x" format. */
14635    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14636        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14637        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14638        if (fill != ' ') {
14639            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14640            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14641            writer->pos += 2;
14642            pindex += 2;
14643        }
14644        arg->width -= 2;
14645        if (arg->width < 0)
14646            arg->width = 0;
14647        len -= 2;
14648    }
14649
14650    /* Pad left with the fill character if needed */
14651    if (arg->width > len && !(arg->flags & F_LJUST)) {
14652        sublen = arg->width - len;
14653        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14654        writer->pos += sublen;
14655        arg->width = len;
14656    }
14657
14658    /* If padding with spaces: write sign if needed and/or numeric prefix if
14659       the alternate form is used */
14660    if (fill == ' ') {
14661        if (arg->sign) {
14662            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14663            writer->pos += 1;
14664        }
14665        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14666            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14667            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14668            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14669            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14670            writer->pos += 2;
14671            pindex += 2;
14672        }
14673    }
14674
14675    /* Write characters */
14676    if (len) {
14677        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14678                                      str, pindex, len);
14679        writer->pos += len;
14680    }
14681
14682    /* Pad right with the fill character if needed */
14683    if (arg->width > len) {
14684        sublen = arg->width - len;
14685        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14686        writer->pos += sublen;
14687    }
14688    return 0;
14689}
14690
14691/* Helper of PyUnicode_Format(): format one arg.
14692   Return 0 on success, raise an exception and return -1 on error. */
14693static int
14694unicode_format_arg(struct unicode_formatter_t *ctx)
14695{
14696    struct unicode_format_arg_t arg;
14697    PyObject *str;
14698    int ret;
14699
14700    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14701    arg.flags = 0;
14702    arg.width = -1;
14703    arg.prec = -1;
14704    arg.sign = 0;
14705    str = NULL;
14706
14707    ret = unicode_format_arg_parse(ctx, &arg);
14708    if (ret == -1)
14709        return -1;
14710
14711    ret = unicode_format_arg_format(ctx, &arg, &str);
14712    if (ret == -1)
14713        return -1;
14714
14715    if (ret != 1) {
14716        ret = unicode_format_arg_output(ctx, &arg, str);
14717        Py_DECREF(str);
14718        if (ret == -1)
14719            return -1;
14720    }
14721
14722    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14723        PyErr_SetString(PyExc_TypeError,
14724                        "not all arguments converted during string formatting");
14725        return -1;
14726    }
14727    return 0;
14728}
14729
14730PyObject *
14731PyUnicode_Format(PyObject *format, PyObject *args)
14732{
14733    struct unicode_formatter_t ctx;
14734
14735    if (format == NULL || args == NULL) {
14736        PyErr_BadInternalCall();
14737        return NULL;
14738    }
14739
14740    ctx.fmtstr = PyUnicode_FromObject(format);
14741    if (ctx.fmtstr == NULL)
14742        return NULL;
14743    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14744        Py_DECREF(ctx.fmtstr);
14745        return NULL;
14746    }
14747    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14748    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14749    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14750    ctx.fmtpos = 0;
14751
14752    _PyUnicodeWriter_Init(&ctx.writer);
14753    ctx.writer.min_length = ctx.fmtcnt + 100;
14754    ctx.writer.overallocate = 1;
14755
14756    if (PyTuple_Check(args)) {
14757        ctx.arglen = PyTuple_Size(args);
14758        ctx.argidx = 0;
14759    }
14760    else {
14761        ctx.arglen = -1;
14762        ctx.argidx = -2;
14763    }
14764    ctx.args_owned = 0;
14765    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14766        ctx.dict = args;
14767    else
14768        ctx.dict = NULL;
14769    ctx.args = args;
14770
14771    while (--ctx.fmtcnt >= 0) {
14772        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14773            Py_ssize_t nonfmtpos;
14774
14775            nonfmtpos = ctx.fmtpos++;
14776            while (ctx.fmtcnt >= 0 &&
14777                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14778                ctx.fmtpos++;
14779                ctx.fmtcnt--;
14780            }
14781            if (ctx.fmtcnt < 0) {
14782                ctx.fmtpos--;
14783                ctx.writer.overallocate = 0;
14784            }
14785
14786            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14787                                                nonfmtpos, ctx.fmtpos) < 0)
14788                goto onError;
14789        }
14790        else {
14791            ctx.fmtpos++;
14792            if (unicode_format_arg(&ctx) == -1)
14793                goto onError;
14794        }
14795    }
14796
14797    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14798        PyErr_SetString(PyExc_TypeError,
14799                        "not all arguments converted during string formatting");
14800        goto onError;
14801    }
14802
14803    if (ctx.args_owned) {
14804        Py_DECREF(ctx.args);
14805    }
14806    Py_DECREF(ctx.fmtstr);
14807    return _PyUnicodeWriter_Finish(&ctx.writer);
14808
14809  onError:
14810    Py_DECREF(ctx.fmtstr);
14811    _PyUnicodeWriter_Dealloc(&ctx.writer);
14812    if (ctx.args_owned) {
14813        Py_DECREF(ctx.args);
14814    }
14815    return NULL;
14816}
14817
14818static PyObject *
14819unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14820
14821static PyObject *
14822unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14823{
14824    PyObject *x = NULL;
14825    static char *kwlist[] = {"object", "encoding", "errors", 0};
14826    char *encoding = NULL;
14827    char *errors = NULL;
14828
14829    if (type != &PyUnicode_Type)
14830        return unicode_subtype_new(type, args, kwds);
14831    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14832                                     kwlist, &x, &encoding, &errors))
14833        return NULL;
14834    if (x == NULL)
14835        _Py_RETURN_UNICODE_EMPTY();
14836    if (encoding == NULL && errors == NULL)
14837        return PyObject_Str(x);
14838    else
14839        return PyUnicode_FromEncodedObject(x, encoding, errors);
14840}
14841
14842static PyObject *
14843unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14844{
14845    PyObject *unicode, *self;
14846    Py_ssize_t length, char_size;
14847    int share_wstr, share_utf8;
14848    unsigned int kind;
14849    void *data;
14850
14851    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14852
14853    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14854    if (unicode == NULL)
14855        return NULL;
14856    assert(_PyUnicode_CHECK(unicode));
14857    if (PyUnicode_READY(unicode) == -1) {
14858        Py_DECREF(unicode);
14859        return NULL;
14860    }
14861
14862    self = type->tp_alloc(type, 0);
14863    if (self == NULL) {
14864        Py_DECREF(unicode);
14865        return NULL;
14866    }
14867    kind = PyUnicode_KIND(unicode);
14868    length = PyUnicode_GET_LENGTH(unicode);
14869
14870    _PyUnicode_LENGTH(self) = length;
14871#ifdef Py_DEBUG
14872    _PyUnicode_HASH(self) = -1;
14873#else
14874    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14875#endif
14876    _PyUnicode_STATE(self).interned = 0;
14877    _PyUnicode_STATE(self).kind = kind;
14878    _PyUnicode_STATE(self).compact = 0;
14879    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14880    _PyUnicode_STATE(self).ready = 1;
14881    _PyUnicode_WSTR(self) = NULL;
14882    _PyUnicode_UTF8_LENGTH(self) = 0;
14883    _PyUnicode_UTF8(self) = NULL;
14884    _PyUnicode_WSTR_LENGTH(self) = 0;
14885    _PyUnicode_DATA_ANY(self) = NULL;
14886
14887    share_utf8 = 0;
14888    share_wstr = 0;
14889    if (kind == PyUnicode_1BYTE_KIND) {
14890        char_size = 1;
14891        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14892            share_utf8 = 1;
14893    }
14894    else if (kind == PyUnicode_2BYTE_KIND) {
14895        char_size = 2;
14896        if (sizeof(wchar_t) == 2)
14897            share_wstr = 1;
14898    }
14899    else {
14900        assert(kind == PyUnicode_4BYTE_KIND);
14901        char_size = 4;
14902        if (sizeof(wchar_t) == 4)
14903            share_wstr = 1;
14904    }
14905
14906    /* Ensure we won't overflow the length. */
14907    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14908        PyErr_NoMemory();
14909        goto onError;
14910    }
14911    data = PyObject_MALLOC((length + 1) * char_size);
14912    if (data == NULL) {
14913        PyErr_NoMemory();
14914        goto onError;
14915    }
14916
14917    _PyUnicode_DATA_ANY(self) = data;
14918    if (share_utf8) {
14919        _PyUnicode_UTF8_LENGTH(self) = length;
14920        _PyUnicode_UTF8(self) = data;
14921    }
14922    if (share_wstr) {
14923        _PyUnicode_WSTR_LENGTH(self) = length;
14924        _PyUnicode_WSTR(self) = (wchar_t *)data;
14925    }
14926
14927    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14928              kind * (length + 1));
14929    assert(_PyUnicode_CheckConsistency(self, 1));
14930#ifdef Py_DEBUG
14931    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14932#endif
14933    Py_DECREF(unicode);
14934    return self;
14935
14936onError:
14937    Py_DECREF(unicode);
14938    Py_DECREF(self);
14939    return NULL;
14940}
14941
14942PyDoc_STRVAR(unicode_doc,
14943"str(object='') -> str\n\
14944str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14945\n\
14946Create a new string object from the given object. If encoding or\n\
14947errors is specified, then the object must expose a data buffer\n\
14948that will be decoded using the given encoding and error handler.\n\
14949Otherwise, returns the result of object.__str__() (if defined)\n\
14950or repr(object).\n\
14951encoding defaults to sys.getdefaultencoding().\n\
14952errors defaults to 'strict'.");
14953
14954static PyObject *unicode_iter(PyObject *seq);
14955
14956PyTypeObject PyUnicode_Type = {
14957    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14958    "str",              /* tp_name */
14959    sizeof(PyUnicodeObject),        /* tp_size */
14960    0,                  /* tp_itemsize */
14961    /* Slots */
14962    (destructor)unicode_dealloc,    /* tp_dealloc */
14963    0,                  /* tp_print */
14964    0,                  /* tp_getattr */
14965    0,                  /* tp_setattr */
14966    0,                  /* tp_reserved */
14967    unicode_repr,           /* tp_repr */
14968    &unicode_as_number,         /* tp_as_number */
14969    &unicode_as_sequence,       /* tp_as_sequence */
14970    &unicode_as_mapping,        /* tp_as_mapping */
14971    (hashfunc) unicode_hash,        /* tp_hash*/
14972    0,                  /* tp_call*/
14973    (reprfunc) unicode_str,     /* tp_str */
14974    PyObject_GenericGetAttr,        /* tp_getattro */
14975    0,                  /* tp_setattro */
14976    0,                  /* tp_as_buffer */
14977    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14978    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14979    unicode_doc,            /* tp_doc */
14980    0,                  /* tp_traverse */
14981    0,                  /* tp_clear */
14982    PyUnicode_RichCompare,      /* tp_richcompare */
14983    0,                  /* tp_weaklistoffset */
14984    unicode_iter,           /* tp_iter */
14985    0,                  /* tp_iternext */
14986    unicode_methods,            /* tp_methods */
14987    0,                  /* tp_members */
14988    0,                  /* tp_getset */
14989    &PyBaseObject_Type,         /* tp_base */
14990    0,                  /* tp_dict */
14991    0,                  /* tp_descr_get */
14992    0,                  /* tp_descr_set */
14993    0,                  /* tp_dictoffset */
14994    0,                  /* tp_init */
14995    0,                  /* tp_alloc */
14996    unicode_new,            /* tp_new */
14997    PyObject_Del,           /* tp_free */
14998};
14999
15000/* Initialize the Unicode implementation */
15001
15002int _PyUnicode_Init(void)
15003{
15004    /* XXX - move this array to unicodectype.c ? */
15005    Py_UCS2 linebreak[] = {
15006        0x000A, /* LINE FEED */
15007        0x000D, /* CARRIAGE RETURN */
15008        0x001C, /* FILE SEPARATOR */
15009        0x001D, /* GROUP SEPARATOR */
15010        0x001E, /* RECORD SEPARATOR */
15011        0x0085, /* NEXT LINE */
15012        0x2028, /* LINE SEPARATOR */
15013        0x2029, /* PARAGRAPH SEPARATOR */
15014    };
15015
15016    /* Init the implementation */
15017    _Py_INCREF_UNICODE_EMPTY();
15018    if (!unicode_empty)
15019        Py_FatalError("Can't create empty string");
15020    Py_DECREF(unicode_empty);
15021
15022    if (PyType_Ready(&PyUnicode_Type) < 0)
15023        Py_FatalError("Can't initialize 'unicode'");
15024
15025    /* initialize the linebreak bloom filter */
15026    bloom_linebreak = make_bloom_mask(
15027        PyUnicode_2BYTE_KIND, linebreak,
15028        Py_ARRAY_LENGTH(linebreak));
15029
15030    if (PyType_Ready(&EncodingMapType) < 0)
15031         Py_FatalError("Can't initialize encoding map type");
15032
15033    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15034        Py_FatalError("Can't initialize field name iterator type");
15035
15036    if (PyType_Ready(&PyFormatterIter_Type) < 0)
15037        Py_FatalError("Can't initialize formatter iter type");
15038
15039    return 0;
15040}
15041
/* Finalize the Unicode implementation */

/* There is nothing to release here: always report 0. */
int
PyUnicode_ClearFreeList(void)
{
    const int released = 0;

    return released;
}
15049
15050void
15051_PyUnicode_Fini(void)
15052{
15053    int i;
15054
15055    Py_CLEAR(unicode_empty);
15056
15057    for (i = 0; i < 256; i++)
15058        Py_CLEAR(unicode_latin1[i]);
15059    _PyUnicode_ClearStaticStrings();
15060    (void)PyUnicode_ClearFreeList();
15061}
15062
15063void
15064PyUnicode_InternInPlace(PyObject **p)
15065{
15066    PyObject *s = *p;
15067    PyObject *t;
15068#ifdef Py_DEBUG
15069    assert(s != NULL);
15070    assert(_PyUnicode_CHECK(s));
15071#else
15072    if (s == NULL || !PyUnicode_Check(s))
15073        return;
15074#endif
15075    /* If it's a subclass, we don't really know what putting
15076       it in the interned dict might do. */
15077    if (!PyUnicode_CheckExact(s))
15078        return;
15079    if (PyUnicode_CHECK_INTERNED(s))
15080        return;
15081    if (interned == NULL) {
15082        interned = PyDict_New();
15083        if (interned == NULL) {
15084            PyErr_Clear(); /* Don't leave an exception */
15085            return;
15086        }
15087    }
15088    /* It might be that the GetItem call fails even
15089       though the key is present in the dictionary,
15090       namely when this happens during a stack overflow. */
15091    Py_ALLOW_RECURSION
15092    t = PyDict_GetItem(interned, s);
15093    Py_END_ALLOW_RECURSION
15094
15095    if (t) {
15096        Py_INCREF(t);
15097        Py_DECREF(*p);
15098        *p = t;
15099        return;
15100    }
15101
15102    PyThreadState_GET()->recursion_critical = 1;
15103    if (PyDict_SetItem(interned, s, s) < 0) {
15104        PyErr_Clear();
15105        PyThreadState_GET()->recursion_critical = 0;
15106        return;
15107    }
15108    PyThreadState_GET()->recursion_critical = 0;
15109    /* The two references in interned are not counted by refcnt.
15110       The deallocator will take care of this */
15111    Py_REFCNT(s) -= 2;
15112    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15113}
15114
15115void
15116PyUnicode_InternImmortal(PyObject **p)
15117{
15118    PyUnicode_InternInPlace(p);
15119    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15120        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15121        Py_INCREF(*p);
15122    }
15123}
15124
15125PyObject *
15126PyUnicode_InternFromString(const char *cp)
15127{
15128    PyObject *s = PyUnicode_FromString(cp);
15129    if (s == NULL)
15130        return NULL;
15131    PyUnicode_InternInPlace(&s);
15132    return s;
15133}
15134
15135void
15136_Py_ReleaseInternedUnicodeStrings(void)
15137{
15138    PyObject *keys;
15139    PyObject *s;
15140    Py_ssize_t i, n;
15141    Py_ssize_t immortal_size = 0, mortal_size = 0;
15142
15143    if (interned == NULL || !PyDict_Check(interned))
15144        return;
15145    keys = PyDict_Keys(interned);
15146    if (keys == NULL || !PyList_Check(keys)) {
15147        PyErr_Clear();
15148        return;
15149    }
15150
15151    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15152       detector, interned unicode strings are not forcibly deallocated;
15153       rather, we give them their stolen references back, and then clear
15154       and DECREF the interned dict. */
15155
15156    n = PyList_GET_SIZE(keys);
15157    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15158            n);
15159    for (i = 0; i < n; i++) {
15160        s = PyList_GET_ITEM(keys, i);
15161        if (PyUnicode_READY(s) == -1) {
15162            assert(0 && "could not ready string");
15163            fprintf(stderr, "could not ready string\n");
15164        }
15165        switch (PyUnicode_CHECK_INTERNED(s)) {
15166        case SSTATE_NOT_INTERNED:
15167            /* XXX Shouldn't happen */
15168            break;
15169        case SSTATE_INTERNED_IMMORTAL:
15170            Py_REFCNT(s) += 1;
15171            immortal_size += PyUnicode_GET_LENGTH(s);
15172            break;
15173        case SSTATE_INTERNED_MORTAL:
15174            Py_REFCNT(s) += 2;
15175            mortal_size += PyUnicode_GET_LENGTH(s);
15176            break;
15177        default:
15178            Py_FatalError("Inconsistent interned string state.");
15179        }
15180        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15181    }
15182    fprintf(stderr, "total size of all interned strings: "
15183            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15184            "mortal/immortal\n", mortal_size, immortal_size);
15185    Py_DECREF(keys);
15186    PyDict_Clear(interned);
15187    Py_CLEAR(interned);
15188}
15189
15190
15191/********************* Unicode Iterator **************************/
15192
/* State of a str iterator: the string being walked and the position of the
   next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15198
15199static void
15200unicodeiter_dealloc(unicodeiterobject *it)
15201{
15202    _PyObject_GC_UNTRACK(it);
15203    Py_XDECREF(it->it_seq);
15204    PyObject_GC_Del(it);
15205}
15206
/* GC traversal: report the underlying string (the only object reference the
   iterator holds) to the visitor.  Returns 0 unless Py_VISIT returns early
   with the visitor's result. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT relies on the parameters being named `visit` and `arg`. */
    Py_VISIT(it->it_seq);
    return 0;
}
15213
15214static PyObject *
15215unicodeiter_next(unicodeiterobject *it)
15216{
15217    PyObject *seq, *item;
15218
15219    assert(it != NULL);
15220    seq = it->it_seq;
15221    if (seq == NULL)
15222        return NULL;
15223    assert(_PyUnicode_CHECK(seq));
15224
15225    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15226        int kind = PyUnicode_KIND(seq);
15227        void *data = PyUnicode_DATA(seq);
15228        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15229        item = PyUnicode_FromOrdinal(chr);
15230        if (item != NULL)
15231            ++it->it_index;
15232        return item;
15233    }
15234
15235    Py_DECREF(seq);
15236    it->it_seq = NULL;
15237    return NULL;
15238}
15239
15240static PyObject *
15241unicodeiter_len(unicodeiterobject *it)
15242{
15243    Py_ssize_t len = 0;
15244    if (it->it_seq)
15245        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15246    return PyLong_FromSsize_t(len);
15247}
15248
15249PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15250
15251static PyObject *
15252unicodeiter_reduce(unicodeiterobject *it)
15253{
15254    if (it->it_seq != NULL) {
15255        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15256                             it->it_seq, it->it_index);
15257    } else {
15258        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15259        if (u == NULL)
15260            return NULL;
15261        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15262    }
15263}
15264
15265PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15266
15267static PyObject *
15268unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15269{
15270    Py_ssize_t index = PyLong_AsSsize_t(state);
15271    if (index == -1 && PyErr_Occurred())
15272        return NULL;
15273    if (it->it_seq != NULL) {
15274        if (index < 0)
15275            index = 0;
15276        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15277            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15278        it->it_index = index;
15279    }
15280    Py_RETURN_NONE;
15281}
15282
15283PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15284
/* Method table of the str iterator type: length hint plus the two methods
   used for pickling. */
static PyMethodDef unicodeiter_methods[] = {
    /* Takes no arguments; returns the number of characters left. */
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    /* Takes no arguments; returns the tuple describing how to rebuild. */
    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
     reduce_doc},
    /* Takes one object (the saved index) and restores the position. */
    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
     setstate_doc},
    {NULL,      NULL}       /* sentinel */
};
15294
/* Type object of the iterator created by unicode_iter().  Instances are
   GC-tracked (Py_TPFLAGS_HAVE_GC) and therefore provide tp_traverse; the
   initializer is positional, so slot order must not be changed. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter: the iterator is its own iterator */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
15327
15328static PyObject *
15329unicode_iter(PyObject *seq)
15330{
15331    unicodeiterobject *it;
15332
15333    if (!PyUnicode_Check(seq)) {
15334        PyErr_BadInternalCall();
15335        return NULL;
15336    }
15337    if (PyUnicode_READY(seq) == -1)
15338        return NULL;
15339    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15340    if (it == NULL)
15341        return NULL;
15342    it->it_index = 0;
15343    Py_INCREF(seq);
15344    it->it_seq = seq;
15345    _PyObject_GC_TRACK(it);
15346    return (PyObject *)it;
15347}
15348
15349
/* Return the number of Py_UNICODE units in `u` before the terminating NUL.

   The length is computed as a pointer difference and returned as size_t.
   The previous `int` counter overflowed (undefined behavior) on strings
   longer than INT_MAX before being converted to the size_t return type. */
size_t
Py_UNICODE_strlen(const Py_UNICODE *u)
{
    const Py_UNICODE *end = u;

    while (*end)
        end++;
    return (size_t)(end - u);
}
15358
/* Copy the NUL-terminated string `s2`, terminator included, into `s1` and
   return `s1`.  The destination must be large enough; no bound is checked. */
Py_UNICODE*
Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    Py_UNICODE *dst = s1;

    for (;;) {
        Py_UNICODE ch = *s2++;

        *dst++ = ch;
        if (ch == 0)
            break;      /* terminator has just been copied */
    }
    return s1;
}
15366
/* Copy at most `n` Py_UNICODE units from `s2` into `s1` and return `s1`.

   Copying stops after the terminating NUL has been copied or after `n`
   units, whichever comes first.  Unlike C strncpy() the remainder of `s1`
   is not zero-filled, and if `s2` holds `n` or more characters the result
   is NOT NUL-terminated.

   The earlier loop stored a character before testing `n`, so it wrote one
   unit even when n == 0 and up to n + 1 units in general, overrunning a
   destination sized for `n` units. */
Py_UNICODE*
Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    size_t i;

    for (i = 0; i < n; i++) {
        Py_UNICODE ch = s2[i];

        s1[i] = ch;
        if (ch == 0)
            break;      /* terminator copied; nothing more to do */
    }
    return s1;
}
15376
/* Append the NUL-terminated string `s2` to the end of `s1` and return `s1`.
   The destination must have room for the combined string; no bound is
   checked. */
Py_UNICODE*
Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
{
    /* Start writing at the current terminator of s1. */
    Py_UNICODE *tail = s1 + Py_UNICODE_strlen(s1);

    Py_UNICODE_strcpy(tail, s2);
    return s1;
}
15385
/* Compare two NUL-terminated Py_UNICODE strings.

   Returns -1, 0 or +1.  The first differing pair of non-NUL characters
   decides the result; if one string ends first, the shorter one compares
   as smaller. */
int
Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
{
    for (; *s1 && *s2; s1++, s2++) {
        if (*s1 != *s2)
            return (*s1 < *s2) ? -1 : +1;
    }

    /* At least one string has ended here. */
    if (*s1)
        return 1;
    if (*s2)
        return -1;
    return 0;
}
15399
/* Compare at most `n` units of two NUL-terminated Py_UNICODE strings.

   Returns -1 or +1 at the first differing position, and 0 if the strings
   are equal up to their terminator or over the first `n` units (including
   the case n == 0). */
int
Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
{
    while (n-- != 0) {
        Py_UNICODE a = *s1++;
        Py_UNICODE b = *s2++;

        if (a != b)
            return (a < b) ? -1 : +1;
        if (a == '\0')
            return 0;   /* both strings ended together */
    }
    return 0;
}
15416
/* Return a pointer to the first occurrence of `c` in the NUL-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is never
   matched, so searching for 0 always yields NULL. */
Py_UNICODE*
Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
{
    while (*s) {
        if (*s == c)
            return (Py_UNICODE*)s;
        s++;
    }
    return NULL;
}
15426
/* Return a pointer to the last occurrence of `c` in the NUL-terminated
   string `s`, or NULL if it does not occur.  The terminator itself is never
   matched. */
Py_UNICODE*
Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
{
    size_t pos = Py_UNICODE_strlen(s);

    /* Walk backwards from the last character to the first. */
    while (pos > 0) {
        pos--;
        if (s[pos] == c)
            return (Py_UNICODE*)(s + pos);
    }
    return NULL;
}
15439
15440Py_UNICODE*
15441PyUnicode_AsUnicodeCopy(PyObject *unicode)
15442{
15443    Py_UNICODE *u, *copy;
15444    Py_ssize_t len, size;
15445
15446    if (!PyUnicode_Check(unicode)) {
15447        PyErr_BadArgument();
15448        return NULL;
15449    }
15450    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15451    if (u == NULL)
15452        return NULL;
15453    /* Ensure we won't overflow the size. */
15454    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15455        PyErr_NoMemory();
15456        return NULL;
15457    }
15458    size = len + 1; /* copy the null character */
15459    size *= sizeof(Py_UNICODE);
15460    copy = PyMem_Malloc(size);
15461    if (copy == NULL) {
15462        PyErr_NoMemory();
15463        return NULL;
15464    }
15465    memcpy(copy, u, size);
15466    return copy;
15467}
15468
15469/* A _string module, to export formatter_parser and formatter_field_name_split
15470   to the string.Formatter class implemented in Python. */
15471
/* Functions exported by the _string module; each takes exactly one
   argument (METH_O). */
static PyMethodDef _string_methods[] = {
    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
     METH_O, PyDoc_STR("split the argument as a field name")},
    {"formatter_parser", (PyCFunction) formatter_parser,
     METH_O, PyDoc_STR("parse the argument as a format string")},
    {NULL, NULL}    /* sentinel */
};
15479
/* Module definition for _string.  The initializer is positional; the
   trailing NULL entries leave the remaining optional slots unset. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,                                  /* per-module state size */
    _string_methods,                    /* method table */
    NULL,
    NULL,
    NULL,
    NULL
};
15491
15492PyMODINIT_FUNC
15493PyInit__string(void)
15494{
15495    return PyModule_Create(&_string_module);
15496}
15497
15498
15499#ifdef __cplusplus
15500}
15501#endif
15502