unicodeobject.c revision fd97a6fb2d501f0ecb104513b5c0c1707dd6f87e
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/*[clinic input]
51class str "PyUnicodeObject *" "&PyUnicode_Type"
52[clinic start generated code]*/
53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
54
55/* --- Globals ------------------------------------------------------------
56
57NOTE: In the interpreter's initialization phase, some globals are currently
58      initialized dynamically as needed. In the process Unicode objects may
59      be created before the Unicode type is ready.
60
61*/
62
63
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Largest code point that a string may contain: 0x10ffff (1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Sanity check used by the macros below.  Debug builds run the structural
   consistency checker (without scanning the characters); other builds only
   perform the type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif

/* --- Raw field accessors ---------------------------------------------
   Plain casts followed by a member access: nothing is checked, and the
   result can be used as an lvalue. */
#define _PyUnicode_UTF8(op) (((PyCompactUnicodeObject*)(op))->utf8)
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
#define _PyUnicode_WSTR(op) (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
#define _PyUnicode_LENGTH(op) (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) (((PyASCIIObject *)(op))->hash)
#define _PyUnicode_DATA_ANY(op) (((PyUnicodeObject*)(op))->data.any)

/* --- Checked accessors -----------------------------------------------
   Same fields, but the object is asserted to be a (consistent) string
   first. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)

/* UTF-8 pointer and length of a ready string.  For a compact ASCII string
   these are the memory directly following the PyASCIIObject header and the
   string length; otherwise the utf8 / utf8_length fields are read. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))

/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
118
/* True if the UTF-8 pointer of the string is its character data itself,
   i.e. no separate UTF-8 buffer exists.  Must not be used on compact ASCII
   strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of the string is its character data itself.
   Every reference goes through the macro argument, so the macro does not
   depend on how the caller names its variable. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data) */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data).  A string that is not ready owns its wstr
   buffer whenever the pointer is set. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Copy a run of characters, converting each one from from_type to to_type
   with a plain cast.

   from_type, to_type: valid type names.
   begin, end:         bounds of the source run ("from_type *"); end is
                       exclusive.
   to:                 destination buffer ("to_type *"); it must have room
                       for (end - begin) elements.

   The bulk of the run is copied four elements per iteration; the remaining
   (fewer than four) elements are copied one at a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
173static PyObject *interned = NULL;
174
175/* The empty Unicode object is shared to improve performance. */
176static PyObject *unicode_empty = NULL;
177
178#define _Py_INCREF_UNICODE_EMPTY()                      \
179    do {                                                \
180        if (unicode_empty != NULL)                      \
181            Py_INCREF(unicode_empty);                   \
182        else {                                          \
183            unicode_empty = PyUnicode_New(0, 0);        \
184            if (unicode_empty != NULL) {                \
185                Py_INCREF(unicode_empty);               \
186                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
187            }                                           \
188        }                                               \
189    } while (0)
190
191#define _Py_RETURN_UNICODE_EMPTY()                      \
192    do {                                                \
193        _Py_INCREF_UNICODE_EMPTY();                     \
194        return unicode_empty;                           \
195    } while (0)
196
197/* Forward declaration */
198Py_LOCAL_INLINE(int)
199_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
200
201/* List of static strings. */
202static _Py_Identifier *static_strings = NULL;
203
204/* Single character Unicode strings in the Latin-1 range are being
205   shared as well. */
206static PyObject *unicode_latin1[256] = {NULL};
207
/* Fast detection of the most frequent whitespace characters: entry c is 1
   when the 7-bit character c is whitespace and 0 otherwise.  The table has
   128 entries, 16 per row below. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
238
239/* forward */
240static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
241static PyObject* get_latin1_char(unsigned char ch);
242static int unicode_modifiable(PyObject *unicode);
243
244
245static PyObject *
246_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
247static PyObject *
248_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
249static PyObject *
250_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
251
252static PyObject *
253unicode_encode_call_errorhandler(const char *errors,
254       PyObject **errorHandler,const char *encoding, const char *reason,
255       PyObject *unicode, PyObject **exceptionObject,
256       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
257
258static void
259raise_encode_exception(PyObject **exceptionObject,
260                       const char *encoding,
261                       PyObject *unicode,
262                       Py_ssize_t startpos, Py_ssize_t endpos,
263                       const char *reason);
264
/* Fast detection of line breaks among 7-bit characters: entry c is 1 when
   the character c is a line break and 0 otherwise (128 entries).  The table
   is read-only lookup data, hence const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294   This function is kept for backward compatibility with the old API. */
295Py_UNICODE
296PyUnicode_GetMax(void)
297{
298#ifdef Py_UNICODE_WIDE
299    return 0x10FFFF;
300#else
301    /* This is actually an illegal character, so it should
302       not be passed to unichr. */
303    return 0xFFFF;
304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-only invariant checker for string objects.

   op:            object to check; must be a string.
   check_content: when non-zero (and the string is not in the legacy wstr
                  representation), every character is scanned to verify
                  that the narrowest possible kind is used and that the
                  data is NUL-terminated.

   Every violated invariant trips an assert(); the function itself always
   returns 1, so that it can be used inside assert(). */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        /* compact ASCII: only the PyASCIIObject header is present */
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;
        /* the kind whose character size equals sizeof(wchar_t) */
#if SIZEOF_WCHAR_T == 2
        const unsigned int wchar_sized_kind = PyUnicode_2BYTE_KIND;
#else
        const unsigned int wchar_sized_kind = PyUnicode_4BYTE_KIND;
#endif

        if (ascii->state.compact == 1) {
            /* compact, non-ASCII: characters follow the compact header */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else if (kind == PyUnicode_WCHAR_KIND) {
            /* legacy string that has not been made ready: wstr only */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(ascii->length == 0);
            assert(ascii->hash == -1);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 0);
            assert(ascii->state.interned == SSTATE_NOT_INTERNED);
            assert(ascii->wstr != NULL);
            assert(data == NULL);
            assert(compact->utf8 == NULL);
        }
        else {
            /* legacy string that has been made ready */
            data = ((PyUnicodeObject *)op)->data.any;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.compact == 0);
            assert(ascii->state.ready == 1);
            assert(data != NULL);
            if (ascii->state.ascii) {
                assert (compact->utf8 == data);
                assert (compact->utf8_length == ascii->length);
            }
            else
                assert (compact->utf8 != data);
        }

        /* wstr shares the data exactly when the character size matches
           wchar_t */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (kind == wchar_sized_kind) {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            }
            else
                assert(ascii->wstr != data);
        }

        /* a missing cache must have a zero length */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }

    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i = 0;
        Py_UCS4 maxchar = 0;
        void *data = PyUnicode_DATA(ascii);

        while (i < ascii->length) {
            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
            if (maxchar < ch)
                maxchar = ch;
            i++;
        }

        switch (kind) {
        case PyUnicode_1BYTE_KIND:
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
            break;
        case PyUnicode_2BYTE_KIND:
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
            break;
        default:
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
            break;
        }
        /* the data must be NUL-terminated */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427    Py_ssize_t len;
428
429    len = _PyUnicode_WSTR_LENGTH(unicode);
430    if (len == 0) {
431        Py_DECREF(unicode);
432        _Py_RETURN_UNICODE_EMPTY();
433    }
434
435    if (len == 1) {
436        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
437        if ((Py_UCS4)ch < 256) {
438            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439            Py_DECREF(unicode);
440            return latin1_char;
441        }
442    }
443
444    if (_PyUnicode_Ready(unicode) < 0) {
445        Py_DECREF(unicode);
446        return NULL;
447    }
448#else
449    assert(Py_REFCNT(unicode) == 1);
450
451    /* don't make the result ready in debug mode to ensure that the caller
452       makes the string ready before using it */
453    assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455    return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461    Py_ssize_t length;
462
463    length = PyUnicode_GET_LENGTH(unicode);
464    if (length == 0) {
465        if (unicode != unicode_empty) {
466            Py_DECREF(unicode);
467            _Py_RETURN_UNICODE_EMPTY();
468        }
469        return unicode_empty;
470    }
471
472    if (length == 1) {
473        void *data = PyUnicode_DATA(unicode);
474        int kind = PyUnicode_KIND(unicode);
475        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
476        if (ch < 256) {
477            PyObject *latin1_char = unicode_latin1[ch];
478            if (latin1_char != NULL) {
479                if (unicode != latin1_char) {
480                    Py_INCREF(latin1_char);
481                    Py_DECREF(unicode);
482                }
483                return latin1_char;
484            }
485            else {
486                assert(_PyUnicode_CheckConsistency(unicode, 1));
487                Py_INCREF(unicode);
488                unicode_latin1[ch] = unicode;
489                return unicode;
490            }
491        }
492    }
493
494    assert(_PyUnicode_CheckConsistency(unicode, 1));
495    return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501    assert(_PyUnicode_CHECK(unicode));
502    if (PyUnicode_IS_READY(unicode))
503        return unicode_result_ready(unicode);
504    else
505        return unicode_result_wchar(unicode);
506}
507
508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511    if (PyUnicode_CheckExact(unicode)) {
512        if (PyUnicode_READY(unicode) == -1)
513            return NULL;
514        Py_INCREF(unicode);
515        return unicode;
516    }
517    else
518        /* Subtype -- return genuine unicode string with the same value. */
519        return _PyUnicode_Copy(unicode);
520}
521
#ifdef HAVE_MBCS
/* Windows version information; only declared when HAVE_MBCS is defined.
   NOTE(review): neither filled in nor read in this part of the file --
   confirm where it is initialized. */
static OSVERSIONINFOEX winver;
#endif
525
526/* --- Bloom Filters ----------------------------------------------------- */
527
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low-order
   bits of each Unicode character (its value modulo BLOOM_WIDTH) as the
   bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
534#if LONG_BIT >= 128
535#define BLOOM_WIDTH 128
536#elif LONG_BIT >= 64
537#define BLOOM_WIDTH 64
538#elif LONG_BIT >= 32
539#define BLOOM_WIDTH 32
540#else
541#error "LONG_BIT is smaller than 32"
542#endif
543
/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask of the line break characters.  It starts with every bit set,
   so that no character is filtered out before the mask is set up. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by ch is set in mask, i.e. if ch may be one
   of the characters the mask was built from; zero means it definitely is
   not.  Both arguments are parenthesized, so any expression can be passed
   as the mask (e.g. the union of two masks). */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break.  Characters below 128 are looked up in the
   ascii_linebreak table; for the others the bloom mask is consulted first
   and Py_UNICODE_ISLINEBREAK() is only called when the mask matches. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
553
554Py_LOCAL_INLINE(BLOOM_MASK)
555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
556{
557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
558    do {                                               \
559        TYPE *data = (TYPE *)PTR;                      \
560        TYPE *end = data + LEN;                        \
561        Py_UCS4 ch;                                    \
562        for (; data != end; data++) {                  \
563            ch = *data;                                \
564            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565        }                                              \
566        break;                                         \
567    } while (0)
568
569    /* calculate simple bloom-style bitmask for a given unicode string */
570
571    BLOOM_MASK mask;
572
573    mask = 0;
574    switch (kind) {
575    case PyUnicode_1BYTE_KIND:
576        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577        break;
578    case PyUnicode_2BYTE_KIND:
579        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580        break;
581    case PyUnicode_4BYTE_KIND:
582        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583        break;
584    default:
585        assert(0);
586    }
587    return mask;
588
589#undef BLOOM_UPDATE
590}
591
592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
610#include "stringlib/replace.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
621#include "stringlib/replace.h"
622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
632#include "stringlib/replace.h"
633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
641#include "stringlib/undef.h"
642
643/* --- Unicode Object ----------------------------------------------------- */
644
645static PyObject *
646fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
647
648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649                                     Py_ssize_t size, Py_UCS4 ch,
650                                     int direction)
651{
652    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654    switch (kind) {
655    case PyUnicode_1BYTE_KIND:
656        {
657            Py_UCS1 ch1 = (Py_UCS1) ch;
658            if (ch1 == ch)
659                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660            else
661                return -1;
662        }
663    case PyUnicode_2BYTE_KIND:
664        {
665            Py_UCS2 ch2 = (Py_UCS2) ch;
666            if (ch2 == ch)
667                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668            else
669                return -1;
670        }
671    case PyUnicode_4BYTE_KIND:
672        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673    default:
674        assert(0);
675        return -1;
676    }
677}
678
#ifdef Py_DEBUG
/* Fill the newly added part of a string with invalid characters to detect
   bugs earlier.

   Every byte of the characters [old_length, length) is set to 0xff; nothing
   is written when the string did not grow.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least
   for ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is
   an invalid character in Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_size = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_size, 0xff,
               (new_length - old_length) * char_size);
    }
}
#endif
697
698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701    Py_ssize_t char_size;
702    Py_ssize_t struct_size;
703    Py_ssize_t new_size;
704    int share_wstr;
705    PyObject *new_unicode;
706#ifdef Py_DEBUG
707    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
710    assert(unicode_modifiable(unicode));
711    assert(PyUnicode_IS_READY(unicode));
712    assert(PyUnicode_IS_COMPACT(unicode));
713
714    char_size = PyUnicode_KIND(unicode);
715    if (PyUnicode_IS_ASCII(unicode))
716        struct_size = sizeof(PyASCIIObject);
717    else
718        struct_size = sizeof(PyCompactUnicodeObject);
719    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
720
721    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722        PyErr_NoMemory();
723        return NULL;
724    }
725    new_size = (struct_size + (length + 1) * char_size);
726
727    _Py_DEC_REFTOTAL;
728    _Py_ForgetReference(unicode);
729
730    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
731    if (new_unicode == NULL) {
732        _Py_NewReference(unicode);
733        PyErr_NoMemory();
734        return NULL;
735    }
736    unicode = new_unicode;
737    _Py_NewReference(unicode);
738
739    _PyUnicode_LENGTH(unicode) = length;
740    if (share_wstr) {
741        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
742        if (!PyUnicode_IS_ASCII(unicode))
743            _PyUnicode_WSTR_LENGTH(unicode) = length;
744    }
745    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746        PyObject_DEL(_PyUnicode_WSTR(unicode));
747        _PyUnicode_WSTR(unicode) = NULL;
748    }
749#ifdef Py_DEBUG
750    unicode_fill_invalid(unicode, old_length);
751#endif
752    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753                    length, 0);
754    assert(_PyUnicode_CheckConsistency(unicode, 0));
755    return unicode;
756}
757
758static int
759resize_inplace(PyObject *unicode, Py_ssize_t length)
760{
761    wchar_t *wstr;
762    Py_ssize_t new_size;
763    assert(!PyUnicode_IS_COMPACT(unicode));
764    assert(Py_REFCNT(unicode) == 1);
765
766    if (PyUnicode_IS_READY(unicode)) {
767        Py_ssize_t char_size;
768        int share_wstr, share_utf8;
769        void *data;
770#ifdef Py_DEBUG
771        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
772#endif
773
774        data = _PyUnicode_DATA_ANY(unicode);
775        char_size = PyUnicode_KIND(unicode);
776        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
777        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
778
779        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
780            PyErr_NoMemory();
781            return -1;
782        }
783        new_size = (length + 1) * char_size;
784
785        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
786        {
787            PyObject_DEL(_PyUnicode_UTF8(unicode));
788            _PyUnicode_UTF8(unicode) = NULL;
789            _PyUnicode_UTF8_LENGTH(unicode) = 0;
790        }
791
792        data = (PyObject *)PyObject_REALLOC(data, new_size);
793        if (data == NULL) {
794            PyErr_NoMemory();
795            return -1;
796        }
797        _PyUnicode_DATA_ANY(unicode) = data;
798        if (share_wstr) {
799            _PyUnicode_WSTR(unicode) = data;
800            _PyUnicode_WSTR_LENGTH(unicode) = length;
801        }
802        if (share_utf8) {
803            _PyUnicode_UTF8(unicode) = data;
804            _PyUnicode_UTF8_LENGTH(unicode) = length;
805        }
806        _PyUnicode_LENGTH(unicode) = length;
807        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
808#ifdef Py_DEBUG
809        unicode_fill_invalid(unicode, old_length);
810#endif
811        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
812            assert(_PyUnicode_CheckConsistency(unicode, 0));
813            return 0;
814        }
815    }
816    assert(_PyUnicode_WSTR(unicode) != NULL);
817
818    /* check for integer overflow */
819    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
820        PyErr_NoMemory();
821        return -1;
822    }
823    new_size = sizeof(wchar_t) * (length + 1);
824    wstr =  _PyUnicode_WSTR(unicode);
825    wstr = PyObject_REALLOC(wstr, new_size);
826    if (!wstr) {
827        PyErr_NoMemory();
828        return -1;
829    }
830    _PyUnicode_WSTR(unicode) = wstr;
831    _PyUnicode_WSTR(unicode)[length] = 0;
832    _PyUnicode_WSTR_LENGTH(unicode) = length;
833    assert(_PyUnicode_CheckConsistency(unicode, 0));
834    return 0;
835}
836
837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840    Py_ssize_t copy_length;
841    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
842        PyObject *copy;
843
844        if (PyUnicode_READY(unicode) == -1)
845            return NULL;
846
847        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848        if (copy == NULL)
849            return NULL;
850
851        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
852        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
853        return copy;
854    }
855    else {
856        PyObject *w;
857
858        w = (PyObject*)_PyUnicode_New(length);
859        if (w == NULL)
860            return NULL;
861        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862        copy_length = Py_MIN(copy_length, length);
863        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864                  copy_length * sizeof(wchar_t));
865        return w;
866    }
867}
868
869/* We allocate one more byte to make sure the string is
870   Ux0000 terminated; some code (e.g. new_identifier)
871   relies on that.
872
873   XXX This allocator could further be enhanced by assuring that the
874   free list never reduces its size below 1.
875
876*/
877
878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
880{
881    PyUnicodeObject *unicode;
882    size_t new_size;
883
884    /* Optimization for empty strings */
885    if (length == 0 && unicode_empty != NULL) {
886        Py_INCREF(unicode_empty);
887        return (PyUnicodeObject*)unicode_empty;
888    }
889
890    /* Ensure we won't overflow the size. */
891    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892        return (PyUnicodeObject *)PyErr_NoMemory();
893    }
894    if (length < 0) {
895        PyErr_SetString(PyExc_SystemError,
896                        "Negative size passed to _PyUnicode_New");
897        return NULL;
898    }
899
900    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901    if (unicode == NULL)
902        return NULL;
903    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
904
905    _PyUnicode_WSTR_LENGTH(unicode) = length;
906    _PyUnicode_HASH(unicode) = -1;
907    _PyUnicode_STATE(unicode).interned = 0;
908    _PyUnicode_STATE(unicode).kind = 0;
909    _PyUnicode_STATE(unicode).compact = 0;
910    _PyUnicode_STATE(unicode).ready = 0;
911    _PyUnicode_STATE(unicode).ascii = 0;
912    _PyUnicode_DATA_ANY(unicode) = NULL;
913    _PyUnicode_LENGTH(unicode) = 0;
914    _PyUnicode_UTF8(unicode) = NULL;
915    _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918    if (!_PyUnicode_WSTR(unicode)) {
919        Py_DECREF(unicode);
920        PyErr_NoMemory();
921        return NULL;
922    }
923
924    /* Initialize the first element to guard against cases where
925     * the caller fails before initializing str -- unicode_resize()
926     * reads str[0], and the Keep-Alive optimization can keep memory
927     * allocated for str alive across a call to unicode_dealloc(unicode).
928     * We don't want unicode_resize to read uninitialized memory in
929     * that case.
930     */
931    _PyUnicode_WSTR(unicode)[0] = 0;
932    _PyUnicode_WSTR(unicode)[length] = 0;
933
934    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
935    return unicode;
936}
937
938static const char*
939unicode_kind_name(PyObject *unicode)
940{
941    /* don't check consistency: unicode_kind_name() is called from
942       _PyUnicode_Dump() */
943    if (!PyUnicode_IS_COMPACT(unicode))
944    {
945        if (!PyUnicode_IS_READY(unicode))
946            return "wstr";
947        switch (PyUnicode_KIND(unicode))
948        {
949        case PyUnicode_1BYTE_KIND:
950            if (PyUnicode_IS_ASCII(unicode))
951                return "legacy ascii";
952            else
953                return "legacy latin1";
954        case PyUnicode_2BYTE_KIND:
955            return "legacy UCS2";
956        case PyUnicode_4BYTE_KIND:
957            return "legacy UCS4";
958        default:
959            return "<legacy invalid kind>";
960        }
961    }
962    assert(PyUnicode_IS_READY(unicode));
963    switch (PyUnicode_KIND(unicode)) {
964    case PyUnicode_1BYTE_KIND:
965        if (PyUnicode_IS_ASCII(unicode))
966            return "ascii";
967        else
968            return "latin1";
969    case PyUnicode_2BYTE_KIND:
970        return "UCS2";
971    case PyUnicode_4BYTE_KIND:
972        return "UCS4";
973    default:
974        return "<invalid compact kind>";
975    }
976}
977
978#ifdef Py_DEBUG
979/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988    printf("obj %p\n", unicode);
989    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994    return PyUnicode_DATA(unicode);
995}
996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000    PyASCIIObject *ascii = (PyASCIIObject *)op;
1001    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003    void *data;
1004
1005    if (ascii->state.compact)
1006    {
1007        if (ascii->state.ascii)
1008            data = (ascii + 1);
1009        else
1010            data = (compact + 1);
1011    }
1012    else
1013        data = unicode->data.any;
1014    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015           unicode_kind_name(op), ascii->length);
1016
1017    if (ascii->wstr == data)
1018        printf("shared ");
1019    printf("wstr=%p", ascii->wstr);
1020
1021    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1022        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1023        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024            printf("shared ");
1025        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026               compact->utf8, compact->utf8_length);
1027    }
1028    printf(", data=%p\n", data);
1029}
1030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035    PyObject *obj;
1036    PyCompactUnicodeObject *unicode;
1037    void *data;
1038    enum PyUnicode_Kind kind;
1039    int is_sharing, is_ascii;
1040    Py_ssize_t char_size;
1041    Py_ssize_t struct_size;
1042
1043    /* Optimization for empty strings */
1044    if (size == 0 && unicode_empty != NULL) {
1045        Py_INCREF(unicode_empty);
1046        return unicode_empty;
1047    }
1048
1049    is_ascii = 0;
1050    is_sharing = 0;
1051    struct_size = sizeof(PyCompactUnicodeObject);
1052    if (maxchar < 128) {
1053        kind = PyUnicode_1BYTE_KIND;
1054        char_size = 1;
1055        is_ascii = 1;
1056        struct_size = sizeof(PyASCIIObject);
1057    }
1058    else if (maxchar < 256) {
1059        kind = PyUnicode_1BYTE_KIND;
1060        char_size = 1;
1061    }
1062    else if (maxchar < 65536) {
1063        kind = PyUnicode_2BYTE_KIND;
1064        char_size = 2;
1065        if (sizeof(wchar_t) == 2)
1066            is_sharing = 1;
1067    }
1068    else {
1069        if (maxchar > MAX_UNICODE) {
1070            PyErr_SetString(PyExc_SystemError,
1071                            "invalid maximum character passed to PyUnicode_New");
1072            return NULL;
1073        }
1074        kind = PyUnicode_4BYTE_KIND;
1075        char_size = 4;
1076        if (sizeof(wchar_t) == 4)
1077            is_sharing = 1;
1078    }
1079
1080    /* Ensure we won't overflow the size. */
1081    if (size < 0) {
1082        PyErr_SetString(PyExc_SystemError,
1083                        "Negative size passed to PyUnicode_New");
1084        return NULL;
1085    }
1086    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087        return PyErr_NoMemory();
1088
1089    /* Duplicated allocation code from _PyObject_New() instead of a call to
1090     * PyObject_New() so we are able to allocate space for the object and
1091     * it's data buffer.
1092     */
1093    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094    if (obj == NULL)
1095        return PyErr_NoMemory();
1096    obj = PyObject_INIT(obj, &PyUnicode_Type);
1097    if (obj == NULL)
1098        return NULL;
1099
1100    unicode = (PyCompactUnicodeObject *)obj;
1101    if (is_ascii)
1102        data = ((PyASCIIObject*)obj) + 1;
1103    else
1104        data = unicode + 1;
1105    _PyUnicode_LENGTH(unicode) = size;
1106    _PyUnicode_HASH(unicode) = -1;
1107    _PyUnicode_STATE(unicode).interned = 0;
1108    _PyUnicode_STATE(unicode).kind = kind;
1109    _PyUnicode_STATE(unicode).compact = 1;
1110    _PyUnicode_STATE(unicode).ready = 1;
1111    _PyUnicode_STATE(unicode).ascii = is_ascii;
1112    if (is_ascii) {
1113        ((char*)data)[size] = 0;
1114        _PyUnicode_WSTR(unicode) = NULL;
1115    }
1116    else if (kind == PyUnicode_1BYTE_KIND) {
1117        ((char*)data)[size] = 0;
1118        _PyUnicode_WSTR(unicode) = NULL;
1119        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1120        unicode->utf8 = NULL;
1121        unicode->utf8_length = 0;
1122    }
1123    else {
1124        unicode->utf8 = NULL;
1125        unicode->utf8_length = 0;
1126        if (kind == PyUnicode_2BYTE_KIND)
1127            ((Py_UCS2*)data)[size] = 0;
1128        else /* kind == PyUnicode_4BYTE_KIND */
1129            ((Py_UCS4*)data)[size] = 0;
1130        if (is_sharing) {
1131            _PyUnicode_WSTR_LENGTH(unicode) = size;
1132            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133        }
1134        else {
1135            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136            _PyUnicode_WSTR(unicode) = NULL;
1137        }
1138    }
1139#ifdef Py_DEBUG
1140    unicode_fill_invalid((PyObject*)unicode, 0);
1141#endif
1142    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1143    return obj;
1144}
1145
1146#if SIZEOF_WCHAR_T == 2
1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1148   will decode surrogate pairs, the other conversions are implemented as macros
1149   for efficiency.
1150
1151   This function assumes that unicode can hold one more code point than wstr
1152   characters for a terminating null character. */
1153static void
1154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1155                              PyObject *unicode)
1156{
1157    const wchar_t *iter;
1158    Py_UCS4 *ucs4_out;
1159
1160    assert(unicode != NULL);
1161    assert(_PyUnicode_CHECK(unicode));
1162    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1163    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1164
1165    for (iter = begin; iter < end; ) {
1166        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1167                           _PyUnicode_GET_LENGTH(unicode)));
1168        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1169            && (iter+1) < end
1170            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1171        {
1172            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1173            iter += 2;
1174        }
1175        else {
1176            *ucs4_out++ = *iter;
1177            iter++;
1178        }
1179    }
1180    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1181                        _PyUnicode_GET_LENGTH(unicode)));
1182
1183}
1184#endif
1185
1186static int
1187unicode_check_modifiable(PyObject *unicode)
1188{
1189    if (!unicode_modifiable(unicode)) {
1190        PyErr_SetString(PyExc_SystemError,
1191                        "Cannot modify a string currently used");
1192        return -1;
1193    }
1194    return 0;
1195}
1196
1197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199                 PyObject *from, Py_ssize_t from_start,
1200                 Py_ssize_t how_many, int check_maxchar)
1201{
1202    unsigned int from_kind, to_kind;
1203    void *from_data, *to_data;
1204
1205    assert(0 <= how_many);
1206    assert(0 <= from_start);
1207    assert(0 <= to_start);
1208    assert(PyUnicode_Check(from));
1209    assert(PyUnicode_IS_READY(from));
1210    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1211
1212    assert(PyUnicode_Check(to));
1213    assert(PyUnicode_IS_READY(to));
1214    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
1216    if (how_many == 0)
1217        return 0;
1218
1219    from_kind = PyUnicode_KIND(from);
1220    from_data = PyUnicode_DATA(from);
1221    to_kind = PyUnicode_KIND(to);
1222    to_data = PyUnicode_DATA(to);
1223
1224#ifdef Py_DEBUG
1225    if (!check_maxchar
1226        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227    {
1228        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229        Py_UCS4 ch;
1230        Py_ssize_t i;
1231        for (i=0; i < how_many; i++) {
1232            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233            assert(ch <= to_maxchar);
1234        }
1235    }
1236#endif
1237
1238    if (from_kind == to_kind) {
1239        if (check_maxchar
1240            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241        {
1242            /* Writing Latin-1 characters into an ASCII string requires to
1243               check that all written characters are pure ASCII */
1244            Py_UCS4 max_char;
1245            max_char = ucs1lib_find_max_char(from_data,
1246                                             (Py_UCS1*)from_data + how_many);
1247            if (max_char >= 128)
1248                return -1;
1249        }
1250        Py_MEMCPY((char*)to_data + to_kind * to_start,
1251                  (char*)from_data + from_kind * from_start,
1252                  to_kind * how_many);
1253    }
1254    else if (from_kind == PyUnicode_1BYTE_KIND
1255             && to_kind == PyUnicode_2BYTE_KIND)
1256    {
1257        _PyUnicode_CONVERT_BYTES(
1258            Py_UCS1, Py_UCS2,
1259            PyUnicode_1BYTE_DATA(from) + from_start,
1260            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261            PyUnicode_2BYTE_DATA(to) + to_start
1262            );
1263    }
1264    else if (from_kind == PyUnicode_1BYTE_KIND
1265             && to_kind == PyUnicode_4BYTE_KIND)
1266    {
1267        _PyUnicode_CONVERT_BYTES(
1268            Py_UCS1, Py_UCS4,
1269            PyUnicode_1BYTE_DATA(from) + from_start,
1270            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271            PyUnicode_4BYTE_DATA(to) + to_start
1272            );
1273    }
1274    else if (from_kind == PyUnicode_2BYTE_KIND
1275             && to_kind == PyUnicode_4BYTE_KIND)
1276    {
1277        _PyUnicode_CONVERT_BYTES(
1278            Py_UCS2, Py_UCS4,
1279            PyUnicode_2BYTE_DATA(from) + from_start,
1280            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281            PyUnicode_4BYTE_DATA(to) + to_start
1282            );
1283    }
1284    else {
1285        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
1287        if (!check_maxchar) {
1288            if (from_kind == PyUnicode_2BYTE_KIND
1289                && to_kind == PyUnicode_1BYTE_KIND)
1290            {
1291                _PyUnicode_CONVERT_BYTES(
1292                    Py_UCS2, Py_UCS1,
1293                    PyUnicode_2BYTE_DATA(from) + from_start,
1294                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295                    PyUnicode_1BYTE_DATA(to) + to_start
1296                    );
1297            }
1298            else if (from_kind == PyUnicode_4BYTE_KIND
1299                     && to_kind == PyUnicode_1BYTE_KIND)
1300            {
1301                _PyUnicode_CONVERT_BYTES(
1302                    Py_UCS4, Py_UCS1,
1303                    PyUnicode_4BYTE_DATA(from) + from_start,
1304                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305                    PyUnicode_1BYTE_DATA(to) + to_start
1306                    );
1307            }
1308            else if (from_kind == PyUnicode_4BYTE_KIND
1309                     && to_kind == PyUnicode_2BYTE_KIND)
1310            {
1311                _PyUnicode_CONVERT_BYTES(
1312                    Py_UCS4, Py_UCS2,
1313                    PyUnicode_4BYTE_DATA(from) + from_start,
1314                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315                    PyUnicode_2BYTE_DATA(to) + to_start
1316                    );
1317            }
1318            else {
1319                assert(0);
1320                return -1;
1321            }
1322        }
1323        else {
1324            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1325            Py_UCS4 ch;
1326            Py_ssize_t i;
1327
1328            for (i=0; i < how_many; i++) {
1329                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1330                if (ch > to_maxchar)
1331                    return -1;
1332                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333            }
1334        }
1335    }
1336    return 0;
1337}
1338
1339void
1340_PyUnicode_FastCopyCharacters(
1341    PyObject *to, Py_ssize_t to_start,
1342    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1343{
1344    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349                         PyObject *from, Py_ssize_t from_start,
1350                         Py_ssize_t how_many)
1351{
1352    int err;
1353
1354    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355        PyErr_BadInternalCall();
1356        return -1;
1357    }
1358
1359    if (PyUnicode_READY(from) == -1)
1360        return -1;
1361    if (PyUnicode_READY(to) == -1)
1362        return -1;
1363
1364    if (from_start < 0) {
1365        PyErr_SetString(PyExc_IndexError, "string index out of range");
1366        return -1;
1367    }
1368    if (to_start < 0) {
1369        PyErr_SetString(PyExc_IndexError, "string index out of range");
1370        return -1;
1371    }
1372    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374        PyErr_Format(PyExc_SystemError,
1375                     "Cannot write %zi characters at %zi "
1376                     "in a string of %zi characters",
1377                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1378        return -1;
1379    }
1380
1381    if (how_many == 0)
1382        return 0;
1383
1384    if (unicode_check_modifiable(to))
1385        return -1;
1386
1387    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388    if (err) {
1389        PyErr_Format(PyExc_SystemError,
1390                     "Cannot copy %s characters "
1391                     "into a string of %s characters",
1392                     unicode_kind_name(from),
1393                     unicode_kind_name(to));
1394        return -1;
1395    }
1396    return how_many;
1397}
1398
1399/* Find the maximum code point and count the number of surrogate pairs so a
1400   correct string length can be computed before converting a string to UCS4.
1401   This function counts single surrogates as a character and not as a pair.
1402
1403   Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1407{
1408    const wchar_t *iter;
1409    Py_UCS4 ch;
1410
1411    assert(num_surrogates != NULL && maxchar != NULL);
1412    *num_surrogates = 0;
1413    *maxchar = 0;
1414
1415    for (iter = begin; iter < end; ) {
1416#if SIZEOF_WCHAR_T == 2
1417        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418            && (iter+1) < end
1419            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420        {
1421            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422            ++(*num_surrogates);
1423            iter += 2;
1424        }
1425        else
1426#endif
1427        {
1428            ch = *iter;
1429            iter++;
1430        }
1431        if (ch > *maxchar) {
1432            *maxchar = ch;
1433            if (*maxchar > MAX_UNICODE) {
1434                PyErr_Format(PyExc_ValueError,
1435                             "character U+%x is not in range [U+0000; U+10ffff]",
1436                             ch);
1437                return -1;
1438            }
1439        }
1440    }
1441    return 0;
1442}
1443
1444int
1445_PyUnicode_Ready(PyObject *unicode)
1446{
1447    wchar_t *end;
1448    Py_UCS4 maxchar = 0;
1449    Py_ssize_t num_surrogates;
1450#if SIZEOF_WCHAR_T == 2
1451    Py_ssize_t length_wo_surrogates;
1452#endif
1453
1454    /* _PyUnicode_Ready() is only intended for old-style API usage where
1455       strings were created using _PyObject_New() and where no canonical
1456       representation (the str field) has been set yet aka strings
1457       which are not yet ready. */
1458    assert(_PyUnicode_CHECK(unicode));
1459    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1460    assert(_PyUnicode_WSTR(unicode) != NULL);
1461    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1462    assert(_PyUnicode_UTF8(unicode) == NULL);
1463    /* Actually, it should neither be interned nor be anything else: */
1464    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1465
1466    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1467    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1468                                &maxchar, &num_surrogates) == -1)
1469        return -1;
1470
1471    if (maxchar < 256) {
1472        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1473        if (!_PyUnicode_DATA_ANY(unicode)) {
1474            PyErr_NoMemory();
1475            return -1;
1476        }
1477        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1478                                _PyUnicode_WSTR(unicode), end,
1479                                PyUnicode_1BYTE_DATA(unicode));
1480        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1481        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1482        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1483        if (maxchar < 128) {
1484            _PyUnicode_STATE(unicode).ascii = 1;
1485            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1486            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1487        }
1488        else {
1489            _PyUnicode_STATE(unicode).ascii = 0;
1490            _PyUnicode_UTF8(unicode) = NULL;
1491            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1492        }
1493        PyObject_FREE(_PyUnicode_WSTR(unicode));
1494        _PyUnicode_WSTR(unicode) = NULL;
1495        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496    }
1497    /* In this case we might have to convert down from 4-byte native
1498       wchar_t to 2-byte unicode. */
1499    else if (maxchar < 65536) {
1500        assert(num_surrogates == 0 &&
1501               "FindMaxCharAndNumSurrogatePairs() messed up");
1502
1503#if SIZEOF_WCHAR_T == 2
1504        /* We can share representations and are done. */
1505        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1506        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1507        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1508        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1509        _PyUnicode_UTF8(unicode) = NULL;
1510        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1511#else
1512        /* sizeof(wchar_t) == 4 */
1513        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1514            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1515        if (!_PyUnicode_DATA_ANY(unicode)) {
1516            PyErr_NoMemory();
1517            return -1;
1518        }
1519        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1520                                _PyUnicode_WSTR(unicode), end,
1521                                PyUnicode_2BYTE_DATA(unicode));
1522        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1523        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1524        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1525        _PyUnicode_UTF8(unicode) = NULL;
1526        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1527        PyObject_FREE(_PyUnicode_WSTR(unicode));
1528        _PyUnicode_WSTR(unicode) = NULL;
1529        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1530#endif
1531    }
1532    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1533    else {
1534#if SIZEOF_WCHAR_T == 2
1535        /* in case the native representation is 2-bytes, we need to allocate a
1536           new normalized 4-byte version. */
1537        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1538        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1539        if (!_PyUnicode_DATA_ANY(unicode)) {
1540            PyErr_NoMemory();
1541            return -1;
1542        }
1543        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1544        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1545        _PyUnicode_UTF8(unicode) = NULL;
1546        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1547        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1548        _PyUnicode_STATE(unicode).ready = 1;
1549        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1550        PyObject_FREE(_PyUnicode_WSTR(unicode));
1551        _PyUnicode_WSTR(unicode) = NULL;
1552        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1553#else
1554        assert(num_surrogates == 0);
1555
1556        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1557        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1558        _PyUnicode_UTF8(unicode) = NULL;
1559        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1560        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1561#endif
1562        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1563    }
1564    _PyUnicode_STATE(unicode).ready = 1;
1565    assert(_PyUnicode_CheckConsistency(unicode, 1));
1566    return 0;
1567}
1568
1569static void
1570unicode_dealloc(PyObject *unicode)
1571{
1572    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1573    case SSTATE_NOT_INTERNED:
1574        break;
1575
1576    case SSTATE_INTERNED_MORTAL:
1577        /* revive dead object temporarily for DelItem */
1578        Py_REFCNT(unicode) = 3;
1579        if (PyDict_DelItem(interned, unicode) != 0)
1580            Py_FatalError(
1581                "deletion of interned string failed");
1582        break;
1583
1584    case SSTATE_INTERNED_IMMORTAL:
1585        Py_FatalError("Immortal interned string died.");
1586
1587    default:
1588        Py_FatalError("Inconsistent interned string state.");
1589    }
1590
1591    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1592        PyObject_DEL(_PyUnicode_WSTR(unicode));
1593    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1594        PyObject_DEL(_PyUnicode_UTF8(unicode));
1595    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1597
1598    Py_TYPE(unicode)->tp_free(unicode);
1599}
1600
1601#ifdef Py_DEBUG
1602static int
1603unicode_is_singleton(PyObject *unicode)
1604{
1605    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1606    if (unicode == unicode_empty)
1607        return 1;
1608    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1609    {
1610        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1611        if (ch < 256 && unicode_latin1[ch] == unicode)
1612            return 1;
1613    }
1614    return 0;
1615}
1616#endif
1617
1618static int
1619unicode_modifiable(PyObject *unicode)
1620{
1621    assert(_PyUnicode_CHECK(unicode));
1622    if (Py_REFCNT(unicode) != 1)
1623        return 0;
1624    if (_PyUnicode_HASH(unicode) != -1)
1625        return 0;
1626    if (PyUnicode_CHECK_INTERNED(unicode))
1627        return 0;
1628    if (!PyUnicode_CheckExact(unicode))
1629        return 0;
1630#ifdef Py_DEBUG
1631    /* singleton refcount is greater than 1 */
1632    assert(!unicode_is_singleton(unicode));
1633#endif
1634    return 1;
1635}
1636
1637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640    PyObject *unicode;
1641    Py_ssize_t old_length;
1642
1643    assert(p_unicode != NULL);
1644    unicode = *p_unicode;
1645
1646    assert(unicode != NULL);
1647    assert(PyUnicode_Check(unicode));
1648    assert(0 <= length);
1649
1650    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1651        old_length = PyUnicode_WSTR_LENGTH(unicode);
1652    else
1653        old_length = PyUnicode_GET_LENGTH(unicode);
1654    if (old_length == length)
1655        return 0;
1656
1657    if (length == 0) {
1658        _Py_INCREF_UNICODE_EMPTY();
1659        if (!unicode_empty)
1660            return -1;
1661        Py_DECREF(*p_unicode);
1662        *p_unicode = unicode_empty;
1663        return 0;
1664    }
1665
1666    if (!unicode_modifiable(unicode)) {
1667        PyObject *copy = resize_copy(unicode, length);
1668        if (copy == NULL)
1669            return -1;
1670        Py_DECREF(*p_unicode);
1671        *p_unicode = copy;
1672        return 0;
1673    }
1674
1675    if (PyUnicode_IS_COMPACT(unicode)) {
1676        PyObject *new_unicode = resize_compact(unicode, length);
1677        if (new_unicode == NULL)
1678            return -1;
1679        *p_unicode = new_unicode;
1680        return 0;
1681    }
1682    return resize_inplace(unicode, length);
1683}
1684
1685int
1686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1687{
1688    PyObject *unicode;
1689    if (p_unicode == NULL) {
1690        PyErr_BadInternalCall();
1691        return -1;
1692    }
1693    unicode = *p_unicode;
1694    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1695    {
1696        PyErr_BadInternalCall();
1697        return -1;
1698    }
1699    return unicode_resize(p_unicode, length);
1700}
1701
1702/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1703
1704   WARNING: The function doesn't copy the terminating null character and
1705   doesn't check the maximum character (may write a latin1 character in an
1706   ASCII string). */
1707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709                   const char *str, Py_ssize_t len)
1710{
1711    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712    void *data = PyUnicode_DATA(unicode);
1713    const char *end = str + len;
1714
1715    switch (kind) {
1716    case PyUnicode_1BYTE_KIND: {
1717        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1718#ifdef Py_DEBUG
1719        if (PyUnicode_IS_ASCII(unicode)) {
1720            Py_UCS4 maxchar = ucs1lib_find_max_char(
1721                (const Py_UCS1*)str,
1722                (const Py_UCS1*)str + len);
1723            assert(maxchar < 128);
1724        }
1725#endif
1726        memcpy((char *) data + index, str, len);
1727        break;
1728    }
1729    case PyUnicode_2BYTE_KIND: {
1730        Py_UCS2 *start = (Py_UCS2 *)data + index;
1731        Py_UCS2 *ucs2 = start;
1732        assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
1734        for (; str < end; ++ucs2, ++str)
1735            *ucs2 = (Py_UCS2)*str;
1736
1737        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1738        break;
1739    }
1740    default: {
1741        Py_UCS4 *start = (Py_UCS4 *)data + index;
1742        Py_UCS4 *ucs4 = start;
1743        assert(kind == PyUnicode_4BYTE_KIND);
1744        assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
1746        for (; str < end; ++ucs4, ++str)
1747            *ucs4 = (Py_UCS4)*str;
1748
1749        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1750    }
1751    }
1752}
1753
1754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
1757    PyObject *unicode = unicode_latin1[ch];
1758    if (!unicode) {
1759        unicode = PyUnicode_New(1, ch);
1760        if (!unicode)
1761            return NULL;
1762        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1763        assert(_PyUnicode_CheckConsistency(unicode, 1));
1764        unicode_latin1[ch] = unicode;
1765    }
1766    Py_INCREF(unicode);
1767    return unicode;
1768}
1769
1770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773    PyObject *unicode;
1774
1775    assert(ch <= MAX_UNICODE);
1776
1777    if (ch < 256)
1778        return get_latin1_char(ch);
1779
1780    unicode = PyUnicode_New(1, ch);
1781    if (unicode == NULL)
1782        return NULL;
1783    switch (PyUnicode_KIND(unicode)) {
1784    case PyUnicode_1BYTE_KIND:
1785        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786        break;
1787    case PyUnicode_2BYTE_KIND:
1788        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789        break;
1790    default:
1791        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793    }
1794    assert(_PyUnicode_CheckConsistency(unicode, 1));
1795    return unicode;
1796}
1797
1798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1800{
1801    PyObject *unicode;
1802    Py_UCS4 maxchar = 0;
1803    Py_ssize_t num_surrogates;
1804
1805    if (u == NULL)
1806        return (PyObject*)_PyUnicode_New(size);
1807
1808    /* If the Unicode data is known at construction time, we can apply
1809       some optimizations which share commonly used objects. */
1810
1811    /* Optimization for empty strings */
1812    if (size == 0)
1813        _Py_RETURN_UNICODE_EMPTY();
1814
1815    /* Single character Unicode objects in the Latin-1 range are
1816       shared when using this constructor */
1817    if (size == 1 && (Py_UCS4)*u < 256)
1818        return get_latin1_char((unsigned char)*u);
1819
1820    /* If not empty and not single character, copy the Unicode data
1821       into the new object */
1822    if (find_maxchar_surrogates(u, u + size,
1823                                &maxchar, &num_surrogates) == -1)
1824        return NULL;
1825
1826    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1827    if (!unicode)
1828        return NULL;
1829
1830    switch (PyUnicode_KIND(unicode)) {
1831    case PyUnicode_1BYTE_KIND:
1832        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1833                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834        break;
1835    case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
1839        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1840                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842        break;
1843    case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845        /* This is the only case which has to process surrogates, thus
1846           a simple copy loop is not enough and we need a function. */
1847        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1848#else
1849        assert(num_surrogates == 0);
1850        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852        break;
1853    default:
1854        assert(0 && "Impossible state");
1855    }
1856
1857    return unicode_result(unicode);
1858}
1859
1860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1862{
1863    if (size < 0) {
1864        PyErr_SetString(PyExc_SystemError,
1865                        "Negative size passed to PyUnicode_FromStringAndSize");
1866        return NULL;
1867    }
1868    if (u != NULL)
1869        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870    else
1871        return (PyObject *)_PyUnicode_New(size);
1872}
1873
1874PyObject *
1875PyUnicode_FromString(const char *u)
1876{
1877    size_t size = strlen(u);
1878    if (size > PY_SSIZE_T_MAX) {
1879        PyErr_SetString(PyExc_OverflowError, "input too long");
1880        return NULL;
1881    }
1882    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1883}
1884
1885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888    if (!id->object) {
1889        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890                                                  strlen(id->string),
1891                                                  NULL, NULL);
1892        if (!id->object)
1893            return NULL;
1894        PyUnicode_InternInPlace(&id->object);
1895        assert(!id->next);
1896        id->next = static_strings;
1897        static_strings = id;
1898    }
1899    return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
1905    _Py_Identifier *tmp, *s = static_strings;
1906    while (s) {
1907        Py_CLEAR(s->object);
1908        tmp = s->next;
1909        s->next = NULL;
1910        s = tmp;
1911    }
1912    static_strings = NULL;
1913}
1914
1915/* Internal function, doesn't check maximum character */
1916
1917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1919{
1920    const unsigned char *s = (const unsigned char *)buffer;
1921    PyObject *unicode;
1922    if (size == 1) {
1923#ifdef Py_DEBUG
1924        assert((unsigned char)s[0] < 128);
1925#endif
1926        return get_latin1_char(s[0]);
1927    }
1928    unicode = PyUnicode_New(size, 127);
1929    if (!unicode)
1930        return NULL;
1931    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932    assert(_PyUnicode_CheckConsistency(unicode, 1));
1933    return unicode;
1934}
1935
1936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
1939    switch (kind) {
1940    case PyUnicode_1BYTE_KIND:
1941        return 0x80;
1942    case PyUnicode_2BYTE_KIND:
1943        return 0x100;
1944    case PyUnicode_4BYTE_KIND:
1945        return 0x10000;
1946    default:
1947        assert(0 && "invalid kind");
1948        return MAX_UNICODE;
1949    }
1950}
1951
1952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955    if (maxchar <= 127)
1956        return 127;
1957    else if (maxchar <= 255)
1958        return 255;
1959    else if (maxchar <= 65535)
1960        return 65535;
1961    else
1962        return MAX_UNICODE;
1963}
1964
1965static PyObject*
1966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1967{
1968    PyObject *res;
1969    unsigned char max_char;
1970
1971    if (size == 0)
1972        _Py_RETURN_UNICODE_EMPTY();
1973    assert(size > 0);
1974    if (size == 1)
1975        return get_latin1_char(u[0]);
1976
1977    max_char = ucs1lib_find_max_char(u, u + size);
1978    res = PyUnicode_New(size, max_char);
1979    if (!res)
1980        return NULL;
1981    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1982    assert(_PyUnicode_CheckConsistency(res, 1));
1983    return res;
1984}
1985
1986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1988{
1989    PyObject *res;
1990    Py_UCS2 max_char;
1991
1992    if (size == 0)
1993        _Py_RETURN_UNICODE_EMPTY();
1994    assert(size > 0);
1995    if (size == 1)
1996        return unicode_char(u[0]);
1997
1998    max_char = ucs2lib_find_max_char(u, u + size);
1999    res = PyUnicode_New(size, max_char);
2000    if (!res)
2001        return NULL;
2002    if (max_char >= 256)
2003        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2004    else {
2005        _PyUnicode_CONVERT_BYTES(
2006            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007    }
2008    assert(_PyUnicode_CheckConsistency(res, 1));
2009    return res;
2010}
2011
2012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2014{
2015    PyObject *res;
2016    Py_UCS4 max_char;
2017
2018    if (size == 0)
2019        _Py_RETURN_UNICODE_EMPTY();
2020    assert(size > 0);
2021    if (size == 1)
2022        return unicode_char(u[0]);
2023
2024    max_char = ucs4lib_find_max_char(u, u + size);
2025    res = PyUnicode_New(size, max_char);
2026    if (!res)
2027        return NULL;
2028    if (max_char < 256)
2029        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030                                 PyUnicode_1BYTE_DATA(res));
2031    else if (max_char < 0x10000)
2032        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033                                 PyUnicode_2BYTE_DATA(res));
2034    else
2035        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2036    assert(_PyUnicode_CheckConsistency(res, 1));
2037    return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
2043    if (size < 0) {
2044        PyErr_SetString(PyExc_ValueError, "size must be positive");
2045        return NULL;
2046    }
2047    switch (kind) {
2048    case PyUnicode_1BYTE_KIND:
2049        return _PyUnicode_FromUCS1(buffer, size);
2050    case PyUnicode_2BYTE_KIND:
2051        return _PyUnicode_FromUCS2(buffer, size);
2052    case PyUnicode_4BYTE_KIND:
2053        return _PyUnicode_FromUCS4(buffer, size);
2054    default:
2055        PyErr_SetString(PyExc_SystemError, "invalid kind");
2056        return NULL;
2057    }
2058}
2059
2060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063    enum PyUnicode_Kind kind;
2064    void *startptr, *endptr;
2065
2066    assert(PyUnicode_IS_READY(unicode));
2067    assert(0 <= start);
2068    assert(end <= PyUnicode_GET_LENGTH(unicode));
2069    assert(start <= end);
2070
2071    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072        return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074    if (start == end)
2075        return 127;
2076
2077    if (PyUnicode_IS_ASCII(unicode))
2078        return 127;
2079
2080    kind = PyUnicode_KIND(unicode);
2081    startptr = PyUnicode_DATA(unicode);
2082    endptr = (char *)startptr + end * kind;
2083    startptr = (char *)startptr + start * kind;
2084    switch(kind) {
2085    case PyUnicode_1BYTE_KIND:
2086        return ucs1lib_find_max_char(startptr, endptr);
2087    case PyUnicode_2BYTE_KIND:
2088        return ucs2lib_find_max_char(startptr, endptr);
2089    case PyUnicode_4BYTE_KIND:
2090        return ucs4lib_find_max_char(startptr, endptr);
2091    default:
2092        assert(0);
2093        return 0;
2094    }
2095}
2096
2097/* Ensure that a string uses the most efficient storage, if it is not the
2098   case: create a new string with of the right kind. Write NULL into *p_unicode
2099   on error. */
2100static void
2101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103    PyObject *unicode, *copy;
2104    Py_UCS4 max_char;
2105    Py_ssize_t len;
2106    unsigned int kind;
2107
2108    assert(p_unicode != NULL);
2109    unicode = *p_unicode;
2110    assert(PyUnicode_IS_READY(unicode));
2111    if (PyUnicode_IS_ASCII(unicode))
2112        return;
2113
2114    len = PyUnicode_GET_LENGTH(unicode);
2115    kind = PyUnicode_KIND(unicode);
2116    if (kind == PyUnicode_1BYTE_KIND) {
2117        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2118        max_char = ucs1lib_find_max_char(u, u + len);
2119        if (max_char >= 128)
2120            return;
2121    }
2122    else if (kind == PyUnicode_2BYTE_KIND) {
2123        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2124        max_char = ucs2lib_find_max_char(u, u + len);
2125        if (max_char >= 256)
2126            return;
2127    }
2128    else {
2129        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2130        assert(kind == PyUnicode_4BYTE_KIND);
2131        max_char = ucs4lib_find_max_char(u, u + len);
2132        if (max_char >= 0x10000)
2133            return;
2134    }
2135    copy = PyUnicode_New(len, max_char);
2136    if (copy != NULL)
2137        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2138    Py_DECREF(unicode);
2139    *p_unicode = copy;
2140}
2141
2142PyObject*
2143_PyUnicode_Copy(PyObject *unicode)
2144{
2145    Py_ssize_t length;
2146    PyObject *copy;
2147
2148    if (!PyUnicode_Check(unicode)) {
2149        PyErr_BadInternalCall();
2150        return NULL;
2151    }
2152    if (PyUnicode_READY(unicode) == -1)
2153        return NULL;
2154
2155    length = PyUnicode_GET_LENGTH(unicode);
2156    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2157    if (!copy)
2158        return NULL;
2159    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
2161    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162              length * PyUnicode_KIND(unicode));
2163    assert(_PyUnicode_CheckConsistency(copy, 1));
2164    return copy;
2165}
2166
2167
2168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169   character. Return NULL on error. */
2170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
2174    Py_ssize_t len;
2175    void *result;
2176    unsigned int skind;
2177
2178    if (PyUnicode_READY(s) == -1)
2179        return NULL;
2180
2181    len = PyUnicode_GET_LENGTH(s);
2182    skind = PyUnicode_KIND(s);
2183    if (skind >= kind) {
2184        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2185        return NULL;
2186    }
2187    switch (kind) {
2188    case PyUnicode_2BYTE_KIND:
2189        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190        if (!result)
2191            return PyErr_NoMemory();
2192        assert(skind == PyUnicode_1BYTE_KIND);
2193        _PyUnicode_CONVERT_BYTES(
2194            Py_UCS1, Py_UCS2,
2195            PyUnicode_1BYTE_DATA(s),
2196            PyUnicode_1BYTE_DATA(s) + len,
2197            result);
2198        return result;
2199    case PyUnicode_4BYTE_KIND:
2200        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201        if (!result)
2202            return PyErr_NoMemory();
2203        if (skind == PyUnicode_2BYTE_KIND) {
2204            _PyUnicode_CONVERT_BYTES(
2205                Py_UCS2, Py_UCS4,
2206                PyUnicode_2BYTE_DATA(s),
2207                PyUnicode_2BYTE_DATA(s) + len,
2208                result);
2209        }
2210        else {
2211            assert(skind == PyUnicode_1BYTE_KIND);
2212            _PyUnicode_CONVERT_BYTES(
2213                Py_UCS1, Py_UCS4,
2214                PyUnicode_1BYTE_DATA(s),
2215                PyUnicode_1BYTE_DATA(s) + len,
2216                result);
2217        }
2218        return result;
2219    default:
2220        break;
2221    }
2222    PyErr_SetString(PyExc_SystemError, "invalid kind");
2223    return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228        int copy_null)
2229{
2230    int kind;
2231    void *data;
2232    Py_ssize_t len, targetlen;
2233    if (PyUnicode_READY(string) == -1)
2234        return NULL;
2235    kind = PyUnicode_KIND(string);
2236    data = PyUnicode_DATA(string);
2237    len = PyUnicode_GET_LENGTH(string);
2238    targetlen = len;
2239    if (copy_null)
2240        targetlen++;
2241    if (!target) {
2242        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2243            PyErr_NoMemory();
2244            return NULL;
2245        }
2246        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247        if (!target) {
2248            PyErr_NoMemory();
2249            return NULL;
2250        }
2251    }
2252    else {
2253        if (targetsize < targetlen) {
2254            PyErr_Format(PyExc_SystemError,
2255                         "string is longer than the buffer");
2256            if (copy_null && 0 < targetsize)
2257                target[0] = 0;
2258            return NULL;
2259        }
2260    }
2261    if (kind == PyUnicode_1BYTE_KIND) {
2262        Py_UCS1 *start = (Py_UCS1 *) data;
2263        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2264    }
2265    else if (kind == PyUnicode_2BYTE_KIND) {
2266        Py_UCS2 *start = (Py_UCS2 *) data;
2267        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268    }
2269    else {
2270        assert(kind == PyUnicode_4BYTE_KIND);
2271        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2272    }
2273    if (copy_null)
2274        target[len] = 0;
2275    return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280                 int copy_null)
2281{
2282    if (target == NULL || targetsize < 0) {
2283        PyErr_BadInternalCall();
2284        return NULL;
2285    }
2286    return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292    return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
2296
2297PyObject *
2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2299{
2300    if (w == NULL) {
2301        if (size == 0)
2302            _Py_RETURN_UNICODE_EMPTY();
2303        PyErr_BadInternalCall();
2304        return NULL;
2305    }
2306
2307    if (size == -1) {
2308        size = wcslen(w);
2309    }
2310
2311    return PyUnicode_FromUnicode(w, size);
2312}
2313
2314#endif /* HAVE_WCHAR_H */
2315
2316static void
2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2318        char c)
2319{
2320    *fmt++ = '%';
2321    if (longflag)
2322        *fmt++ = 'l';
2323    else if (longlongflag) {
2324        /* longlongflag should only ever be nonzero on machines with
2325           HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327        char *f = PY_FORMAT_LONG_LONG;
2328        while (*f)
2329            *fmt++ = *f++;
2330#else
2331        /* we shouldn't ever get here */
2332        assert(0);
2333        *fmt++ = 'l';
2334#endif
2335    }
2336    else if (size_tflag) {
2337        char *f = PY_FORMAT_SIZE_T;
2338        while (*f)
2339            *fmt++ = *f++;
2340    }
2341    *fmt++ = c;
2342    *fmt = '\0';
2343}
2344
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   Used to size the sprintf() scratch buffers in
   unicode_fromformat_arg(). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2349
2350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352                             Py_ssize_t width, Py_ssize_t precision)
2353{
2354    Py_ssize_t length, fill, arglen;
2355    Py_UCS4 maxchar;
2356
2357    if (PyUnicode_READY(str) == -1)
2358        return -1;
2359
2360    length = PyUnicode_GET_LENGTH(str);
2361    if ((precision == -1 || precision >= length)
2362        && width <= length)
2363        return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365    if (precision != -1)
2366        length = Py_MIN(precision, length);
2367
2368    arglen = Py_MAX(length, width);
2369    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371    else
2372        maxchar = writer->maxchar;
2373
2374    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375        return -1;
2376
2377    if (width > length) {
2378        fill = width - length;
2379        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380            return -1;
2381        writer->pos += fill;
2382    }
2383
2384    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385                                  str, 0, length);
2386    writer->pos += length;
2387    return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392                              Py_ssize_t width, Py_ssize_t precision)
2393{
2394    /* UTF-8 */
2395    Py_ssize_t length;
2396    PyObject *unicode;
2397    int res;
2398
2399    length = strlen(str);
2400    if (precision != -1)
2401        length = Py_MIN(length, precision);
2402    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403    if (unicode == NULL)
2404        return -1;
2405
2406    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407    Py_DECREF(unicode);
2408    return res;
2409}
2410
/* Format one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' that starts the directive.  The optional '0' flag,
   width, precision and length modifier are parsed, the matching
   argument(s) are fetched from *vargs, and the formatted text is appended
   to writer.

   Return a pointer to the first character after the directive, or NULL on
   error.  An unknown conversion character is not an error: the rest of the
   format string, starting at the '%', is copied to the output and a
   pointer to its end is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* Remember where the directive starts: the default case below copies
       the format string from this position. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    /* -1 means that no width (respectively no precision) was given. */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Refuse the next digit if width*10 + digit would exceed
               PY_SSIZE_T_MAX. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Same overflow guard as for the width. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %lld and %llu. */
    /* A length modifier is only recognised when it is directly followed by
       'd', 'u' or 'i'; f is then advanced to that conversion character. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The format string ends right after this directive: turn off
       overallocation in the writer. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single character given as an int ordinal. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first formatted by sprintf() without width or
           precision; both are applied by hand further down.  The type
           fetched from *vargs follows the length modifier. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* 'x' always fetches an int. */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is the number of characters written for
           the number itself, including any '0' added in front of it. */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the width, with '0' if the '0' flag was given and with
           spaces otherwise. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with '0'. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* No prefix at all: shift the text (and its null character)
               right by two and insert "0x". */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A string object, written as is (subject to width/precision). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments are always consumed: a string object and a C
           string that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII(obj). */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal '%'; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2720
2721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
2724    va_list vargs2;
2725    const char *f;
2726    _PyUnicodeWriter writer;
2727
2728    _PyUnicodeWriter_Init(&writer);
2729    writer.min_length = strlen(format) + 100;
2730    writer.overallocate = 1;
2731
2732    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733       Copy it to be able to pass a reference to a subfunction. */
2734    Py_VA_COPY(vargs2, vargs);
2735
2736    for (f = format; *f; ) {
2737        if (*f == '%') {
2738            f = unicode_fromformat_arg(&writer, f, &vargs2);
2739            if (f == NULL)
2740                goto fail;
2741        }
2742        else {
2743            const char *p;
2744            Py_ssize_t len;
2745
2746            p = f;
2747            do
2748            {
2749                if ((unsigned char)*p > 127) {
2750                    PyErr_Format(PyExc_ValueError,
2751                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752                        "string, got a non-ASCII byte: 0x%02x",
2753                        (unsigned char)*p);
2754                    return NULL;
2755                }
2756                p++;
2757            }
2758            while (*p != '\0' && *p != '%');
2759            len = p - f;
2760
2761            if (*p == '\0')
2762                writer.overallocate = 0;
2763
2764            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2765                goto fail;
2766
2767            f = p;
2768        }
2769    }
2770    return _PyUnicodeWriter_Finish(&writer);
2771
2772  fail:
2773    _PyUnicodeWriter_Dealloc(&writer);
2774    return NULL;
2775}
2776
2777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
2780    PyObject* ret;
2781    va_list vargs;
2782
2783#ifdef HAVE_STDARG_PROTOTYPES
2784    va_start(vargs, format);
2785#else
2786    va_start(vargs);
2787#endif
2788    ret = PyUnicode_FromFormatV(format, vargs);
2789    va_end(vargs);
2790    return ret;
2791}
2792
2793#ifdef HAVE_WCHAR_H
2794
2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796   convert a Unicode object to a wide character string.
2797
2798   - If w is NULL: return the number of wide characters (including the null
2799     character) required to convert the unicode object. Ignore size argument.
2800
2801   - Otherwise: return the number of wide characters (excluding the null
2802     character) written into w. Write at most size wide characters (including
2803     the null character). */
2804static Py_ssize_t
2805unicode_aswidechar(PyObject *unicode,
2806                   wchar_t *w,
2807                   Py_ssize_t size)
2808{
2809    Py_ssize_t res;
2810    const wchar_t *wstr;
2811
2812    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2813    if (wstr == NULL)
2814        return -1;
2815
2816    if (w != NULL) {
2817        if (size > res)
2818            size = res + 1;
2819        else
2820            res = size;
2821        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2822        return res;
2823    }
2824    else
2825        return res + 1;
2826}
2827
2828Py_ssize_t
2829PyUnicode_AsWideChar(PyObject *unicode,
2830                     wchar_t *w,
2831                     Py_ssize_t size)
2832{
2833    if (unicode == NULL) {
2834        PyErr_BadInternalCall();
2835        return -1;
2836    }
2837    return unicode_aswidechar(unicode, w, size);
2838}
2839
2840wchar_t*
2841PyUnicode_AsWideCharString(PyObject *unicode,
2842                           Py_ssize_t *size)
2843{
2844    wchar_t* buffer;
2845    Py_ssize_t buflen;
2846
2847    if (unicode == NULL) {
2848        PyErr_BadInternalCall();
2849        return NULL;
2850    }
2851
2852    buflen = unicode_aswidechar(unicode, NULL, 0);
2853    if (buflen == -1)
2854        return NULL;
2855    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2856        PyErr_NoMemory();
2857        return NULL;
2858    }
2859
2860    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861    if (buffer == NULL) {
2862        PyErr_NoMemory();
2863        return NULL;
2864    }
2865    buflen = unicode_aswidechar(unicode, buffer, buflen);
2866    if (buflen == -1) {
2867        PyMem_FREE(buffer);
2868        return NULL;
2869    }
2870    if (size != NULL)
2871        *size = buflen;
2872    return buffer;
2873}
2874
2875#endif /* HAVE_WCHAR_H */
2876
2877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
2879{
2880    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2881        PyErr_SetString(PyExc_ValueError,
2882                        "chr() arg not in range(0x110000)");
2883        return NULL;
2884    }
2885
2886    return unicode_char((Py_UCS4)ordinal);
2887}
2888
2889PyObject *
2890PyUnicode_FromObject(PyObject *obj)
2891{
2892    /* XXX Perhaps we should make this API an alias of
2893       PyObject_Str() instead ?! */
2894    if (PyUnicode_CheckExact(obj)) {
2895        if (PyUnicode_READY(obj) == -1)
2896            return NULL;
2897        Py_INCREF(obj);
2898        return obj;
2899    }
2900    if (PyUnicode_Check(obj)) {
2901        /* For a Unicode subtype that's not a Unicode object,
2902           return a true Unicode object with the same data. */
2903        return _PyUnicode_Copy(obj);
2904    }
2905    PyErr_Format(PyExc_TypeError,
2906                 "Can't convert '%.100s' object to str implicitly",
2907                 Py_TYPE(obj)->tp_name);
2908    return NULL;
2909}
2910
2911PyObject *
2912PyUnicode_FromEncodedObject(PyObject *obj,
2913                            const char *encoding,
2914                            const char *errors)
2915{
2916    Py_buffer buffer;
2917    PyObject *v;
2918
2919    if (obj == NULL) {
2920        PyErr_BadInternalCall();
2921        return NULL;
2922    }
2923
2924    /* Decoding bytes objects is the most common case and should be fast */
2925    if (PyBytes_Check(obj)) {
2926        if (PyBytes_GET_SIZE(obj) == 0)
2927            _Py_RETURN_UNICODE_EMPTY();
2928        v = PyUnicode_Decode(
2929                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2930                encoding, errors);
2931        return v;
2932    }
2933
2934    if (PyUnicode_Check(obj)) {
2935        PyErr_SetString(PyExc_TypeError,
2936                        "decoding str is not supported");
2937        return NULL;
2938    }
2939
2940    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2941    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2942        PyErr_Format(PyExc_TypeError,
2943                     "coercing to str: need bytes, bytearray "
2944                     "or buffer-like object, %.80s found",
2945                     Py_TYPE(obj)->tp_name);
2946        return NULL;
2947    }
2948
2949    if (buffer.len == 0) {
2950        PyBuffer_Release(&buffer);
2951        _Py_RETURN_UNICODE_EMPTY();
2952    }
2953
2954    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2955    PyBuffer_Release(&buffer);
2956    return v;
2957}
2958
/* Normalize an encoding name so that the codec shortcuts can match it:
   characters for which Py_ISUPPER() is true are converted with
   Py_TOLOWER() and '_' is replaced with '-' (so e.g. "UTF_8" becomes
   "utf-8").  A NULL encoding is normalized to "utf-8".

   The result is stored in lower, a buffer of lower_len bytes, and is
   null-terminated on success.

   Return 1 on success, 0 on error: the normalized name does not fit in
   lower_len-1 characters, which includes the case lower_len == 0. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* With no room even for the terminator, lower_len - 1 below would wrap
       around, the bounds check in the loop would never trigger, and the
       function would write through lower. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2998
2999PyObject *
3000PyUnicode_Decode(const char *s,
3001                 Py_ssize_t size,
3002                 const char *encoding,
3003                 const char *errors)
3004{
3005    PyObject *buffer = NULL, *unicode;
3006    Py_buffer info;
3007    char lower[11];  /* Enough for any encoding shortcut */
3008
3009    /* Shortcuts for common default encodings */
3010    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3011        if ((strcmp(lower, "utf-8") == 0) ||
3012            (strcmp(lower, "utf8") == 0))
3013            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3014        else if ((strcmp(lower, "latin-1") == 0) ||
3015                 (strcmp(lower, "latin1") == 0) ||
3016                 (strcmp(lower, "iso-8859-1") == 0) ||
3017                 (strcmp(lower, "iso8859-1") == 0))
3018            return PyUnicode_DecodeLatin1(s, size, errors);
3019#ifdef HAVE_MBCS
3020        else if (strcmp(lower, "mbcs") == 0)
3021            return PyUnicode_DecodeMBCS(s, size, errors);
3022#endif
3023        else if (strcmp(lower, "ascii") == 0)
3024            return PyUnicode_DecodeASCII(s, size, errors);
3025        else if (strcmp(lower, "utf-16") == 0)
3026            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3027        else if (strcmp(lower, "utf-32") == 0)
3028            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3029    }
3030
3031    /* Decode via the codec registry */
3032    buffer = NULL;
3033    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3034        goto onError;
3035    buffer = PyMemoryView_FromBuffer(&info);
3036    if (buffer == NULL)
3037        goto onError;
3038    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3039    if (unicode == NULL)
3040        goto onError;
3041    if (!PyUnicode_Check(unicode)) {
3042        PyErr_Format(PyExc_TypeError,
3043                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3044                     "use codecs.decode() to decode to arbitrary types",
3045                     encoding,
3046                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3047        Py_DECREF(unicode);
3048        goto onError;
3049    }
3050    Py_DECREF(buffer);
3051    return unicode_result(unicode);
3052
3053  onError:
3054    Py_XDECREF(buffer);
3055    return NULL;
3056}
3057
3058PyObject *
3059PyUnicode_AsDecodedObject(PyObject *unicode,
3060                          const char *encoding,
3061                          const char *errors)
3062{
3063    PyObject *v;
3064
3065    if (!PyUnicode_Check(unicode)) {
3066        PyErr_BadArgument();
3067        goto onError;
3068    }
3069
3070    if (encoding == NULL)
3071        encoding = PyUnicode_GetDefaultEncoding();
3072
3073    /* Decode via the codec registry */
3074    v = PyCodec_Decode(unicode, encoding, errors);
3075    if (v == NULL)
3076        goto onError;
3077    return unicode_result(v);
3078
3079  onError:
3080    return NULL;
3081}
3082
3083PyObject *
3084PyUnicode_AsDecodedUnicode(PyObject *unicode,
3085                           const char *encoding,
3086                           const char *errors)
3087{
3088    PyObject *v;
3089
3090    if (!PyUnicode_Check(unicode)) {
3091        PyErr_BadArgument();
3092        goto onError;
3093    }
3094
3095    if (encoding == NULL)
3096        encoding = PyUnicode_GetDefaultEncoding();
3097
3098    /* Decode via the codec registry */
3099    v = PyCodec_Decode(unicode, encoding, errors);
3100    if (v == NULL)
3101        goto onError;
3102    if (!PyUnicode_Check(v)) {
3103        PyErr_Format(PyExc_TypeError,
3104                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3105                     "use codecs.decode() to decode to arbitrary types",
3106                     encoding,
3107                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3108        Py_DECREF(v);
3109        goto onError;
3110    }
3111    return unicode_result(v);
3112
3113  onError:
3114    return NULL;
3115}
3116
3117PyObject *
3118PyUnicode_Encode(const Py_UNICODE *s,
3119                 Py_ssize_t size,
3120                 const char *encoding,
3121                 const char *errors)
3122{
3123    PyObject *v, *unicode;
3124
3125    unicode = PyUnicode_FromUnicode(s, size);
3126    if (unicode == NULL)
3127        return NULL;
3128    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3129    Py_DECREF(unicode);
3130    return v;
3131}
3132
3133PyObject *
3134PyUnicode_AsEncodedObject(PyObject *unicode,
3135                          const char *encoding,
3136                          const char *errors)
3137{
3138    PyObject *v;
3139
3140    if (!PyUnicode_Check(unicode)) {
3141        PyErr_BadArgument();
3142        goto onError;
3143    }
3144
3145    if (encoding == NULL)
3146        encoding = PyUnicode_GetDefaultEncoding();
3147
3148    /* Encode via the codec registry */
3149    v = PyCodec_Encode(unicode, encoding, errors);
3150    if (v == NULL)
3151        goto onError;
3152    return v;
3153
3154  onError:
3155    return NULL;
3156}
3157
/* Find the first position in wstr that wcstombs() fails to convert.

   The string is fed to wcstombs() one unit at a time (a surrogate pair is
   kept together when wchar_t is 2 bytes wide).  The result is the index, in
   wchar_t units, of the first unit that fails.  If every unit converts, the
   result is 0, which callers cannot tell apart from a failure at index 0. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One unit to convert plus its terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        unit[0] = *cursor++;
        if (Py_UNICODE_IS_HIGH_SURROGATE(unit[0])
            && Py_UNICODE_IS_LOW_SURROGATE(*cursor))
        {
            unit[1] = *cursor++;
        }
        else {
            unit[1] = 0;
        }
#else
        unit[0] = *cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3204
3205static int
3206locale_error_handler(const char *errors, int *surrogateescape)
3207{
3208    if (errors == NULL) {
3209        *surrogateescape = 0;
3210        return 0;
3211    }
3212
3213    if (strcmp(errors, "strict") == 0) {
3214        *surrogateescape = 0;
3215        return 0;
3216    }
3217    if (strcmp(errors, "surrogateescape") == 0) {
3218        *surrogateescape = 1;
3219        return 0;
3220    }
3221    PyErr_Format(PyExc_ValueError,
3222                 "only 'strict' and 'surrogateescape' error handlers "
3223                 "are supported, not '%s'",
3224                 errors);
3225    return -1;
3226}
3227
3228PyObject *
3229PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3230{
3231    Py_ssize_t wlen, wlen2;
3232    wchar_t *wstr;
3233    PyObject *bytes = NULL;
3234    char *errmsg;
3235    PyObject *reason = NULL;
3236    PyObject *exc;
3237    size_t error_pos;
3238    int surrogateescape;
3239
3240    if (locale_error_handler(errors, &surrogateescape) < 0)
3241        return NULL;
3242
3243    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3244    if (wstr == NULL)
3245        return NULL;
3246
3247    wlen2 = wcslen(wstr);
3248    if (wlen2 != wlen) {
3249        PyMem_Free(wstr);
3250        PyErr_SetString(PyExc_ValueError, "embedded null character");
3251        return NULL;
3252    }
3253
3254    if (surrogateescape) {
3255        /* "surrogateescape" error handler */
3256        char *str;
3257
3258        str = Py_EncodeLocale(wstr, &error_pos);
3259        if (str == NULL) {
3260            if (error_pos == (size_t)-1) {
3261                PyErr_NoMemory();
3262                PyMem_Free(wstr);
3263                return NULL;
3264            }
3265            else {
3266                goto encode_error;
3267            }
3268        }
3269        PyMem_Free(wstr);
3270
3271        bytes = PyBytes_FromString(str);
3272        PyMem_Free(str);
3273    }
3274    else {
3275        /* strict mode */
3276        size_t len, len2;
3277
3278        len = wcstombs(NULL, wstr, 0);
3279        if (len == (size_t)-1) {
3280            error_pos = (size_t)-1;
3281            goto encode_error;
3282        }
3283
3284        bytes = PyBytes_FromStringAndSize(NULL, len);
3285        if (bytes == NULL) {
3286            PyMem_Free(wstr);
3287            return NULL;
3288        }
3289
3290        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3291        if (len2 == (size_t)-1 || len2 > len) {
3292            error_pos = (size_t)-1;
3293            goto encode_error;
3294        }
3295        PyMem_Free(wstr);
3296    }
3297    return bytes;
3298
3299encode_error:
3300    errmsg = strerror(errno);
3301    assert(errmsg != NULL);
3302
3303    if (error_pos == (size_t)-1)
3304        error_pos = wcstombs_errorpos(wstr);
3305
3306    PyMem_Free(wstr);
3307    Py_XDECREF(bytes);
3308
3309    if (errmsg != NULL) {
3310        size_t errlen;
3311        wstr = Py_DecodeLocale(errmsg, &errlen);
3312        if (wstr != NULL) {
3313            reason = PyUnicode_FromWideChar(wstr, errlen);
3314            PyMem_RawFree(wstr);
3315        } else
3316            errmsg = NULL;
3317    }
3318    if (errmsg == NULL)
3319        reason = PyUnicode_FromString(
3320            "wcstombs() encountered an unencodable "
3321            "wide character");
3322    if (reason == NULL)
3323        return NULL;
3324
3325    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3326                                "locale", unicode,
3327                                (Py_ssize_t)error_pos,
3328                                (Py_ssize_t)(error_pos+1),
3329                                reason);
3330    Py_DECREF(reason);
3331    if (exc != NULL) {
3332        PyCodec_StrictErrors(exc);
3333        Py_XDECREF(exc);
3334    }
3335    return NULL;
3336}
3337
3338PyObject *
3339PyUnicode_EncodeFSDefault(PyObject *unicode)
3340{
3341#ifdef HAVE_MBCS
3342    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3343#elif defined(__APPLE__)
3344    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3345#else
3346    PyInterpreterState *interp = PyThreadState_GET()->interp;
3347    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3348       cannot use it to encode and decode filenames before it is loaded. Load
3349       the Python codec requires to encode at least its own filename. Use the C
3350       version of the locale codec until the codec registry is initialized and
3351       the Python codec is loaded.
3352
3353       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3354       cannot only rely on it: check also interp->fscodec_initialized for
3355       subinterpreters. */
3356    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3357        return PyUnicode_AsEncodedString(unicode,
3358                                         Py_FileSystemDefaultEncoding,
3359                                         "surrogateescape");
3360    }
3361    else {
3362        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3363    }
3364#endif
3365}
3366
3367PyObject *
3368PyUnicode_AsEncodedString(PyObject *unicode,
3369                          const char *encoding,
3370                          const char *errors)
3371{
3372    PyObject *v;
3373    char lower[11];  /* Enough for any encoding shortcut */
3374
3375    if (!PyUnicode_Check(unicode)) {
3376        PyErr_BadArgument();
3377        return NULL;
3378    }
3379
3380    /* Shortcuts for common default encodings */
3381    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3382        if ((strcmp(lower, "utf-8") == 0) ||
3383            (strcmp(lower, "utf8") == 0))
3384        {
3385            if (errors == NULL || strcmp(errors, "strict") == 0)
3386                return _PyUnicode_AsUTF8String(unicode, NULL);
3387            else
3388                return _PyUnicode_AsUTF8String(unicode, errors);
3389        }
3390        else if ((strcmp(lower, "latin-1") == 0) ||
3391                 (strcmp(lower, "latin1") == 0) ||
3392                 (strcmp(lower, "iso-8859-1") == 0) ||
3393                 (strcmp(lower, "iso8859-1") == 0))
3394            return _PyUnicode_AsLatin1String(unicode, errors);
3395#ifdef HAVE_MBCS
3396        else if (strcmp(lower, "mbcs") == 0)
3397            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3398#endif
3399        else if (strcmp(lower, "ascii") == 0)
3400            return _PyUnicode_AsASCIIString(unicode, errors);
3401    }
3402
3403    /* Encode via the codec registry */
3404    v = _PyCodec_EncodeText(unicode, encoding, errors);
3405    if (v == NULL)
3406        return NULL;
3407
3408    /* The normal path */
3409    if (PyBytes_Check(v))
3410        return v;
3411
3412    /* If the codec returns a buffer, raise a warning and convert to bytes */
3413    if (PyByteArray_Check(v)) {
3414        int error;
3415        PyObject *b;
3416
3417        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3418            "encoder %s returned bytearray instead of bytes; "
3419            "use codecs.encode() to encode to arbitrary types",
3420            encoding);
3421        if (error) {
3422            Py_DECREF(v);
3423            return NULL;
3424        }
3425
3426        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3427        Py_DECREF(v);
3428        return b;
3429    }
3430
3431    PyErr_Format(PyExc_TypeError,
3432                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3433                 "use codecs.encode() to encode to arbitrary types",
3434                 encoding,
3435                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3436    Py_DECREF(v);
3437    return NULL;
3438}
3439
3440PyObject *
3441PyUnicode_AsEncodedUnicode(PyObject *unicode,
3442                           const char *encoding,
3443                           const char *errors)
3444{
3445    PyObject *v;
3446
3447    if (!PyUnicode_Check(unicode)) {
3448        PyErr_BadArgument();
3449        goto onError;
3450    }
3451
3452    if (encoding == NULL)
3453        encoding = PyUnicode_GetDefaultEncoding();
3454
3455    /* Encode via the codec registry */
3456    v = PyCodec_Encode(unicode, encoding, errors);
3457    if (v == NULL)
3458        goto onError;
3459    if (!PyUnicode_Check(v)) {
3460        PyErr_Format(PyExc_TypeError,
3461                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3462                     "use codecs.encode() to encode to arbitrary types",
3463                     encoding,
3464                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3465        Py_DECREF(v);
3466        goto onError;
3467    }
3468    return v;
3469
3470  onError:
3471    return NULL;
3472}
3473
/* Find the byte offset in str (len bytes) at which mbrtowc() reports a
   conversion error or an incomplete character.

   The result is 0 when no such position is found, when the end of the
   string is reached first, or when the build has no mbrtowc()
   (HAVE_MBRTOWC undefined); callers cannot tell these apart from a failure
   at offset 0. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3504
3505PyObject*
3506PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3507                              const char *errors)
3508{
3509    wchar_t smallbuf[256];
3510    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3511    wchar_t *wstr;
3512    size_t wlen, wlen2;
3513    PyObject *unicode;
3514    int surrogateescape;
3515    size_t error_pos;
3516    char *errmsg;
3517    PyObject *reason, *exc;
3518
3519    if (locale_error_handler(errors, &surrogateescape) < 0)
3520        return NULL;
3521
3522    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3523        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3524        return NULL;
3525    }
3526
3527    if (surrogateescape) {
3528        /* "surrogateescape" error handler */
3529        wstr = Py_DecodeLocale(str, &wlen);
3530        if (wstr == NULL) {
3531            if (wlen == (size_t)-1)
3532                PyErr_NoMemory();
3533            else
3534                PyErr_SetFromErrno(PyExc_OSError);
3535            return NULL;
3536        }
3537
3538        unicode = PyUnicode_FromWideChar(wstr, wlen);
3539        PyMem_RawFree(wstr);
3540    }
3541    else {
3542        /* strict mode */
3543#ifndef HAVE_BROKEN_MBSTOWCS
3544        wlen = mbstowcs(NULL, str, 0);
3545#else
3546        wlen = len;
3547#endif
3548        if (wlen == (size_t)-1)
3549            goto decode_error;
3550        if (wlen+1 <= smallbuf_len) {
3551            wstr = smallbuf;
3552        }
3553        else {
3554            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3555                return PyErr_NoMemory();
3556
3557            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3558            if (!wstr)
3559                return PyErr_NoMemory();
3560        }
3561
3562        wlen2 = mbstowcs(wstr, str, wlen+1);
3563        if (wlen2 == (size_t)-1) {
3564            if (wstr != smallbuf)
3565                PyMem_Free(wstr);
3566            goto decode_error;
3567        }
3568#ifdef HAVE_BROKEN_MBSTOWCS
3569        assert(wlen2 == wlen);
3570#endif
3571        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3572        if (wstr != smallbuf)
3573            PyMem_Free(wstr);
3574    }
3575    return unicode;
3576
3577decode_error:
3578    errmsg = strerror(errno);
3579    assert(errmsg != NULL);
3580
3581    error_pos = mbstowcs_errorpos(str, len);
3582    if (errmsg != NULL) {
3583        size_t errlen;
3584        wstr = Py_DecodeLocale(errmsg, &errlen);
3585        if (wstr != NULL) {
3586            reason = PyUnicode_FromWideChar(wstr, errlen);
3587            PyMem_RawFree(wstr);
3588        } else
3589            errmsg = NULL;
3590    }
3591    if (errmsg == NULL)
3592        reason = PyUnicode_FromString(
3593            "mbstowcs() encountered an invalid multibyte sequence");
3594    if (reason == NULL)
3595        return NULL;
3596
3597    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3598                                "locale", str, len,
3599                                (Py_ssize_t)error_pos,
3600                                (Py_ssize_t)(error_pos+1),
3601                                reason);
3602    Py_DECREF(reason);
3603    if (exc != NULL) {
3604        PyCodec_StrictErrors(exc);
3605        Py_XDECREF(exc);
3606    }
3607    return NULL;
3608}
3609
3610PyObject*
3611PyUnicode_DecodeLocale(const char *str, const char *errors)
3612{
3613    Py_ssize_t size = (Py_ssize_t)strlen(str);
3614    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3615}
3616
3617
3618PyObject*
3619PyUnicode_DecodeFSDefault(const char *s) {
3620    Py_ssize_t size = (Py_ssize_t)strlen(s);
3621    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3622}
3623
3624PyObject*
3625PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3626{
3627#ifdef HAVE_MBCS
3628    return PyUnicode_DecodeMBCS(s, size, NULL);
3629#elif defined(__APPLE__)
3630    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3631#else
3632    PyInterpreterState *interp = PyThreadState_GET()->interp;
3633    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3634       cannot use it to encode and decode filenames before it is loaded. Load
3635       the Python codec requires to encode at least its own filename. Use the C
3636       version of the locale codec until the codec registry is initialized and
3637       the Python codec is loaded.
3638
3639       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3640       cannot only rely on it: check also interp->fscodec_initialized for
3641       subinterpreters. */
3642    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3643        return PyUnicode_Decode(s, size,
3644                                Py_FileSystemDefaultEncoding,
3645                                "surrogateescape");
3646    }
3647    else {
3648        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3649    }
3650#endif
3651}
3652
3653
3654int
3655_PyUnicode_HasNULChars(PyObject* str)
3656{
3657    Py_ssize_t pos;
3658
3659    if (PyUnicode_READY(str) == -1)
3660        return -1;
3661    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3662                   PyUnicode_GET_LENGTH(str), '\0', 1);
3663    if (pos == -1)
3664        return 0;
3665    else
3666        return 1;
3667}
3668
3669int
3670PyUnicode_FSConverter(PyObject* arg, void* addr)
3671{
3672    PyObject *output = NULL;
3673    Py_ssize_t size;
3674    void *data;
3675    if (arg == NULL) {
3676        Py_DECREF(*(PyObject**)addr);
3677        return 1;
3678    }
3679    if (PyBytes_Check(arg)) {
3680        output = arg;
3681        Py_INCREF(output);
3682    }
3683    else {
3684        arg = PyUnicode_FromObject(arg);
3685        if (!arg)
3686            return 0;
3687        output = PyUnicode_EncodeFSDefault(arg);
3688        Py_DECREF(arg);
3689        if (!output)
3690            return 0;
3691        if (!PyBytes_Check(output)) {
3692            Py_DECREF(output);
3693            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3694            return 0;
3695        }
3696    }
3697    size = PyBytes_GET_SIZE(output);
3698    data = PyBytes_AS_STRING(output);
3699    if ((size_t)size != strlen(data)) {
3700        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3701        Py_DECREF(output);
3702        return 0;
3703    }
3704    *(PyObject**)addr = output;
3705    return Py_CLEANUP_SUPPORTED;
3706}
3707
3708
3709int
3710PyUnicode_FSDecoder(PyObject* arg, void* addr)
3711{
3712    PyObject *output = NULL;
3713    if (arg == NULL) {
3714        Py_DECREF(*(PyObject**)addr);
3715        return 1;
3716    }
3717    if (PyUnicode_Check(arg)) {
3718        if (PyUnicode_READY(arg) == -1)
3719            return 0;
3720        output = arg;
3721        Py_INCREF(output);
3722    }
3723    else {
3724        arg = PyBytes_FromObject(arg);
3725        if (!arg)
3726            return 0;
3727        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3728                                                  PyBytes_GET_SIZE(arg));
3729        Py_DECREF(arg);
3730        if (!output)
3731            return 0;
3732        if (!PyUnicode_Check(output)) {
3733            Py_DECREF(output);
3734            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3735            return 0;
3736        }
3737    }
3738    if (PyUnicode_READY(output) == -1) {
3739        Py_DECREF(output);
3740        return 0;
3741    }
3742    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3743                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3744        PyErr_SetString(PyExc_ValueError, "embedded null character");
3745        Py_DECREF(output);
3746        return 0;
3747    }
3748    *(PyObject**)addr = output;
3749    return Py_CLEANUP_SUPPORTED;
3750}
3751
3752
3753char*
3754PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3755{
3756    PyObject *bytes;
3757
3758    if (!PyUnicode_Check(unicode)) {
3759        PyErr_BadArgument();
3760        return NULL;
3761    }
3762    if (PyUnicode_READY(unicode) == -1)
3763        return NULL;
3764
3765    if (PyUnicode_UTF8(unicode) == NULL) {
3766        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3767        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3768        if (bytes == NULL)
3769            return NULL;
3770        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3771        if (_PyUnicode_UTF8(unicode) == NULL) {
3772            PyErr_NoMemory();
3773            Py_DECREF(bytes);
3774            return NULL;
3775        }
3776        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3777        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3778                  PyBytes_AS_STRING(bytes),
3779                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3780        Py_DECREF(bytes);
3781    }
3782
3783    if (psize)
3784        *psize = PyUnicode_UTF8_LENGTH(unicode);
3785    return PyUnicode_UTF8(unicode);
3786}
3787
3788char*
3789PyUnicode_AsUTF8(PyObject *unicode)
3790{
3791    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3792}
3793
3794Py_UNICODE *
3795PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3796{
3797    const unsigned char *one_byte;
3798#if SIZEOF_WCHAR_T == 4
3799    const Py_UCS2 *two_bytes;
3800#else
3801    const Py_UCS4 *four_bytes;
3802    const Py_UCS4 *ucs4_end;
3803    Py_ssize_t num_surrogates;
3804#endif
3805    wchar_t *w;
3806    wchar_t *wchar_end;
3807
3808    if (!PyUnicode_Check(unicode)) {
3809        PyErr_BadArgument();
3810        return NULL;
3811    }
3812    if (_PyUnicode_WSTR(unicode) == NULL) {
3813        /* Non-ASCII compact unicode object */
3814        assert(_PyUnicode_KIND(unicode) != 0);
3815        assert(PyUnicode_IS_READY(unicode));
3816
3817        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3818#if SIZEOF_WCHAR_T == 2
3819            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3820            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3821            num_surrogates = 0;
3822
3823            for (; four_bytes < ucs4_end; ++four_bytes) {
3824                if (*four_bytes > 0xFFFF)
3825                    ++num_surrogates;
3826            }
3827
3828            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3829                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3830            if (!_PyUnicode_WSTR(unicode)) {
3831                PyErr_NoMemory();
3832                return NULL;
3833            }
3834            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3835
3836            w = _PyUnicode_WSTR(unicode);
3837            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3838            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3839            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3840                if (*four_bytes > 0xFFFF) {
3841                    assert(*four_bytes <= MAX_UNICODE);
3842                    /* encode surrogate pair in this case */
3843                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3844                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3845                }
3846                else
3847                    *w = *four_bytes;
3848
3849                if (w > wchar_end) {
3850                    assert(0 && "Miscalculated string end");
3851                }
3852            }
3853            *w = 0;
3854#else
3855            /* sizeof(wchar_t) == 4 */
3856            Py_FatalError("Impossible unicode object state, wstr and str "
3857                          "should share memory already.");
3858            return NULL;
3859#endif
3860        }
3861        else {
3862            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3863                                                  (_PyUnicode_LENGTH(unicode) + 1));
3864            if (!_PyUnicode_WSTR(unicode)) {
3865                PyErr_NoMemory();
3866                return NULL;
3867            }
3868            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3869                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3870            w = _PyUnicode_WSTR(unicode);
3871            wchar_end = w + _PyUnicode_LENGTH(unicode);
3872
3873            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3874                one_byte = PyUnicode_1BYTE_DATA(unicode);
3875                for (; w < wchar_end; ++one_byte, ++w)
3876                    *w = *one_byte;
3877                /* null-terminate the wstr */
3878                *w = 0;
3879            }
3880            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3881#if SIZEOF_WCHAR_T == 4
3882                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3883                for (; w < wchar_end; ++two_bytes, ++w)
3884                    *w = *two_bytes;
3885                /* null-terminate the wstr */
3886                *w = 0;
3887#else
3888                /* sizeof(wchar_t) == 2 */
3889                PyObject_FREE(_PyUnicode_WSTR(unicode));
3890                _PyUnicode_WSTR(unicode) = NULL;
3891                Py_FatalError("Impossible unicode object state, wstr "
3892                              "and str should share memory already.");
3893                return NULL;
3894#endif
3895            }
3896            else {
3897                assert(0 && "This should never happen.");
3898            }
3899        }
3900    }
3901    if (size != NULL)
3902        *size = PyUnicode_WSTR_LENGTH(unicode);
3903    return _PyUnicode_WSTR(unicode);
3904}
3905
3906Py_UNICODE *
3907PyUnicode_AsUnicode(PyObject *unicode)
3908{
3909    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3910}
3911
3912
3913Py_ssize_t
3914PyUnicode_GetSize(PyObject *unicode)
3915{
3916    if (!PyUnicode_Check(unicode)) {
3917        PyErr_BadArgument();
3918        goto onError;
3919    }
3920    return PyUnicode_GET_SIZE(unicode);
3921
3922  onError:
3923    return -1;
3924}
3925
3926Py_ssize_t
3927PyUnicode_GetLength(PyObject *unicode)
3928{
3929    if (!PyUnicode_Check(unicode)) {
3930        PyErr_BadArgument();
3931        return -1;
3932    }
3933    if (PyUnicode_READY(unicode) == -1)
3934        return -1;
3935    return PyUnicode_GET_LENGTH(unicode);
3936}
3937
3938Py_UCS4
3939PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3940{
3941    void *data;
3942    int kind;
3943
3944    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3945        PyErr_BadArgument();
3946        return (Py_UCS4)-1;
3947    }
3948    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3949        PyErr_SetString(PyExc_IndexError, "string index out of range");
3950        return (Py_UCS4)-1;
3951    }
3952    data = PyUnicode_DATA(unicode);
3953    kind = PyUnicode_KIND(unicode);
3954    return PyUnicode_READ(kind, data, index);
3955}
3956
3957int
3958PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3959{
3960    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3961        PyErr_BadArgument();
3962        return -1;
3963    }
3964    assert(PyUnicode_IS_READY(unicode));
3965    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3966        PyErr_SetString(PyExc_IndexError, "string index out of range");
3967        return -1;
3968    }
3969    if (unicode_check_modifiable(unicode))
3970        return -1;
3971    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3972        PyErr_SetString(PyExc_ValueError, "character out of range");
3973        return -1;
3974    }
3975    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3976                    index, ch);
3977    return 0;
3978}
3979
/* Return the name of the default encoding, which is always "utf-8".
   The returned pointer refers to static storage. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3985
3986/* create or adjust a UnicodeDecodeError */
3987static void
3988make_decode_exception(PyObject **exceptionObject,
3989                      const char *encoding,
3990                      const char *input, Py_ssize_t length,
3991                      Py_ssize_t startpos, Py_ssize_t endpos,
3992                      const char *reason)
3993{
3994    if (*exceptionObject == NULL) {
3995        *exceptionObject = PyUnicodeDecodeError_Create(
3996            encoding, input, length, startpos, endpos, reason);
3997    }
3998    else {
3999        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4000            goto onError;
4001        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4002            goto onError;
4003        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4004            goto onError;
4005    }
4006    return;
4007
4008onError:
4009    Py_CLEAR(*exceptionObject);
4010}
4011
4012#ifdef HAVE_MBCS
4013/* error handling callback helper:
4014   build arguments, call the callback and check the arguments,
4015   if no exception occurred, copy the replacement to the output
4016   and adjust various state variables.
4017   return 0 on success, -1 on error
4018*/
4019
4020static int
4021unicode_decode_call_errorhandler_wchar(
4022    const char *errors, PyObject **errorHandler,
4023    const char *encoding, const char *reason,
4024    const char **input, const char **inend, Py_ssize_t *startinpos,
4025    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4026    PyObject **output, Py_ssize_t *outpos)
4027{
4028    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4029
4030    PyObject *restuple = NULL;
4031    PyObject *repunicode = NULL;
4032    Py_ssize_t outsize;
4033    Py_ssize_t insize;
4034    Py_ssize_t requiredsize;
4035    Py_ssize_t newpos;
4036    PyObject *inputobj = NULL;
4037    wchar_t *repwstr;
4038    Py_ssize_t repwlen;
4039
4040    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4041    outsize = _PyUnicode_WSTR_LENGTH(*output);
4042
4043    if (*errorHandler == NULL) {
4044        *errorHandler = PyCodec_LookupError(errors);
4045        if (*errorHandler == NULL)
4046            goto onError;
4047    }
4048
4049    make_decode_exception(exceptionObject,
4050        encoding,
4051        *input, *inend - *input,
4052        *startinpos, *endinpos,
4053        reason);
4054    if (*exceptionObject == NULL)
4055        goto onError;
4056
4057    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4058    if (restuple == NULL)
4059        goto onError;
4060    if (!PyTuple_Check(restuple)) {
4061        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4062        goto onError;
4063    }
4064    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4065        goto onError;
4066
4067    /* Copy back the bytes variables, which might have been modified by the
4068       callback */
4069    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4070    if (!inputobj)
4071        goto onError;
4072    if (!PyBytes_Check(inputobj)) {
4073        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4074    }
4075    *input = PyBytes_AS_STRING(inputobj);
4076    insize = PyBytes_GET_SIZE(inputobj);
4077    *inend = *input + insize;
4078    /* we can DECREF safely, as the exception has another reference,
4079       so the object won't go away. */
4080    Py_DECREF(inputobj);
4081
4082    if (newpos<0)
4083        newpos = insize+newpos;
4084    if (newpos<0 || newpos>insize) {
4085        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4086        goto onError;
4087    }
4088
4089    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4090    if (repwstr == NULL)
4091        goto onError;
4092    /* need more space? (at least enough for what we
4093       have+the replacement+the rest of the string (starting
4094       at the new input position), so we won't have to check space
4095       when there are no errors in the rest of the string) */
4096    requiredsize = *outpos;
4097    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4098        goto overflow;
4099    requiredsize += repwlen;
4100    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4101        goto overflow;
4102    requiredsize += insize - newpos;
4103    if (requiredsize > outsize) {
4104        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4105            requiredsize = 2*outsize;
4106        if (unicode_resize(output, requiredsize) < 0)
4107            goto onError;
4108    }
4109    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4110    *outpos += repwlen;
4111    *endinpos = newpos;
4112    *inptr = *input + newpos;
4113
4114    /* we made it! */
4115    Py_XDECREF(restuple);
4116    return 0;
4117
4118  overflow:
4119    PyErr_SetString(PyExc_OverflowError,
4120                    "decoded result is too long for a Python string");
4121
4122  onError:
4123    Py_XDECREF(restuple);
4124    return -1;
4125}
4126#endif   /* HAVE_MBCS */
4127
4128static int
4129unicode_decode_call_errorhandler_writer(
4130    const char *errors, PyObject **errorHandler,
4131    const char *encoding, const char *reason,
4132    const char **input, const char **inend, Py_ssize_t *startinpos,
4133    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4134    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4135{
4136    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4137
4138    PyObject *restuple = NULL;
4139    PyObject *repunicode = NULL;
4140    Py_ssize_t insize;
4141    Py_ssize_t newpos;
4142    Py_ssize_t replen;
4143    PyObject *inputobj = NULL;
4144
4145    if (*errorHandler == NULL) {
4146        *errorHandler = PyCodec_LookupError(errors);
4147        if (*errorHandler == NULL)
4148            goto onError;
4149    }
4150
4151    make_decode_exception(exceptionObject,
4152        encoding,
4153        *input, *inend - *input,
4154        *startinpos, *endinpos,
4155        reason);
4156    if (*exceptionObject == NULL)
4157        goto onError;
4158
4159    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4160    if (restuple == NULL)
4161        goto onError;
4162    if (!PyTuple_Check(restuple)) {
4163        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4164        goto onError;
4165    }
4166    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4167        goto onError;
4168
4169    /* Copy back the bytes variables, which might have been modified by the
4170       callback */
4171    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4172    if (!inputobj)
4173        goto onError;
4174    if (!PyBytes_Check(inputobj)) {
4175        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4176    }
4177    *input = PyBytes_AS_STRING(inputobj);
4178    insize = PyBytes_GET_SIZE(inputobj);
4179    *inend = *input + insize;
4180    /* we can DECREF safely, as the exception has another reference,
4181       so the object won't go away. */
4182    Py_DECREF(inputobj);
4183
4184    if (newpos<0)
4185        newpos = insize+newpos;
4186    if (newpos<0 || newpos>insize) {
4187        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4188        goto onError;
4189    }
4190
4191    if (PyUnicode_READY(repunicode) < 0)
4192        goto onError;
4193    replen = PyUnicode_GET_LENGTH(repunicode);
4194    writer->min_length += replen;
4195    if (replen > 1)
4196        writer->overallocate = 1;
4197    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4198        goto onError;
4199
4200    *endinpos = newpos;
4201    *inptr = *input + newpos;
4202
4203    /* we made it! */
4204    Py_XDECREF(restuple);
4205    return 0;
4206
4207  onError:
4208    Py_XDECREF(restuple);
4209    return -1;
4210}
4211
4212/* --- UTF-7 Codec -------------------------------------------------------- */
4213
4214/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4215
/* Base-64 helper macros for the UTF-7 codec.  Note that each macro may
   evaluate its argument more than once. */

/* True when c is one of the 64 base-64 alphabet characters
   (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* Map a base-64 character to its 6-bit value (0..63).  The caller must
   already know that IS_BASE64(c) holds: anything unrecognised maps to 63. */

#define FROM_BASE64(c)                                  \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :           \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :      \
     ((c) == '+') ? 62 : 63)

/* Base-64 character for the low 6 bits of n; higher bits are masked off. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: a byte met outside a shift sequence that decodes as
 * itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
4246
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0..127) and is only ever read,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify (NUL and non-ASCII never do); the range
 * test comes first so the table is never indexed out of bounds.  Set D
 * characters always qualify, whitespace only when directWS is true, and
 * Set O only when directO is true.  c is evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4292
4293PyObject *
4294PyUnicode_DecodeUTF7(const char *s,
4295                     Py_ssize_t size,
4296                     const char *errors)
4297{
4298    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4299}
4300
4301/* The decoder.  The only state we preserve is our read position,
4302 * i.e. how many characters we have consumed.  So if we end in the
4303 * middle of a shift sequence we have to back off the read position
4304 * and the output to the beginning of the sequence, otherwise we lose
4305 * all the shift state (seen bits, number of bits seen, high
4306 * surrogate). */
4307
4308PyObject *
4309PyUnicode_DecodeUTF7Stateful(const char *s,
4310                             Py_ssize_t size,
4311                             const char *errors,
4312                             Py_ssize_t *consumed)
4313{
4314    const char *starts = s;
4315    Py_ssize_t startinpos;
4316    Py_ssize_t endinpos;
4317    const char *e;
4318    _PyUnicodeWriter writer;
4319    const char *errmsg = "";
4320    int inShift = 0;
4321    Py_ssize_t shiftOutStart;
4322    unsigned int base64bits = 0;
4323    unsigned long base64buffer = 0;
4324    Py_UCS4 surrogate = 0;
4325    PyObject *errorHandler = NULL;
4326    PyObject *exc = NULL;
4327
4328    if (size == 0) {
4329        if (consumed)
4330            *consumed = 0;
4331        _Py_RETURN_UNICODE_EMPTY();
4332    }
4333
4334    /* Start off assuming it's all ASCII. Widen later as necessary. */
4335    _PyUnicodeWriter_Init(&writer);
4336    writer.min_length = size;
4337
4338    shiftOutStart = 0;
4339    e = s + size;
4340
4341    while (s < e) {
4342        Py_UCS4 ch;
4343      restart:
4344        ch = (unsigned char) *s;
4345
4346        if (inShift) { /* in a base-64 section */
4347            if (IS_BASE64(ch)) { /* consume a base-64 character */
4348                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
4349                base64bits += 6;
4350                s++;
4351                if (base64bits >= 16) {
4352                    /* we have enough bits for a UTF-16 value */
4353                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
4354                    base64bits -= 16;
4355                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
4356                    assert(outCh <= 0xffff);
4357                    if (surrogate) {
4358                        /* expecting a second surrogate */
4359                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
4360                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
4361                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
4362                                goto onError;
4363                            surrogate = 0;
4364                            continue;
4365                        }
4366                        else {
4367                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4368                                goto onError;
4369                            surrogate = 0;
4370                        }
4371                    }
4372                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
4373                        /* first surrogate */
4374                        surrogate = outCh;
4375                    }
4376                    else {
4377                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
4378                            goto onError;
4379                    }
4380                }
4381            }
4382            else { /* now leaving a base-64 section */
4383                inShift = 0;
4384                s++;
4385                if (surrogate) {
4386                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
4387                        goto onError;
4388                    surrogate = 0;
4389                }
4390                if (base64bits > 0) { /* left-over bits */
4391                    if (base64bits >= 6) {
4392                        /* We've seen at least one base-64 character */
4393                        errmsg = "partial character in shift sequence";
4394                        goto utf7Error;
4395                    }
4396                    else {
4397                        /* Some bits remain; they should be zero */
4398                        if (base64buffer != 0) {
4399                            errmsg = "non-zero padding bits in shift sequence";
4400                            goto utf7Error;
4401                        }
4402                    }
4403                }
4404                if (ch != '-') {
4405                    /* '-' is absorbed; other terminating
4406                       characters are preserved */
4407                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4408                        goto onError;
4409                }
4410            }
4411        }
4412        else if ( ch == '+' ) {
4413            startinpos = s-starts;
4414            s++; /* consume '+' */
4415            if (s < e && *s == '-') { /* '+-' encodes '+' */
4416                s++;
4417                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
4418                    goto onError;
4419            }
4420            else { /* begin base64-encoded section */
4421                inShift = 1;
4422                shiftOutStart = writer.pos;
4423                base64bits = 0;
4424                base64buffer = 0;
4425            }
4426        }
4427        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
4428            s++;
4429            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4430                goto onError;
4431        }
4432        else {
4433            startinpos = s-starts;
4434            s++;
4435            errmsg = "unexpected special character";
4436            goto utf7Error;
4437        }
4438        continue;
4439utf7Error:
4440        endinpos = s-starts;
4441        if (unicode_decode_call_errorhandler_writer(
4442                errors, &errorHandler,
4443                "utf7", errmsg,
4444                &starts, &e, &startinpos, &endinpos, &exc, &s,
4445                &writer))
4446            goto onError;
4447    }
4448
4449    /* end of string */
4450
4451    if (inShift && !consumed) { /* in shift sequence, no more to follow */
4452        /* if we're in an inconsistent state, that's an error */
4453        if (surrogate ||
4454                (base64bits >= 6) ||
4455                (base64bits > 0 && base64buffer != 0)) {
4456            endinpos = size;
4457            if (unicode_decode_call_errorhandler_writer(
4458                    errors, &errorHandler,
4459                    "utf7", "unterminated shift sequence",
4460                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4461                    &writer))
4462                goto onError;
4463            if (s < e)
4464                goto restart;
4465        }
4466    }
4467
4468    /* return state */
4469    if (consumed) {
4470        if (inShift) {
4471            *consumed = startinpos;
4472            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
4473                PyObject *result = PyUnicode_FromKindAndData(
4474                        writer.kind, writer.data, shiftOutStart);
4475                Py_XDECREF(errorHandler);
4476                Py_XDECREF(exc);
4477                _PyUnicodeWriter_Dealloc(&writer);
4478                return result;
4479            }
4480            writer.pos = shiftOutStart; /* back off output */
4481        }
4482        else {
4483            *consumed = s-starts;
4484        }
4485    }
4486
4487    Py_XDECREF(errorHandler);
4488    Py_XDECREF(exc);
4489    return _PyUnicodeWriter_Finish(&writer);
4490
4491  onError:
4492    Py_XDECREF(errorHandler);
4493    Py_XDECREF(exc);
4494    _PyUnicodeWriter_Dealloc(&writer);
4495    return NULL;
4496}
4497
4498
4499PyObject *
4500_PyUnicode_EncodeUTF7(PyObject *str,
4501                      int base64SetO,
4502                      int base64WhiteSpace,
4503                      const char *errors)
4504{
4505    int kind;
4506    void *data;
4507    Py_ssize_t len;
4508    PyObject *v;
4509    int inShift = 0;
4510    Py_ssize_t i;
4511    unsigned int base64bits = 0;
4512    unsigned long base64buffer = 0;
4513    char * out;
4514    char * start;
4515
4516    if (PyUnicode_READY(str) == -1)
4517        return NULL;
4518    kind = PyUnicode_KIND(str);
4519    data = PyUnicode_DATA(str);
4520    len = PyUnicode_GET_LENGTH(str);
4521
4522    if (len == 0)
4523        return PyBytes_FromStringAndSize(NULL, 0);
4524
4525    /* It might be possible to tighten this worst case */
4526    if (len > PY_SSIZE_T_MAX / 8)
4527        return PyErr_NoMemory();
4528    v = PyBytes_FromStringAndSize(NULL, len * 8);
4529    if (v == NULL)
4530        return NULL;
4531
4532    start = out = PyBytes_AS_STRING(v);
4533    for (i = 0; i < len; ++i) {
4534        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4535
4536        if (inShift) {
4537            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4538                /* shifting out */
4539                if (base64bits) { /* output remaining bits */
4540                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4541                    base64buffer = 0;
4542                    base64bits = 0;
4543                }
4544                inShift = 0;
4545                /* Characters not in the BASE64 set implicitly unshift the sequence
4546                   so no '-' is required, except if the character is itself a '-' */
4547                if (IS_BASE64(ch) || ch == '-') {
4548                    *out++ = '-';
4549                }
4550                *out++ = (char) ch;
4551            }
4552            else {
4553                goto encode_char;
4554            }
4555        }
4556        else { /* not in a shift sequence */
4557            if (ch == '+') {
4558                *out++ = '+';
4559                        *out++ = '-';
4560            }
4561            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4562                *out++ = (char) ch;
4563            }
4564            else {
4565                *out++ = '+';
4566                inShift = 1;
4567                goto encode_char;
4568            }
4569        }
4570        continue;
4571encode_char:
4572        if (ch >= 0x10000) {
4573            assert(ch <= MAX_UNICODE);
4574
4575            /* code first surrogate */
4576            base64bits += 16;
4577            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4578            while (base64bits >= 6) {
4579                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4580                base64bits -= 6;
4581            }
4582            /* prepare second surrogate */
4583            ch = Py_UNICODE_LOW_SURROGATE(ch);
4584        }
4585        base64bits += 16;
4586        base64buffer = (base64buffer << 16) | ch;
4587        while (base64bits >= 6) {
4588            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4589            base64bits -= 6;
4590        }
4591    }
4592    if (base64bits)
4593        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4594    if (inShift)
4595        *out++ = '-';
4596    if (_PyBytes_Resize(&v, out - start) < 0)
4597        return NULL;
4598    return v;
4599}
4600PyObject *
4601PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4602                     Py_ssize_t size,
4603                     int base64SetO,
4604                     int base64WhiteSpace,
4605                     const char *errors)
4606{
4607    PyObject *result;
4608    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4609    if (tmp == NULL)
4610        return NULL;
4611    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4612                                   base64WhiteSpace, errors);
4613    Py_DECREF(tmp);
4614    return result;
4615}
4616
4617#undef IS_BASE64
4618#undef FROM_BASE64
4619#undef TO_BASE64
4620#undef DECODE_DIRECT
4621#undef ENCODE_DIRECT
4622
4623/* --- UTF-8 Codec -------------------------------------------------------- */
4624
4625PyObject *
4626PyUnicode_DecodeUTF8(const char *s,
4627                     Py_ssize_t size,
4628                     const char *errors)
4629{
4630    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4631}
4632
4633#include "stringlib/asciilib.h"
4634#include "stringlib/codecs.h"
4635#include "stringlib/undef.h"
4636
4637#include "stringlib/ucs1lib.h"
4638#include "stringlib/codecs.h"
4639#include "stringlib/undef.h"
4640
4641#include "stringlib/ucs2lib.h"
4642#include "stringlib/codecs.h"
4643#include "stringlib/undef.h"
4644
4645#include "stringlib/ucs4lib.h"
4646#include "stringlib/codecs.h"
4647#include "stringlib/undef.h"
4648
4649/* Mask to quickly check whether a C 'long' contains a
4650   non-ASCII, UTF8-encoded char. */
4651#if (SIZEOF_LONG == 8)
4652# define ASCII_CHAR_MASK 0x8080808080808080UL
4653#elif (SIZEOF_LONG == 4)
4654# define ASCII_CHAR_MASK 0x80808080UL
4655#else
4656# error C 'long' size should be either 4 or 8!
4657#endif
4658
4659static Py_ssize_t
4660ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4661{
4662    const char *p = start;
4663    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4664
4665    /*
4666     * Issue #17237: m68k is a bit different from most architectures in
4667     * that objects do not use "natural alignment" - for example, int and
4668     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4669     * won't work; also, tests have shown that skipping the "optimised
4670     * version" will even speed up m68k.
4671     */
4672#if !defined(__m68k__)
4673#if SIZEOF_LONG <= SIZEOF_VOID_P
4674    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4675    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4676        /* Fast path, see in STRINGLIB(utf8_decode) for
4677           an explanation. */
4678        /* Help allocation */
4679        const char *_p = p;
4680        Py_UCS1 * q = dest;
4681        while (_p < aligned_end) {
4682            unsigned long value = *(const unsigned long *) _p;
4683            if (value & ASCII_CHAR_MASK)
4684                break;
4685            *((unsigned long *)q) = value;
4686            _p += SIZEOF_LONG;
4687            q += SIZEOF_LONG;
4688        }
4689        p = _p;
4690        while (p < end) {
4691            if ((unsigned char)*p & 0x80)
4692                break;
4693            *q++ = *p++;
4694        }
4695        return p - start;
4696    }
4697#endif
4698#endif
4699    while (p < end) {
4700        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4701           for an explanation. */
4702        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4703            /* Help allocation */
4704            const char *_p = p;
4705            while (_p < aligned_end) {
4706                unsigned long value = *(unsigned long *) _p;
4707                if (value & ASCII_CHAR_MASK)
4708                    break;
4709                _p += SIZEOF_LONG;
4710            }
4711            p = _p;
4712            if (_p == end)
4713                break;
4714        }
4715        if ((unsigned char)*p & 0x80)
4716            break;
4717        ++p;
4718    }
4719    memcpy(dest, start, p - start);
4720    return p - start;
4721}
4722
4723PyObject *
4724PyUnicode_DecodeUTF8Stateful(const char *s,
4725                             Py_ssize_t size,
4726                             const char *errors,
4727                             Py_ssize_t *consumed)
4728{
4729    _PyUnicodeWriter writer;
4730    const char *starts = s;
4731    const char *end = s + size;
4732
4733    Py_ssize_t startinpos;
4734    Py_ssize_t endinpos;
4735    const char *errmsg = "";
4736    PyObject *errorHandler = NULL;
4737    PyObject *exc = NULL;
4738
4739    if (size == 0) {
4740        if (consumed)
4741            *consumed = 0;
4742        _Py_RETURN_UNICODE_EMPTY();
4743    }
4744
4745    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4746    if (size == 1 && (unsigned char)s[0] < 128) {
4747        if (consumed)
4748            *consumed = 1;
4749        return get_latin1_char((unsigned char)s[0]);
4750    }
4751
4752    _PyUnicodeWriter_Init(&writer);
4753    writer.min_length = size;
4754    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4755        goto onError;
4756
4757    writer.pos = ascii_decode(s, end, writer.data);
4758    s += writer.pos;
4759    while (s < end) {
4760        Py_UCS4 ch;
4761        int kind = writer.kind;
4762        if (kind == PyUnicode_1BYTE_KIND) {
4763            if (PyUnicode_IS_ASCII(writer.buffer))
4764                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4765            else
4766                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4767        } else if (kind == PyUnicode_2BYTE_KIND) {
4768            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4769        } else {
4770            assert(kind == PyUnicode_4BYTE_KIND);
4771            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4772        }
4773
4774        switch (ch) {
4775        case 0:
4776            if (s == end || consumed)
4777                goto End;
4778            errmsg = "unexpected end of data";
4779            startinpos = s - starts;
4780            endinpos = end - starts;
4781            break;
4782        case 1:
4783            errmsg = "invalid start byte";
4784            startinpos = s - starts;
4785            endinpos = startinpos + 1;
4786            break;
4787        case 2:
4788        case 3:
4789        case 4:
4790            errmsg = "invalid continuation byte";
4791            startinpos = s - starts;
4792            endinpos = startinpos + ch - 1;
4793            break;
4794        default:
4795            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4796                goto onError;
4797            continue;
4798        }
4799
4800        if (unicode_decode_call_errorhandler_writer(
4801                errors, &errorHandler,
4802                "utf-8", errmsg,
4803                &starts, &end, &startinpos, &endinpos, &exc, &s,
4804                &writer))
4805            goto onError;
4806    }
4807
4808End:
4809    if (consumed)
4810        *consumed = s - starts;
4811
4812    Py_XDECREF(errorHandler);
4813    Py_XDECREF(exc);
4814    return _PyUnicodeWriter_Finish(&writer);
4815
4816onError:
4817    Py_XDECREF(errorHandler);
4818    Py_XDECREF(exc);
4819    _PyUnicodeWriter_Dealloc(&writer);
4820    return NULL;
4821}
4822
4823#ifdef __APPLE__
4824
4825/* Simplified UTF-8 decoder using surrogateescape error handler,
4826   used to decode the command line arguments on Mac OS X.
4827
4828   Return a pointer to a newly allocated wide character string (use
4829   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4830
4831wchar_t*
4832_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4833{
4834    const char *e;
4835    wchar_t *unicode;
4836    Py_ssize_t outpos;
4837
4838    /* Note: size will always be longer than the resulting Unicode
4839       character count */
4840    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4841        return NULL;
4842    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4843    if (!unicode)
4844        return NULL;
4845
4846    /* Unpack UTF-8 encoded data */
4847    e = s + size;
4848    outpos = 0;
4849    while (s < e) {
4850        Py_UCS4 ch;
4851#if SIZEOF_WCHAR_T == 4
4852        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4853#else
4854        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4855#endif
4856        if (ch > 0xFF) {
4857#if SIZEOF_WCHAR_T == 4
4858            assert(0);
4859#else
4860            assert(Py_UNICODE_IS_SURROGATE(ch));
4861            /*  compute and append the two surrogates: */
4862            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4863            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4864#endif
4865        }
4866        else {
4867            if (!ch && s == e)
4868                break;
4869            /* surrogateescape */
4870            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4871        }
4872    }
4873    unicode[outpos] = L'\0';
4874    return unicode;
4875}
4876
4877#endif /* __APPLE__ */
4878
4879/* Primary internal function which creates utf8 encoded bytes objects.
4880
4881   Allocation strategy:  if the string is short, convert into a stack buffer
4882   and allocate exactly as much space needed at the end.  Else allocate the
4883   maximum possible needed (4 result bytes per Unicode character), and return
4884   the excess memory at the end.
4885*/
4886PyObject *
4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4888{
4889    enum PyUnicode_Kind kind;
4890    void *data;
4891    Py_ssize_t size;
4892
4893    if (!PyUnicode_Check(unicode)) {
4894        PyErr_BadArgument();
4895        return NULL;
4896    }
4897
4898    if (PyUnicode_READY(unicode) == -1)
4899        return NULL;
4900
4901    if (PyUnicode_UTF8(unicode))
4902        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4903                                         PyUnicode_UTF8_LENGTH(unicode));
4904
4905    kind = PyUnicode_KIND(unicode);
4906    data = PyUnicode_DATA(unicode);
4907    size = PyUnicode_GET_LENGTH(unicode);
4908
4909    switch (kind) {
4910    default:
4911        assert(0);
4912    case PyUnicode_1BYTE_KIND:
4913        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4914        assert(!PyUnicode_IS_ASCII(unicode));
4915        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4916    case PyUnicode_2BYTE_KIND:
4917        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4918    case PyUnicode_4BYTE_KIND:
4919        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4920    }
4921}
4922
4923PyObject *
4924PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4925                     Py_ssize_t size,
4926                     const char *errors)
4927{
4928    PyObject *v, *unicode;
4929
4930    unicode = PyUnicode_FromUnicode(s, size);
4931    if (unicode == NULL)
4932        return NULL;
4933    v = _PyUnicode_AsUTF8String(unicode, errors);
4934    Py_DECREF(unicode);
4935    return v;
4936}
4937
4938PyObject *
4939PyUnicode_AsUTF8String(PyObject *unicode)
4940{
4941    return _PyUnicode_AsUTF8String(unicode, NULL);
4942}
4943
4944/* --- UTF-32 Codec ------------------------------------------------------- */
4945
4946PyObject *
4947PyUnicode_DecodeUTF32(const char *s,
4948                      Py_ssize_t size,
4949                      const char *errors,
4950                      int *byteorder)
4951{
4952    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4953}
4954
4955PyObject *
4956PyUnicode_DecodeUTF32Stateful(const char *s,
4957                              Py_ssize_t size,
4958                              const char *errors,
4959                              int *byteorder,
4960                              Py_ssize_t *consumed)
4961{
4962    const char *starts = s;
4963    Py_ssize_t startinpos;
4964    Py_ssize_t endinpos;
4965    _PyUnicodeWriter writer;
4966    const unsigned char *q, *e;
4967    int le, bo = 0;       /* assume native ordering by default */
4968    const char *encoding;
4969    const char *errmsg = "";
4970    PyObject *errorHandler = NULL;
4971    PyObject *exc = NULL;
4972
4973    q = (unsigned char *)s;
4974    e = q + size;
4975
4976    if (byteorder)
4977        bo = *byteorder;
4978
4979    /* Check for BOM marks (U+FEFF) in the input and adjust current
4980       byte order setting accordingly. In native mode, the leading BOM
4981       mark is skipped, in all other modes, it is copied to the output
4982       stream as-is (giving a ZWNBSP character). */
4983    if (bo == 0 && size >= 4) {
4984        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4985        if (bom == 0x0000FEFF) {
4986            bo = -1;
4987            q += 4;
4988        }
4989        else if (bom == 0xFFFE0000) {
4990            bo = 1;
4991            q += 4;
4992        }
4993        if (byteorder)
4994            *byteorder = bo;
4995    }
4996
4997    if (q == e) {
4998        if (consumed)
4999            *consumed = size;
5000        _Py_RETURN_UNICODE_EMPTY();
5001    }
5002
5003#ifdef WORDS_BIGENDIAN
5004    le = bo < 0;
5005#else
5006    le = bo <= 0;
5007#endif
5008    encoding = le ? "utf-32-le" : "utf-32-be";
5009
5010    _PyUnicodeWriter_Init(&writer);
5011    writer.min_length = (e - q + 3) / 4;
5012    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5013        goto onError;
5014
5015    while (1) {
5016        Py_UCS4 ch = 0;
5017        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5018
5019        if (e - q >= 4) {
5020            enum PyUnicode_Kind kind = writer.kind;
5021            void *data = writer.data;
5022            const unsigned char *last = e - 4;
5023            Py_ssize_t pos = writer.pos;
5024            if (le) {
5025                do {
5026                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5027                    if (ch > maxch)
5028                        break;
5029                    if (kind != PyUnicode_1BYTE_KIND &&
5030                        Py_UNICODE_IS_SURROGATE(ch))
5031                        break;
5032                    PyUnicode_WRITE(kind, data, pos++, ch);
5033                    q += 4;
5034                } while (q <= last);
5035            }
5036            else {
5037                do {
5038                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5039                    if (ch > maxch)
5040                        break;
5041                    if (kind != PyUnicode_1BYTE_KIND &&
5042                        Py_UNICODE_IS_SURROGATE(ch))
5043                        break;
5044                    PyUnicode_WRITE(kind, data, pos++, ch);
5045                    q += 4;
5046                } while (q <= last);
5047            }
5048            writer.pos = pos;
5049        }
5050
5051        if (Py_UNICODE_IS_SURROGATE(ch)) {
5052            errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5053            startinpos = ((const char *)q) - starts;
5054            endinpos = startinpos + 4;
5055        }
5056        else if (ch <= maxch) {
5057            if (q == e || consumed)
5058                break;
5059            /* remaining bytes at the end? (size should be divisible by 4) */
5060            errmsg = "truncated data";
5061            startinpos = ((const char *)q) - starts;
5062            endinpos = ((const char *)e) - starts;
5063        }
5064        else {
5065            if (ch < 0x110000) {
5066                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5067                    goto onError;
5068                q += 4;
5069                continue;
5070            }
5071            errmsg = "codepoint not in range(0x110000)";
5072            startinpos = ((const char *)q) - starts;
5073            endinpos = startinpos + 4;
5074        }
5075
5076        /* The remaining input chars are ignored if the callback
5077           chooses to skip the input */
5078        if (unicode_decode_call_errorhandler_writer(
5079                errors, &errorHandler,
5080                encoding, errmsg,
5081                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5082                &writer))
5083            goto onError;
5084    }
5085
5086    if (consumed)
5087        *consumed = (const char *)q-starts;
5088
5089    Py_XDECREF(errorHandler);
5090    Py_XDECREF(exc);
5091    return _PyUnicodeWriter_Finish(&writer);
5092
5093  onError:
5094    _PyUnicodeWriter_Dealloc(&writer);
5095    Py_XDECREF(errorHandler);
5096    Py_XDECREF(exc);
5097    return NULL;
5098}
5099
5100PyObject *
5101_PyUnicode_EncodeUTF32(PyObject *str,
5102                       const char *errors,
5103                       int byteorder)
5104{
5105    int kind;
5106    void *data;
5107    Py_ssize_t len;
5108    PyObject *v;
5109    unsigned char *p;
5110    Py_ssize_t nsize, i;
5111    /* Offsets from p for storing byte pairs in the right order. */
5112#if PY_LITTLE_ENDIAN
5113    int iorder[] = {0, 1, 2, 3};
5114#else
5115    int iorder[] = {3, 2, 1, 0};
5116#endif
5117    const char *encoding;
5118    PyObject *errorHandler = NULL;
5119    PyObject *exc = NULL;
5120    PyObject *rep = NULL;
5121
5122#define STORECHAR(CH)                           \
5123    do {                                        \
5124        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5125        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5126        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5127        p[iorder[0]] = (CH) & 0xff;             \
5128        p += 4;                                 \
5129    } while(0)
5130
5131    if (!PyUnicode_Check(str)) {
5132        PyErr_BadArgument();
5133        return NULL;
5134    }
5135    if (PyUnicode_READY(str) == -1)
5136        return NULL;
5137    kind = PyUnicode_KIND(str);
5138    data = PyUnicode_DATA(str);
5139    len = PyUnicode_GET_LENGTH(str);
5140
5141    nsize = len + (byteorder == 0);
5142    if (nsize > PY_SSIZE_T_MAX / 4)
5143        return PyErr_NoMemory();
5144    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5145    if (v == NULL)
5146        return NULL;
5147
5148    p = (unsigned char *)PyBytes_AS_STRING(v);
5149    if (byteorder == 0)
5150        STORECHAR(0xFEFF);
5151    if (len == 0)
5152        return v;
5153
5154    if (byteorder == -1) {
5155        /* force LE */
5156        iorder[0] = 0;
5157        iorder[1] = 1;
5158        iorder[2] = 2;
5159        iorder[3] = 3;
5160        encoding = "utf-32-le";
5161    }
5162    else if (byteorder == 1) {
5163        /* force BE */
5164        iorder[0] = 3;
5165        iorder[1] = 2;
5166        iorder[2] = 1;
5167        iorder[3] = 0;
5168        encoding = "utf-32-be";
5169    }
5170    else
5171        encoding = "utf-32";
5172
5173    if (kind == PyUnicode_1BYTE_KIND) {
5174        for (i = 0; i < len; i++)
5175            STORECHAR(PyUnicode_READ(kind, data, i));
5176        return v;
5177    }
5178
5179    for (i = 0; i < len;) {
5180        Py_ssize_t repsize, moreunits;
5181        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5182        i++;
5183        assert(ch <= MAX_UNICODE);
5184        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5185            STORECHAR(ch);
5186            continue;
5187        }
5188
5189        rep = unicode_encode_call_errorhandler(
5190                errors, &errorHandler,
5191                encoding, "surrogates not allowed",
5192                str, &exc, i-1, i, &i);
5193
5194        if (!rep)
5195            goto error;
5196
5197        if (PyBytes_Check(rep)) {
5198            repsize = PyBytes_GET_SIZE(rep);
5199            if (repsize & 3) {
5200                raise_encode_exception(&exc, encoding,
5201                                       str, i - 1, i,
5202                                       "surrogates not allowed");
5203                goto error;
5204            }
5205            moreunits = repsize / 4;
5206        }
5207        else {
5208            assert(PyUnicode_Check(rep));
5209            if (PyUnicode_READY(rep) < 0)
5210                goto error;
5211            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5212            if (!PyUnicode_IS_ASCII(rep)) {
5213                raise_encode_exception(&exc, encoding,
5214                                       str, i - 1, i,
5215                                       "surrogates not allowed");
5216                goto error;
5217            }
5218        }
5219
5220        /* four bytes are reserved for each surrogate */
5221        if (moreunits > 1) {
5222            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5223            Py_ssize_t morebytes = 4 * (moreunits - 1);
5224            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5225                /* integer overflow */
5226                PyErr_NoMemory();
5227                goto error;
5228            }
5229            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5230                goto error;
5231            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5232        }
5233
5234        if (PyBytes_Check(rep)) {
5235            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5236            p += repsize;
5237        } else /* rep is unicode */ {
5238            const Py_UCS1 *repdata;
5239            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5240            repdata = PyUnicode_1BYTE_DATA(rep);
5241            while (repsize--) {
5242                Py_UCS4 ch = *repdata++;
5243                STORECHAR(ch);
5244            }
5245        }
5246
5247        Py_CLEAR(rep);
5248    }
5249
5250    /* Cut back to size actually needed. This is necessary for, for example,
5251       encoding of a string containing isolated surrogates and the 'ignore'
5252       handler is used. */
5253    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5254    if (nsize != PyBytes_GET_SIZE(v))
5255      _PyBytes_Resize(&v, nsize);
5256    Py_XDECREF(errorHandler);
5257    Py_XDECREF(exc);
5258    return v;
5259  error:
5260    Py_XDECREF(rep);
5261    Py_XDECREF(errorHandler);
5262    Py_XDECREF(exc);
5263    Py_XDECREF(v);
5264    return NULL;
5265#undef STORECHAR
5266}
5267
5268PyObject *
5269PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5270                      Py_ssize_t size,
5271                      const char *errors,
5272                      int byteorder)
5273{
5274    PyObject *result;
5275    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5276    if (tmp == NULL)
5277        return NULL;
5278    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5279    Py_DECREF(tmp);
5280    return result;
5281}
5282
5283PyObject *
5284PyUnicode_AsUTF32String(PyObject *unicode)
5285{
5286    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5287}
5288
5289/* --- UTF-16 Codec ------------------------------------------------------- */
5290
5291PyObject *
5292PyUnicode_DecodeUTF16(const char *s,
5293                      Py_ssize_t size,
5294                      const char *errors,
5295                      int *byteorder)
5296{
5297    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5298}
5299
5300PyObject *
5301PyUnicode_DecodeUTF16Stateful(const char *s,
5302                              Py_ssize_t size,
5303                              const char *errors,
5304                              int *byteorder,
5305                              Py_ssize_t *consumed)
5306{
5307    const char *starts = s;
5308    Py_ssize_t startinpos;
5309    Py_ssize_t endinpos;
5310    _PyUnicodeWriter writer;
5311    const unsigned char *q, *e;
5312    int bo = 0;       /* assume native ordering by default */
5313    int native_ordering;
5314    const char *errmsg = "";
5315    PyObject *errorHandler = NULL;
5316    PyObject *exc = NULL;
5317    const char *encoding;
5318
5319    q = (unsigned char *)s;
5320    e = q + size;
5321
5322    if (byteorder)
5323        bo = *byteorder;
5324
5325    /* Check for BOM marks (U+FEFF) in the input and adjust current
5326       byte order setting accordingly. In native mode, the leading BOM
5327       mark is skipped, in all other modes, it is copied to the output
5328       stream as-is (giving a ZWNBSP character). */
5329    if (bo == 0 && size >= 2) {
5330        const Py_UCS4 bom = (q[1] << 8) | q[0];
5331        if (bom == 0xFEFF) {
5332            q += 2;
5333            bo = -1;
5334        }
5335        else if (bom == 0xFFFE) {
5336            q += 2;
5337            bo = 1;
5338        }
5339        if (byteorder)
5340            *byteorder = bo;
5341    }
5342
5343    if (q == e) {
5344        if (consumed)
5345            *consumed = size;
5346        _Py_RETURN_UNICODE_EMPTY();
5347    }
5348
5349#if PY_LITTLE_ENDIAN
5350    native_ordering = bo <= 0;
5351    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5352#else
5353    native_ordering = bo >= 0;
5354    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5355#endif
5356
5357    /* Note: size will always be longer than the resulting Unicode
5358       character count */
5359    _PyUnicodeWriter_Init(&writer);
5360    writer.min_length = (e - q + 1) / 2;
5361    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5362        goto onError;
5363
5364    while (1) {
5365        Py_UCS4 ch = 0;
5366        if (e - q >= 2) {
5367            int kind = writer.kind;
5368            if (kind == PyUnicode_1BYTE_KIND) {
5369                if (PyUnicode_IS_ASCII(writer.buffer))
5370                    ch = asciilib_utf16_decode(&q, e,
5371                            (Py_UCS1*)writer.data, &writer.pos,
5372                            native_ordering);
5373                else
5374                    ch = ucs1lib_utf16_decode(&q, e,
5375                            (Py_UCS1*)writer.data, &writer.pos,
5376                            native_ordering);
5377            } else if (kind == PyUnicode_2BYTE_KIND) {
5378                ch = ucs2lib_utf16_decode(&q, e,
5379                        (Py_UCS2*)writer.data, &writer.pos,
5380                        native_ordering);
5381            } else {
5382                assert(kind == PyUnicode_4BYTE_KIND);
5383                ch = ucs4lib_utf16_decode(&q, e,
5384                        (Py_UCS4*)writer.data, &writer.pos,
5385                        native_ordering);
5386            }
5387        }
5388
5389        switch (ch)
5390        {
5391        case 0:
5392            /* remaining byte at the end? (size should be even) */
5393            if (q == e || consumed)
5394                goto End;
5395            errmsg = "truncated data";
5396            startinpos = ((const char *)q) - starts;
5397            endinpos = ((const char *)e) - starts;
5398            break;
5399            /* The remaining input chars are ignored if the callback
5400               chooses to skip the input */
5401        case 1:
5402            q -= 2;
5403            if (consumed)
5404                goto End;
5405            errmsg = "unexpected end of data";
5406            startinpos = ((const char *)q) - starts;
5407            endinpos = ((const char *)e) - starts;
5408            break;
5409        case 2:
5410            errmsg = "illegal encoding";
5411            startinpos = ((const char *)q) - 2 - starts;
5412            endinpos = startinpos + 2;
5413            break;
5414        case 3:
5415            errmsg = "illegal UTF-16 surrogate";
5416            startinpos = ((const char *)q) - 4 - starts;
5417            endinpos = startinpos + 2;
5418            break;
5419        default:
5420            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5421                goto onError;
5422            continue;
5423        }
5424
5425        if (unicode_decode_call_errorhandler_writer(
5426                errors,
5427                &errorHandler,
5428                encoding, errmsg,
5429                &starts,
5430                (const char **)&e,
5431                &startinpos,
5432                &endinpos,
5433                &exc,
5434                (const char **)&q,
5435                &writer))
5436            goto onError;
5437    }
5438
5439End:
5440    if (consumed)
5441        *consumed = (const char *)q-starts;
5442
5443    Py_XDECREF(errorHandler);
5444    Py_XDECREF(exc);
5445    return _PyUnicodeWriter_Finish(&writer);
5446
5447  onError:
5448    _PyUnicodeWriter_Dealloc(&writer);
5449    Py_XDECREF(errorHandler);
5450    Py_XDECREF(exc);
5451    return NULL;
5452}
5453
5454PyObject *
5455_PyUnicode_EncodeUTF16(PyObject *str,
5456                       const char *errors,
5457                       int byteorder)
5458{
5459    enum PyUnicode_Kind kind;
5460    const void *data;
5461    Py_ssize_t len;
5462    PyObject *v;
5463    unsigned short *out;
5464    Py_ssize_t pairs;
5465#if PY_BIG_ENDIAN
5466    int native_ordering = byteorder >= 0;
5467#else
5468    int native_ordering = byteorder <= 0;
5469#endif
5470    const char *encoding;
5471    Py_ssize_t nsize, pos;
5472    PyObject *errorHandler = NULL;
5473    PyObject *exc = NULL;
5474    PyObject *rep = NULL;
5475
5476    if (!PyUnicode_Check(str)) {
5477        PyErr_BadArgument();
5478        return NULL;
5479    }
5480    if (PyUnicode_READY(str) == -1)
5481        return NULL;
5482    kind = PyUnicode_KIND(str);
5483    data = PyUnicode_DATA(str);
5484    len = PyUnicode_GET_LENGTH(str);
5485
5486    pairs = 0;
5487    if (kind == PyUnicode_4BYTE_KIND) {
5488        const Py_UCS4 *in = (const Py_UCS4 *)data;
5489        const Py_UCS4 *end = in + len;
5490        while (in < end)
5491            if (*in++ >= 0x10000)
5492                pairs++;
5493    }
5494    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5495        return PyErr_NoMemory();
5496    nsize = len + pairs + (byteorder == 0);
5497    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5498    if (v == NULL)
5499        return NULL;
5500
5501    /* output buffer is 2-bytes aligned */
5502    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5503    out = (unsigned short *)PyBytes_AS_STRING(v);
5504    if (byteorder == 0)
5505        *out++ = 0xFEFF;
5506    if (len == 0)
5507        goto done;
5508
5509    if (kind == PyUnicode_1BYTE_KIND) {
5510        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5511        goto done;
5512    }
5513
5514    if (byteorder < 0)
5515        encoding = "utf-16-le";
5516    else if (byteorder > 0)
5517        encoding = "utf-16-be";
5518    else
5519        encoding = "utf-16";
5520
5521    pos = 0;
5522    while (pos < len) {
5523        Py_ssize_t repsize, moreunits;
5524
5525        if (kind == PyUnicode_2BYTE_KIND) {
5526            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5527                                        &out, native_ordering);
5528        }
5529        else {
5530            assert(kind == PyUnicode_4BYTE_KIND);
5531            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5532                                        &out, native_ordering);
5533        }
5534        if (pos == len)
5535            break;
5536
5537        rep = unicode_encode_call_errorhandler(
5538                errors, &errorHandler,
5539                encoding, "surrogates not allowed",
5540                str, &exc, pos, pos + 1, &pos);
5541        if (!rep)
5542            goto error;
5543
5544        if (PyBytes_Check(rep)) {
5545            repsize = PyBytes_GET_SIZE(rep);
5546            if (repsize & 1) {
5547                raise_encode_exception(&exc, encoding,
5548                                       str, pos - 1, pos,
5549                                       "surrogates not allowed");
5550                goto error;
5551            }
5552            moreunits = repsize / 2;
5553        }
5554        else {
5555            assert(PyUnicode_Check(rep));
5556            if (PyUnicode_READY(rep) < 0)
5557                goto error;
5558            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5559            if (!PyUnicode_IS_ASCII(rep)) {
5560                raise_encode_exception(&exc, encoding,
5561                                       str, pos - 1, pos,
5562                                       "surrogates not allowed");
5563                goto error;
5564            }
5565        }
5566
5567        /* two bytes are reserved for each surrogate */
5568        if (moreunits > 1) {
5569            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5570            Py_ssize_t morebytes = 2 * (moreunits - 1);
5571            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5572                /* integer overflow */
5573                PyErr_NoMemory();
5574                goto error;
5575            }
5576            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5577                goto error;
5578            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5579        }
5580
5581        if (PyBytes_Check(rep)) {
5582            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5583            out += moreunits;
5584        } else /* rep is unicode */ {
5585            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5586            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5587                                 &out, native_ordering);
5588        }
5589
5590        Py_CLEAR(rep);
5591    }
5592
5593    /* Cut back to size actually needed. This is necessary for, for example,
5594    encoding of a string containing isolated surrogates and the 'ignore' handler
5595    is used. */
5596    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5597    if (nsize != PyBytes_GET_SIZE(v))
5598      _PyBytes_Resize(&v, nsize);
5599    Py_XDECREF(errorHandler);
5600    Py_XDECREF(exc);
5601  done:
5602    return v;
5603  error:
5604    Py_XDECREF(rep);
5605    Py_XDECREF(errorHandler);
5606    Py_XDECREF(exc);
5607    Py_XDECREF(v);
5608    return NULL;
5609#undef STORECHAR
5610}
5611
5612PyObject *
5613PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5614                      Py_ssize_t size,
5615                      const char *errors,
5616                      int byteorder)
5617{
5618    PyObject *result;
5619    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5620    if (tmp == NULL)
5621        return NULL;
5622    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5623    Py_DECREF(tmp);
5624    return result;
5625}
5626
5627PyObject *
5628PyUnicode_AsUTF16String(PyObject *unicode)
5629{
5630    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5631}
5632
5633/* --- Unicode Escape Codec ----------------------------------------------- */
5634
5635/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
5636   if all the escapes in the string make it still a valid ASCII string.
5637   Returns -1 if any escapes were found which cause the string to
5638   pop out of ASCII range.  Otherwise returns the length of the
5639   required buffer to hold the string.
5640   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *cur = (const unsigned char *)s;
    const unsigned char *stop = cur + size;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (cur < stop) {
        unsigned char c = *cur++;

        /* Non-ASCII input disqualifies the string. */
        if (c > 127)
            return -1;

        /* Normal character */
        if (c != '\\') {
            count++;
            continue;
        }

        /* Backslash-escape: it must be followed by another ASCII byte. */
        if (cur >= stop)
            return -1;
        c = *cur++;
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f':
        case 'n': case 'r': case 't': case 'v':
            /* simple escapes yield exactly one character */
            count += 1;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'N': case 'U': case 'u': case 'x':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: count the backslash + the other character */
            count += 2;
            break;
        }
    }
    return count;
}
5689
5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5691
5692PyObject *
5693PyUnicode_DecodeUnicodeEscape(const char *s,
5694                              Py_ssize_t size,
5695                              const char *errors)
5696{
5697    const char *starts = s;
5698    Py_ssize_t startinpos;
5699    Py_ssize_t endinpos;
5700    _PyUnicodeWriter writer;
5701    const char *end;
5702    char* message;
5703    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5704    PyObject *errorHandler = NULL;
5705    PyObject *exc = NULL;
5706    Py_ssize_t len;
5707
5708    len = length_of_escaped_ascii_string(s, size);
5709    if (len == 0)
5710        _Py_RETURN_UNICODE_EMPTY();
5711
5712    /* After length_of_escaped_ascii_string() there are two alternatives,
5713       either the string is pure ASCII with named escapes like \n, etc.
5714       and we determined it's exact size (common case)
5715       or it contains \x, \u, ... escape sequences.  then we create a
5716       legacy wchar string and resize it at the end of this function. */
5717    _PyUnicodeWriter_Init(&writer);
5718    if (len > 0) {
5719        writer.min_length = len;
5720    }
5721    else {
5722        /* Escaped strings will always be longer than the resulting
5723           Unicode string, so we start with size here and then reduce the
5724           length after conversion to the true value.
5725           (but if the error callback returns a long replacement string
5726           we'll have to allocate more space) */
5727        writer.min_length = size;
5728    }
5729
5730    if (size == 0)
5731        return _PyUnicodeWriter_Finish(&writer);
5732    end = s + size;
5733
5734    while (s < end) {
5735        unsigned char c;
5736        Py_UCS4 x;
5737        int digits;
5738
5739        /* Non-escape characters are interpreted as Unicode ordinals */
5740        if (*s != '\\') {
5741            x = (unsigned char)*s;
5742            s++;
5743            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5744                goto onError;
5745            continue;
5746        }
5747
5748        startinpos = s-starts;
5749        /* \ - Escapes */
5750        s++;
5751        c = *s++;
5752        if (s > end)
5753            c = '\0'; /* Invalid after \ */
5754
5755        switch (c) {
5756
5757            /* \x escapes */
5758#define WRITECHAR(ch)                                                      \
5759            do {                                                           \
5760                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5761                    goto onError;                                          \
5762            } while(0)
5763
5764        case '\n': break;
5765        case '\\': WRITECHAR('\\'); break;
5766        case '\'': WRITECHAR('\''); break;
5767        case '\"': WRITECHAR('\"'); break;
5768        case 'b': WRITECHAR('\b'); break;
5769        /* FF */
5770        case 'f': WRITECHAR('\014'); break;
5771        case 't': WRITECHAR('\t'); break;
5772        case 'n': WRITECHAR('\n'); break;
5773        case 'r': WRITECHAR('\r'); break;
5774        /* VT */
5775        case 'v': WRITECHAR('\013'); break;
5776        /* BEL, not classic C */
5777        case 'a': WRITECHAR('\007'); break;
5778
5779            /* \OOO (octal) escapes */
5780        case '0': case '1': case '2': case '3':
5781        case '4': case '5': case '6': case '7':
5782            x = s[-1] - '0';
5783            if (s < end && '0' <= *s && *s <= '7') {
5784                x = (x<<3) + *s++ - '0';
5785                if (s < end && '0' <= *s && *s <= '7')
5786                    x = (x<<3) + *s++ - '0';
5787            }
5788            WRITECHAR(x);
5789            break;
5790
5791            /* hex escapes */
5792            /* \xXX */
5793        case 'x':
5794            digits = 2;
5795            message = "truncated \\xXX escape";
5796            goto hexescape;
5797
5798            /* \uXXXX */
5799        case 'u':
5800            digits = 4;
5801            message = "truncated \\uXXXX escape";
5802            goto hexescape;
5803
5804            /* \UXXXXXXXX */
5805        case 'U':
5806            digits = 8;
5807            message = "truncated \\UXXXXXXXX escape";
5808        hexescape:
5809            chr = 0;
5810            if (end - s < digits) {
5811                /* count only hex digits */
5812                for (; s < end; ++s) {
5813                    c = (unsigned char)*s;
5814                    if (!Py_ISXDIGIT(c))
5815                        goto error;
5816                }
5817                goto error;
5818            }
5819            for (; digits--; ++s) {
5820                c = (unsigned char)*s;
5821                if (!Py_ISXDIGIT(c))
5822                    goto error;
5823                chr = (chr<<4) & ~0xF;
5824                if (c >= '0' && c <= '9')
5825                    chr += c - '0';
5826                else if (c >= 'a' && c <= 'f')
5827                    chr += 10 + c - 'a';
5828                else
5829                    chr += 10 + c - 'A';
5830            }
5831            if (chr == 0xffffffff && PyErr_Occurred())
5832                /* _decoding_error will have already written into the
5833                   target buffer. */
5834                break;
5835        store:
5836            /* when we get here, chr is a 32-bit unicode character */
5837            message = "illegal Unicode character";
5838            if (chr > MAX_UNICODE)
5839                goto error;
5840            WRITECHAR(chr);
5841            break;
5842
5843            /* \N{name} */
5844        case 'N':
5845            message = "malformed \\N character escape";
5846            if (ucnhash_CAPI == NULL) {
5847                /* load the unicode data module */
5848                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5849                                                PyUnicodeData_CAPSULE_NAME, 1);
5850                if (ucnhash_CAPI == NULL)
5851                    goto ucnhashError;
5852            }
5853            if (*s == '{') {
5854                const char *start = s+1;
5855                /* look for the closing brace */
5856                while (*s != '}' && s < end)
5857                    s++;
5858                if (s > start && s < end && *s == '}') {
5859                    /* found a name.  look it up in the unicode database */
5860                    message = "unknown Unicode character name";
5861                    s++;
5862                    if (s - start - 1 <= INT_MAX &&
5863                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5864                                              &chr, 0))
5865                        goto store;
5866                }
5867            }
5868            goto error;
5869
5870        default:
5871            if (s > end) {
5872                message = "\\ at end of string";
5873                s--;
5874                goto error;
5875            }
5876            else {
5877                WRITECHAR('\\');
5878                WRITECHAR((unsigned char)s[-1]);
5879            }
5880            break;
5881        }
5882        continue;
5883
5884      error:
5885        endinpos = s-starts;
5886        if (unicode_decode_call_errorhandler_writer(
5887                errors, &errorHandler,
5888                "unicodeescape", message,
5889                &starts, &end, &startinpos, &endinpos, &exc, &s,
5890                &writer))
5891            goto onError;
5892        continue;
5893    }
5894#undef WRITECHAR
5895
5896    Py_XDECREF(errorHandler);
5897    Py_XDECREF(exc);
5898    return _PyUnicodeWriter_Finish(&writer);
5899
5900  ucnhashError:
5901    PyErr_SetString(
5902        PyExc_UnicodeError,
5903        "\\N escapes not supported (can't load unicodedata module)"
5904        );
5905    _PyUnicodeWriter_Dealloc(&writer);
5906    Py_XDECREF(errorHandler);
5907    Py_XDECREF(exc);
5908    return NULL;
5909
5910  onError:
5911    _PyUnicodeWriter_Dealloc(&writer);
5912    Py_XDECREF(errorHandler);
5913    Py_XDECREF(exc);
5914    return NULL;
5915}
5916
5917/* Return a Unicode-Escape string version of the Unicode object.
5918
5919   If quotes is true, the string is enclosed in u"" or u'' quotes as
5920   appropriate.
5921
5922*/
5923
5924PyObject *
5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5926{
5927    Py_ssize_t i, len;
5928    PyObject *repr;
5929    char *p;
5930    int kind;
5931    void *data;
5932    Py_ssize_t expandsize = 0;
5933
5934    /* Initial allocation is based on the longest-possible character
5935       escape.
5936
5937       For UCS1 strings it's '\xxx', 4 bytes per source character.
5938       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5939       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5940    */
5941
5942    if (!PyUnicode_Check(unicode)) {
5943        PyErr_BadArgument();
5944        return NULL;
5945    }
5946    if (PyUnicode_READY(unicode) == -1)
5947        return NULL;
5948    len = PyUnicode_GET_LENGTH(unicode);
5949    kind = PyUnicode_KIND(unicode);
5950    data = PyUnicode_DATA(unicode);
5951    switch (kind) {
5952    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5953    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5954    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5955    }
5956
5957    if (len == 0)
5958        return PyBytes_FromStringAndSize(NULL, 0);
5959
5960    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5961        return PyErr_NoMemory();
5962
5963    repr = PyBytes_FromStringAndSize(NULL,
5964                                     2
5965                                     + expandsize*len
5966                                     + 1);
5967    if (repr == NULL)
5968        return NULL;
5969
5970    p = PyBytes_AS_STRING(repr);
5971
5972    for (i = 0; i < len; i++) {
5973        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5974
5975        /* Escape backslashes */
5976        if (ch == '\\') {
5977            *p++ = '\\';
5978            *p++ = (char) ch;
5979            continue;
5980        }
5981
5982        /* Map 21-bit characters to '\U00xxxxxx' */
5983        else if (ch >= 0x10000) {
5984            assert(ch <= MAX_UNICODE);
5985            *p++ = '\\';
5986            *p++ = 'U';
5987            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5988            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5989            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5990            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5991            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5992            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5993            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5994            *p++ = Py_hexdigits[ch & 0x0000000F];
5995            continue;
5996        }
5997
5998        /* Map 16-bit characters to '\uxxxx' */
5999        if (ch >= 256) {
6000            *p++ = '\\';
6001            *p++ = 'u';
6002            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6003            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6004            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6005            *p++ = Py_hexdigits[ch & 0x000F];
6006        }
6007
6008        /* Map special whitespace to '\t', \n', '\r' */
6009        else if (ch == '\t') {
6010            *p++ = '\\';
6011            *p++ = 't';
6012        }
6013        else if (ch == '\n') {
6014            *p++ = '\\';
6015            *p++ = 'n';
6016        }
6017        else if (ch == '\r') {
6018            *p++ = '\\';
6019            *p++ = 'r';
6020        }
6021
6022        /* Map non-printable US ASCII to '\xhh' */
6023        else if (ch < ' ' || ch >= 0x7F) {
6024            *p++ = '\\';
6025            *p++ = 'x';
6026            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6027            *p++ = Py_hexdigits[ch & 0x000F];
6028        }
6029
6030        /* Copy everything else as-is */
6031        else
6032            *p++ = (char) ch;
6033    }
6034
6035    assert(p - PyBytes_AS_STRING(repr) > 0);
6036    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037        return NULL;
6038    return repr;
6039}
6040
6041PyObject *
6042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6043                              Py_ssize_t size)
6044{
6045    PyObject *result;
6046    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6047    if (tmp == NULL)
6048        return NULL;
6049    result = PyUnicode_AsUnicodeEscapeString(tmp);
6050    Py_DECREF(tmp);
6051    return result;
6052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
6056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
6058                                 Py_ssize_t size,
6059                                 const char *errors)
6060{
6061    const char *starts = s;
6062    Py_ssize_t startinpos;
6063    Py_ssize_t endinpos;
6064    _PyUnicodeWriter writer;
6065    const char *end;
6066    const char *bs;
6067    PyObject *errorHandler = NULL;
6068    PyObject *exc = NULL;
6069
6070    if (size == 0)
6071        _Py_RETURN_UNICODE_EMPTY();
6072
6073    /* Escaped strings will always be longer than the resulting
6074       Unicode string, so we start with size here and then reduce the
6075       length after conversion to the true value. (But decoding error
6076       handler might have to resize the string) */
6077    _PyUnicodeWriter_Init(&writer);
6078    writer.min_length = size;
6079
6080    end = s + size;
6081    while (s < end) {
6082        unsigned char c;
6083        Py_UCS4 x;
6084        int i;
6085        int count;
6086
6087        /* Non-escape characters are interpreted as Unicode ordinals */
6088        if (*s != '\\') {
6089            x = (unsigned char)*s++;
6090            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6091                goto onError;
6092            continue;
6093        }
6094        startinpos = s-starts;
6095
6096        /* \u-escapes are only interpreted iff the number of leading
6097           backslashes if odd */
6098        bs = s;
6099        for (;s < end;) {
6100            if (*s != '\\')
6101                break;
6102            x = (unsigned char)*s++;
6103            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6104                goto onError;
6105        }
6106        if (((s - bs) & 1) == 0 ||
6107            s >= end ||
6108            (*s != 'u' && *s != 'U')) {
6109            continue;
6110        }
6111        writer.pos--;
6112        count = *s=='u' ? 4 : 8;
6113        s++;
6114
6115        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6116        for (x = 0, i = 0; i < count; ++i, ++s) {
6117            c = (unsigned char)*s;
6118            if (!Py_ISXDIGIT(c)) {
6119                endinpos = s-starts;
6120                if (unicode_decode_call_errorhandler_writer(
6121                        errors, &errorHandler,
6122                        "rawunicodeescape", "truncated \\uXXXX",
6123                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6124                        &writer))
6125                    goto onError;
6126                goto nextByte;
6127            }
6128            x = (x<<4) & ~0xF;
6129            if (c >= '0' && c <= '9')
6130                x += c - '0';
6131            else if (c >= 'a' && c <= 'f')
6132                x += 10 + c - 'a';
6133            else
6134                x += 10 + c - 'A';
6135        }
6136        if (x <= MAX_UNICODE) {
6137            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6138                goto onError;
6139        }
6140        else {
6141            endinpos = s-starts;
6142            if (unicode_decode_call_errorhandler_writer(
6143                    errors, &errorHandler,
6144                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6145                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6146                    &writer))
6147                goto onError;
6148        }
6149      nextByte:
6150        ;
6151    }
6152    Py_XDECREF(errorHandler);
6153    Py_XDECREF(exc);
6154    return _PyUnicodeWriter_Finish(&writer);
6155
6156  onError:
6157    _PyUnicodeWriter_Dealloc(&writer);
6158    Py_XDECREF(errorHandler);
6159    Py_XDECREF(exc);
6160    return NULL;
6161}
6162
6163
6164PyObject *
6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6166{
6167    PyObject *repr;
6168    char *p;
6169    char *q;
6170    Py_ssize_t expandsize, pos;
6171    int kind;
6172    void *data;
6173    Py_ssize_t len;
6174
6175    if (!PyUnicode_Check(unicode)) {
6176        PyErr_BadArgument();
6177        return NULL;
6178    }
6179    if (PyUnicode_READY(unicode) == -1)
6180        return NULL;
6181    kind = PyUnicode_KIND(unicode);
6182    data = PyUnicode_DATA(unicode);
6183    len = PyUnicode_GET_LENGTH(unicode);
6184    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6185       bytes, and 1 byte characters 4. */
6186    expandsize = kind * 2 + 2;
6187
6188    if (len > PY_SSIZE_T_MAX / expandsize)
6189        return PyErr_NoMemory();
6190
6191    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6192    if (repr == NULL)
6193        return NULL;
6194    if (len == 0)
6195        return repr;
6196
6197    p = q = PyBytes_AS_STRING(repr);
6198    for (pos = 0; pos < len; pos++) {
6199        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6200        /* Map 32-bit characters to '\Uxxxxxxxx' */
6201        if (ch >= 0x10000) {
6202            assert(ch <= MAX_UNICODE);
6203            *p++ = '\\';
6204            *p++ = 'U';
6205            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6206            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6207            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6208            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6209            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212            *p++ = Py_hexdigits[ch & 15];
6213        }
6214        /* Map 16-bit characters to '\uxxxx' */
6215        else if (ch >= 256) {
6216            *p++ = '\\';
6217            *p++ = 'u';
6218            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6219            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6220            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6221            *p++ = Py_hexdigits[ch & 15];
6222        }
6223        /* Copy everything else as-is */
6224        else
6225            *p++ = (char) ch;
6226    }
6227
6228    assert(p > q);
6229    if (_PyBytes_Resize(&repr, p - q) < 0)
6230        return NULL;
6231    return repr;
6232}
6233
6234PyObject *
6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6236                                 Py_ssize_t size)
6237{
6238    PyObject *result;
6239    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6240    if (tmp == NULL)
6241        return NULL;
6242    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6243    Py_DECREF(tmp);
6244    return result;
6245}
6246
6247/* --- Unicode Internal Codec ------------------------------------------- */
6248
6249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
6251                                 Py_ssize_t size,
6252                                 const char *errors)
6253{
6254    const char *starts = s;
6255    Py_ssize_t startinpos;
6256    Py_ssize_t endinpos;
6257    _PyUnicodeWriter writer;
6258    const char *end;
6259    const char *reason;
6260    PyObject *errorHandler = NULL;
6261    PyObject *exc = NULL;
6262
6263    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6264                     "unicode_internal codec has been deprecated",
6265                     1))
6266        return NULL;
6267
6268    if (size == 0)
6269        _Py_RETURN_UNICODE_EMPTY();
6270
6271    _PyUnicodeWriter_Init(&writer);
6272    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6273        PyErr_NoMemory();
6274        goto onError;
6275    }
6276    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6277
6278    end = s + size;
6279    while (s < end) {
6280        Py_UNICODE uch;
6281        Py_UCS4 ch;
6282        if (end - s < Py_UNICODE_SIZE) {
6283            endinpos = end-starts;
6284            reason = "truncated input";
6285            goto error;
6286        }
6287        /* We copy the raw representation one byte at a time because the
6288           pointer may be unaligned (see test_codeccallbacks). */
6289        ((char *) &uch)[0] = s[0];
6290        ((char *) &uch)[1] = s[1];
6291#ifdef Py_UNICODE_WIDE
6292        ((char *) &uch)[2] = s[2];
6293        ((char *) &uch)[3] = s[3];
6294#endif
6295        ch = uch;
6296#ifdef Py_UNICODE_WIDE
6297        /* We have to sanity check the raw data, otherwise doom looms for
6298           some malformed UCS-4 data. */
6299        if (ch > 0x10ffff) {
6300            endinpos = s - starts + Py_UNICODE_SIZE;
6301            reason = "illegal code point (> 0x10FFFF)";
6302            goto error;
6303        }
6304#endif
6305        s += Py_UNICODE_SIZE;
6306#ifndef Py_UNICODE_WIDE
6307        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6308        {
6309            Py_UNICODE uch2;
6310            ((char *) &uch2)[0] = s[0];
6311            ((char *) &uch2)[1] = s[1];
6312            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6313            {
6314                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6315                s += Py_UNICODE_SIZE;
6316            }
6317        }
6318#endif
6319
6320        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6321            goto onError;
6322        continue;
6323
6324  error:
6325        startinpos = s - starts;
6326        if (unicode_decode_call_errorhandler_writer(
6327                errors, &errorHandler,
6328                "unicode_internal", reason,
6329                &starts, &end, &startinpos, &endinpos, &exc, &s,
6330                &writer))
6331            goto onError;
6332    }
6333
6334    Py_XDECREF(errorHandler);
6335    Py_XDECREF(exc);
6336    return _PyUnicodeWriter_Finish(&writer);
6337
6338  onError:
6339    _PyUnicodeWriter_Dealloc(&writer);
6340    Py_XDECREF(errorHandler);
6341    Py_XDECREF(exc);
6342    return NULL;
6343}
6344
6345/* --- Latin-1 Codec ------------------------------------------------------ */
6346
6347PyObject *
6348PyUnicode_DecodeLatin1(const char *s,
6349                       Py_ssize_t size,
6350                       const char *errors)
6351{
6352    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6353    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6354}
6355
6356/* create or adjust a UnicodeEncodeError */
6357static void
6358make_encode_exception(PyObject **exceptionObject,
6359                      const char *encoding,
6360                      PyObject *unicode,
6361                      Py_ssize_t startpos, Py_ssize_t endpos,
6362                      const char *reason)
6363{
6364    if (*exceptionObject == NULL) {
6365        *exceptionObject = PyObject_CallFunction(
6366            PyExc_UnicodeEncodeError, "sOnns",
6367            encoding, unicode, startpos, endpos, reason);
6368    }
6369    else {
6370        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6371            goto onError;
6372        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6373            goto onError;
6374        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6375            goto onError;
6376        return;
6377      onError:
6378        Py_CLEAR(*exceptionObject);
6379    }
6380}
6381
6382/* raises a UnicodeEncodeError */
6383static void
6384raise_encode_exception(PyObject **exceptionObject,
6385                       const char *encoding,
6386                       PyObject *unicode,
6387                       Py_ssize_t startpos, Py_ssize_t endpos,
6388                       const char *reason)
6389{
6390    make_encode_exception(exceptionObject,
6391                          encoding, unicode, startpos, endpos, reason);
6392    if (*exceptionObject != NULL)
6393        PyCodec_StrictErrors(*exceptionObject);
6394}
6395
6396/* error handling callback helper:
6397   build arguments, call the callback and check the arguments,
6398   put the result into newpos and return the replacement string, which
6399   has to be freed by the caller */
6400static PyObject *
6401unicode_encode_call_errorhandler(const char *errors,
6402                                 PyObject **errorHandler,
6403                                 const char *encoding, const char *reason,
6404                                 PyObject *unicode, PyObject **exceptionObject,
6405                                 Py_ssize_t startpos, Py_ssize_t endpos,
6406                                 Py_ssize_t *newpos)
6407{
6408    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6409    Py_ssize_t len;
6410    PyObject *restuple;
6411    PyObject *resunicode;
6412
6413    if (*errorHandler == NULL) {
6414        *errorHandler = PyCodec_LookupError(errors);
6415        if (*errorHandler == NULL)
6416            return NULL;
6417    }
6418
6419    if (PyUnicode_READY(unicode) == -1)
6420        return NULL;
6421    len = PyUnicode_GET_LENGTH(unicode);
6422
6423    make_encode_exception(exceptionObject,
6424                          encoding, unicode, startpos, endpos, reason);
6425    if (*exceptionObject == NULL)
6426        return NULL;
6427
6428    restuple = PyObject_CallFunctionObjArgs(
6429        *errorHandler, *exceptionObject, NULL);
6430    if (restuple == NULL)
6431        return NULL;
6432    if (!PyTuple_Check(restuple)) {
6433        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6434        Py_DECREF(restuple);
6435        return NULL;
6436    }
6437    if (!PyArg_ParseTuple(restuple, argparse,
6438                          &resunicode, newpos)) {
6439        Py_DECREF(restuple);
6440        return NULL;
6441    }
6442    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6443        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6444        Py_DECREF(restuple);
6445        return NULL;
6446    }
6447    if (*newpos<0)
6448        *newpos = len + *newpos;
6449    if (*newpos<0 || *newpos>len) {
6450        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6451        Py_DECREF(restuple);
6452        return NULL;
6453    }
6454    Py_INCREF(resunicode);
6455    Py_DECREF(restuple);
6456    return resunicode;
6457}
6458
6459static PyObject *
6460unicode_encode_ucs1(PyObject *unicode,
6461                    const char *errors,
6462                    unsigned int limit)
6463{
6464    /* input state */
6465    Py_ssize_t pos=0, size;
6466    int kind;
6467    void *data;
6468    /* output object */
6469    PyObject *res;
6470    /* pointer into the output */
6471    char *str;
6472    /* current output position */
6473    Py_ssize_t ressize;
6474    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6475    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6476    PyObject *errorHandler = NULL;
6477    PyObject *exc = NULL;
6478    /* the following variable is used for caching string comparisons
6479     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6480    int known_errorHandler = -1;
6481
6482    if (PyUnicode_READY(unicode) == -1)
6483        return NULL;
6484    size = PyUnicode_GET_LENGTH(unicode);
6485    kind = PyUnicode_KIND(unicode);
6486    data = PyUnicode_DATA(unicode);
6487    /* allocate enough for a simple encoding without
6488       replacements, if we need more, we'll resize */
6489    if (size == 0)
6490        return PyBytes_FromStringAndSize(NULL, 0);
6491    res = PyBytes_FromStringAndSize(NULL, size);
6492    if (res == NULL)
6493        return NULL;
6494    str = PyBytes_AS_STRING(res);
6495    ressize = size;
6496
6497    while (pos < size) {
6498        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6499
6500        /* can we encode this? */
6501        if (c<limit) {
6502            /* no overflow check, because we know that the space is enough */
6503            *str++ = (char)c;
6504            ++pos;
6505        }
6506        else {
6507            Py_ssize_t requiredsize;
6508            PyObject *repunicode;
6509            Py_ssize_t repsize, newpos, respos, i;
6510            /* startpos for collecting unencodable chars */
6511            Py_ssize_t collstart = pos;
6512            Py_ssize_t collend = pos;
6513            /* find all unecodable characters */
6514            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6515                ++collend;
6516            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6517            if (known_errorHandler==-1) {
6518                if ((errors==NULL) || (!strcmp(errors, "strict")))
6519                    known_errorHandler = 1;
6520                else if (!strcmp(errors, "replace"))
6521                    known_errorHandler = 2;
6522                else if (!strcmp(errors, "ignore"))
6523                    known_errorHandler = 3;
6524                else if (!strcmp(errors, "xmlcharrefreplace"))
6525                    known_errorHandler = 4;
6526                else
6527                    known_errorHandler = 0;
6528            }
6529            switch (known_errorHandler) {
6530            case 1: /* strict */
6531                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6532                goto onError;
6533            case 2: /* replace */
6534                while (collstart++ < collend)
6535                    *str++ = '?'; /* fall through */
6536            case 3: /* ignore */
6537                pos = collend;
6538                break;
6539            case 4: /* xmlcharrefreplace */
6540                respos = str - PyBytes_AS_STRING(res);
6541                requiredsize = respos;
6542                /* determine replacement size */
6543                for (i = collstart; i < collend; ++i) {
6544                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6545                    Py_ssize_t incr;
6546                    if (ch < 10)
6547                        incr = 2+1+1;
6548                    else if (ch < 100)
6549                        incr = 2+2+1;
6550                    else if (ch < 1000)
6551                        incr = 2+3+1;
6552                    else if (ch < 10000)
6553                        incr = 2+4+1;
6554                    else if (ch < 100000)
6555                        incr = 2+5+1;
6556                    else if (ch < 1000000)
6557                        incr = 2+6+1;
6558                    else {
6559                        assert(ch <= MAX_UNICODE);
6560                        incr = 2+7+1;
6561                    }
6562                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6563                        goto overflow;
6564                    requiredsize += incr;
6565                }
6566                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6567                    goto overflow;
6568                requiredsize += size - collend;
6569                if (requiredsize > ressize) {
6570                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6571                        requiredsize = 2*ressize;
6572                    if (_PyBytes_Resize(&res, requiredsize))
6573                        goto onError;
6574                    str = PyBytes_AS_STRING(res) + respos;
6575                    ressize = requiredsize;
6576                }
6577                /* generate replacement */
6578                for (i = collstart; i < collend; ++i) {
6579                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6580                }
6581                pos = collend;
6582                break;
6583            default:
6584                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6585                                                              encoding, reason, unicode, &exc,
6586                                                              collstart, collend, &newpos);
6587                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6588                                           PyUnicode_READY(repunicode) == -1))
6589                    goto onError;
6590                if (PyBytes_Check(repunicode)) {
6591                    /* Directly copy bytes result to output. */
6592                    repsize = PyBytes_Size(repunicode);
6593                    if (repsize > 1) {
6594                        /* Make room for all additional bytes. */
6595                        respos = str - PyBytes_AS_STRING(res);
6596                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6597                            Py_DECREF(repunicode);
6598                            goto overflow;
6599                        }
6600                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6601                            Py_DECREF(repunicode);
6602                            goto onError;
6603                        }
6604                        str = PyBytes_AS_STRING(res) + respos;
6605                        ressize += repsize-1;
6606                    }
6607                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6608                    str += repsize;
6609                    pos = newpos;
6610                    Py_DECREF(repunicode);
6611                    break;
6612                }
6613                /* need more space? (at least enough for what we
6614                   have+the replacement+the rest of the string, so
6615                   we won't have to check space for encodable characters) */
6616                respos = str - PyBytes_AS_STRING(res);
6617                repsize = PyUnicode_GET_LENGTH(repunicode);
6618                requiredsize = respos;
6619                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6620                    goto overflow;
6621                requiredsize += repsize;
6622                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6623                    goto overflow;
6624                requiredsize += size - collend;
6625                if (requiredsize > ressize) {
6626                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6627                        requiredsize = 2*ressize;
6628                    if (_PyBytes_Resize(&res, requiredsize)) {
6629                        Py_DECREF(repunicode);
6630                        goto onError;
6631                    }
6632                    str = PyBytes_AS_STRING(res) + respos;
6633                    ressize = requiredsize;
6634                }
6635                /* check if there is anything unencodable in the replacement
6636                   and copy it to the output */
6637                for (i = 0; repsize-->0; ++i, ++str) {
6638                    c = PyUnicode_READ_CHAR(repunicode, i);
6639                    if (c >= limit) {
6640                        raise_encode_exception(&exc, encoding, unicode,
6641                                               pos, pos+1, reason);
6642                        Py_DECREF(repunicode);
6643                        goto onError;
6644                    }
6645                    *str = (char)c;
6646                }
6647                pos = newpos;
6648                Py_DECREF(repunicode);
6649            }
6650        }
6651    }
6652    /* Resize if we allocated to much */
6653    size = str - PyBytes_AS_STRING(res);
6654    if (size < ressize) { /* If this falls res will be NULL */
6655        assert(size >= 0);
6656        if (_PyBytes_Resize(&res, size) < 0)
6657            goto onError;
6658    }
6659
6660    Py_XDECREF(errorHandler);
6661    Py_XDECREF(exc);
6662    return res;
6663
6664  overflow:
6665    PyErr_SetString(PyExc_OverflowError,
6666                    "encoded result is too long for a Python string");
6667
6668  onError:
6669    Py_XDECREF(res);
6670    Py_XDECREF(errorHandler);
6671    Py_XDECREF(exc);
6672    return NULL;
6673}
6674
6675/* Deprecated */
6676PyObject *
6677PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6678                       Py_ssize_t size,
6679                       const char *errors)
6680{
6681    PyObject *result;
6682    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6683    if (unicode == NULL)
6684        return NULL;
6685    result = unicode_encode_ucs1(unicode, errors, 256);
6686    Py_DECREF(unicode);
6687    return result;
6688}
6689
6690PyObject *
6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6692{
6693    if (!PyUnicode_Check(unicode)) {
6694        PyErr_BadArgument();
6695        return NULL;
6696    }
6697    if (PyUnicode_READY(unicode) == -1)
6698        return NULL;
6699    /* Fast path: if it is a one-byte string, construct
6700       bytes object directly. */
6701    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6702        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6703                                         PyUnicode_GET_LENGTH(unicode));
6704    /* Non-Latin-1 characters present. Defer to above function to
6705       raise the exception. */
6706    return unicode_encode_ucs1(unicode, errors, 256);
6707}
6708
6709PyObject*
6710PyUnicode_AsLatin1String(PyObject *unicode)
6711{
6712    return _PyUnicode_AsLatin1String(unicode, NULL);
6713}
6714
6715/* --- 7-bit ASCII Codec -------------------------------------------------- */
6716
6717PyObject *
6718PyUnicode_DecodeASCII(const char *s,
6719                      Py_ssize_t size,
6720                      const char *errors)
6721{
6722    const char *starts = s;
6723    _PyUnicodeWriter writer;
6724    int kind;
6725    void *data;
6726    Py_ssize_t startinpos;
6727    Py_ssize_t endinpos;
6728    Py_ssize_t outpos;
6729    const char *e;
6730    PyObject *errorHandler = NULL;
6731    PyObject *exc = NULL;
6732
6733    if (size == 0)
6734        _Py_RETURN_UNICODE_EMPTY();
6735
6736    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6737    if (size == 1 && (unsigned char)s[0] < 128)
6738        return get_latin1_char((unsigned char)s[0]);
6739
6740    _PyUnicodeWriter_Init(&writer);
6741    writer.min_length = size;
6742    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6743        return NULL;
6744
6745    e = s + size;
6746    data = writer.data;
6747    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6748    writer.pos = outpos;
6749    if (writer.pos == size)
6750        return _PyUnicodeWriter_Finish(&writer);
6751
6752    s += writer.pos;
6753    kind = writer.kind;
6754    while (s < e) {
6755        unsigned char c = (unsigned char)*s;
6756        if (c < 128) {
6757            PyUnicode_WRITE(kind, data, writer.pos, c);
6758            writer.pos++;
6759            ++s;
6760        }
6761        else {
6762            startinpos = s-starts;
6763            endinpos = startinpos + 1;
6764            if (unicode_decode_call_errorhandler_writer(
6765                    errors, &errorHandler,
6766                    "ascii", "ordinal not in range(128)",
6767                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6768                    &writer))
6769                goto onError;
6770            kind = writer.kind;
6771            data = writer.data;
6772        }
6773    }
6774    Py_XDECREF(errorHandler);
6775    Py_XDECREF(exc);
6776    return _PyUnicodeWriter_Finish(&writer);
6777
6778  onError:
6779    _PyUnicodeWriter_Dealloc(&writer);
6780    Py_XDECREF(errorHandler);
6781    Py_XDECREF(exc);
6782    return NULL;
6783}
6784
6785/* Deprecated */
6786PyObject *
6787PyUnicode_EncodeASCII(const Py_UNICODE *p,
6788                      Py_ssize_t size,
6789                      const char *errors)
6790{
6791    PyObject *result;
6792    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6793    if (unicode == NULL)
6794        return NULL;
6795    result = unicode_encode_ucs1(unicode, errors, 128);
6796    Py_DECREF(unicode);
6797    return result;
6798}
6799
6800PyObject *
6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6802{
6803    if (!PyUnicode_Check(unicode)) {
6804        PyErr_BadArgument();
6805        return NULL;
6806    }
6807    if (PyUnicode_READY(unicode) == -1)
6808        return NULL;
6809    /* Fast path: if it is an ASCII-only string, construct bytes object
6810       directly. Else defer to above function to raise the exception. */
6811    if (PyUnicode_IS_ASCII(unicode))
6812        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6813                                         PyUnicode_GET_LENGTH(unicode));
6814    return unicode_encode_ucs1(unicode, errors, 128);
6815}
6816
6817PyObject *
6818PyUnicode_AsASCIIString(PyObject *unicode)
6819{
6820    return _PyUnicode_AsASCIIString(unicode, NULL);
6821}
6822
6823#ifdef HAVE_MBCS
6824
6825/* --- MBCS codecs for Windows -------------------------------------------- */
6826
6827#if SIZEOF_INT < SIZEOF_SIZE_T
6828#define NEED_RETRY
6829#endif
6830
6831#ifndef WC_ERR_INVALID_CHARS
6832#  define WC_ERR_INVALID_CHARS 0x0080
6833#endif
6834
6835static char*
6836code_page_name(UINT code_page, PyObject **obj)
6837{
6838    *obj = NULL;
6839    if (code_page == CP_ACP)
6840        return "mbcs";
6841    if (code_page == CP_UTF7)
6842        return "CP_UTF7";
6843    if (code_page == CP_UTF8)
6844        return "CP_UTF8";
6845
6846    *obj = PyBytes_FromFormat("cp%u", code_page);
6847    if (*obj == NULL)
6848        return NULL;
6849    return PyBytes_AS_STRING(*obj);
6850}
6851
6852static DWORD
6853decode_code_page_flags(UINT code_page)
6854{
6855    if (code_page == CP_UTF7) {
6856        /* The CP_UTF7 decoder only supports flags=0 */
6857        return 0;
6858    }
6859    else
6860        return MB_ERR_INVALID_CHARS;
6861}
6862
6863/*
6864 * Decode a byte string from a Windows code page into unicode object in strict
6865 * mode.
6866 *
6867 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6868 * OSError and returns -1 on other error.
6869 */
6870static int
6871decode_code_page_strict(UINT code_page,
6872                        PyObject **v,
6873                        const char *in,
6874                        int insize)
6875{
6876    const DWORD flags = decode_code_page_flags(code_page);
6877    wchar_t *out;
6878    DWORD outsize;
6879
6880    /* First get the size of the result */
6881    assert(insize > 0);
6882    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6883    if (outsize <= 0)
6884        goto error;
6885
6886    if (*v == NULL) {
6887        /* Create unicode object */
6888        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6889        *v = (PyObject*)_PyUnicode_New(outsize);
6890        if (*v == NULL)
6891            return -1;
6892        out = PyUnicode_AS_UNICODE(*v);
6893    }
6894    else {
6895        /* Extend unicode object */
6896        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6897        if (unicode_resize(v, n + outsize) < 0)
6898            return -1;
6899        out = PyUnicode_AS_UNICODE(*v) + n;
6900    }
6901
6902    /* Do the conversion */
6903    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6904    if (outsize <= 0)
6905        goto error;
6906    return insize;
6907
6908error:
6909    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6910        return -2;
6911    PyErr_SetFromWindowsErr(0);
6912    return -1;
6913}
6914
6915/*
6916 * Decode a byte string from a code page into unicode object with an error
6917 * handler.
6918 *
6919 * Returns consumed size if succeed, or raise an OSError or
6920 * UnicodeDecodeError exception and returns -1 on error.
6921 */
6922static int
6923decode_code_page_errors(UINT code_page,
6924                        PyObject **v,
6925                        const char *in, const int size,
6926                        const char *errors, int final)
6927{
6928    const char *startin = in;
6929    const char *endin = in + size;
6930    const DWORD flags = decode_code_page_flags(code_page);
6931    /* Ideally, we should get reason from FormatMessage. This is the Windows
6932       2000 English version of the message. */
6933    const char *reason = "No mapping for the Unicode character exists "
6934                         "in the target code page.";
6935    /* each step cannot decode more than 1 character, but a character can be
6936       represented as a surrogate pair */
6937    wchar_t buffer[2], *startout, *out;
6938    int insize;
6939    Py_ssize_t outsize;
6940    PyObject *errorHandler = NULL;
6941    PyObject *exc = NULL;
6942    PyObject *encoding_obj = NULL;
6943    char *encoding;
6944    DWORD err;
6945    int ret = -1;
6946
6947    assert(size > 0);
6948
6949    encoding = code_page_name(code_page, &encoding_obj);
6950    if (encoding == NULL)
6951        return -1;
6952
6953    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6954        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6955           UnicodeDecodeError. */
6956        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6957        if (exc != NULL) {
6958            PyCodec_StrictErrors(exc);
6959            Py_CLEAR(exc);
6960        }
6961        goto error;
6962    }
6963
6964    if (*v == NULL) {
6965        /* Create unicode object */
6966        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6967            PyErr_NoMemory();
6968            goto error;
6969        }
6970        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6971        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6972        if (*v == NULL)
6973            goto error;
6974        startout = PyUnicode_AS_UNICODE(*v);
6975    }
6976    else {
6977        /* Extend unicode object */
6978        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6979        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6980            PyErr_NoMemory();
6981            goto error;
6982        }
6983        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6984            goto error;
6985        startout = PyUnicode_AS_UNICODE(*v) + n;
6986    }
6987
6988    /* Decode the byte string character per character */
6989    out = startout;
6990    while (in < endin)
6991    {
6992        /* Decode a character */
6993        insize = 1;
6994        do
6995        {
6996            outsize = MultiByteToWideChar(code_page, flags,
6997                                          in, insize,
6998                                          buffer, Py_ARRAY_LENGTH(buffer));
6999            if (outsize > 0)
7000                break;
7001            err = GetLastError();
7002            if (err != ERROR_NO_UNICODE_TRANSLATION
7003                && err != ERROR_INSUFFICIENT_BUFFER)
7004            {
7005                PyErr_SetFromWindowsErr(0);
7006                goto error;
7007            }
7008            insize++;
7009        }
7010        /* 4=maximum length of a UTF-8 sequence */
7011        while (insize <= 4 && (in + insize) <= endin);
7012
7013        if (outsize <= 0) {
7014            Py_ssize_t startinpos, endinpos, outpos;
7015
7016            /* last character in partial decode? */
7017            if (in + insize >= endin && !final)
7018                break;
7019
7020            startinpos = in - startin;
7021            endinpos = startinpos + 1;
7022            outpos = out - PyUnicode_AS_UNICODE(*v);
7023            if (unicode_decode_call_errorhandler_wchar(
7024                    errors, &errorHandler,
7025                    encoding, reason,
7026                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7027                    v, &outpos))
7028            {
7029                goto error;
7030            }
7031            out = PyUnicode_AS_UNICODE(*v) + outpos;
7032        }
7033        else {
7034            in += insize;
7035            memcpy(out, buffer, outsize * sizeof(wchar_t));
7036            out += outsize;
7037        }
7038    }
7039
7040    /* write a NUL character at the end */
7041    *out = 0;
7042
7043    /* Extend unicode object */
7044    outsize = out - startout;
7045    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7046    if (unicode_resize(v, outsize) < 0)
7047        goto error;
7048    /* (in - startin) <= size and size is an int */
7049    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7050
7051error:
7052    Py_XDECREF(encoding_obj);
7053    Py_XDECREF(errorHandler);
7054    Py_XDECREF(exc);
7055    return ret;
7056}
7057
7058static PyObject *
7059decode_code_page_stateful(int code_page,
7060                          const char *s, Py_ssize_t size,
7061                          const char *errors, Py_ssize_t *consumed)
7062{
7063    PyObject *v = NULL;
7064    int chunk_size, final, converted, done;
7065
7066    if (code_page < 0) {
7067        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7068        return NULL;
7069    }
7070
7071    if (consumed)
7072        *consumed = 0;
7073
7074    do
7075    {
7076#ifdef NEED_RETRY
7077        if (size > INT_MAX) {
7078            chunk_size = INT_MAX;
7079            final = 0;
7080            done = 0;
7081        }
7082        else
7083#endif
7084        {
7085            chunk_size = (int)size;
7086            final = (consumed == NULL);
7087            done = 1;
7088        }
7089
7090        if (chunk_size == 0 && done) {
7091            if (v != NULL)
7092                break;
7093            _Py_RETURN_UNICODE_EMPTY();
7094        }
7095
7096        converted = decode_code_page_strict(code_page, &v,
7097                                            s, chunk_size);
7098        if (converted == -2)
7099            converted = decode_code_page_errors(code_page, &v,
7100                                                s, chunk_size,
7101                                                errors, final);
7102        assert(converted != 0 || done);
7103
7104        if (converted < 0) {
7105            Py_XDECREF(v);
7106            return NULL;
7107        }
7108
7109        if (consumed)
7110            *consumed += converted;
7111
7112        s += converted;
7113        size -= converted;
7114    } while (!done);
7115
7116    return unicode_result(v);
7117}
7118
7119PyObject *
7120PyUnicode_DecodeCodePageStateful(int code_page,
7121                                 const char *s,
7122                                 Py_ssize_t size,
7123                                 const char *errors,
7124                                 Py_ssize_t *consumed)
7125{
7126    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7127}
7128
7129PyObject *
7130PyUnicode_DecodeMBCSStateful(const char *s,
7131                             Py_ssize_t size,
7132                             const char *errors,
7133                             Py_ssize_t *consumed)
7134{
7135    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7136}
7137
7138PyObject *
7139PyUnicode_DecodeMBCS(const char *s,
7140                     Py_ssize_t size,
7141                     const char *errors)
7142{
7143    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7144}
7145
7146static DWORD
7147encode_code_page_flags(UINT code_page, const char *errors)
7148{
7149    if (code_page == CP_UTF8) {
7150        if (winver.dwMajorVersion >= 6)
7151            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7152               and later */
7153            return WC_ERR_INVALID_CHARS;
7154        else
7155            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7156            return 0;
7157    }
7158    else if (code_page == CP_UTF7) {
7159        /* CP_UTF7 only supports flags=0 */
7160        return 0;
7161    }
7162    else {
7163        if (errors != NULL && strcmp(errors, "replace") == 0)
7164            return 0;
7165        else
7166            return WC_NO_BEST_FIT_CHARS;
7167    }
7168}
7169
7170/*
7171 * Encode a Unicode string to a Windows code page into a byte string in strict
7172 * mode.
7173 *
7174 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7175 * an OSError and returns -1 on other error.
7176 */
7177static int
7178encode_code_page_strict(UINT code_page, PyObject **outbytes,
7179                        PyObject *unicode, Py_ssize_t offset, int len,
7180                        const char* errors)
7181{
7182    BOOL usedDefaultChar = FALSE;
7183    BOOL *pusedDefaultChar = &usedDefaultChar;
7184    int outsize;
7185    PyObject *exc = NULL;
7186    wchar_t *p;
7187    Py_ssize_t size;
7188    const DWORD flags = encode_code_page_flags(code_page, NULL);
7189    char *out;
7190    /* Create a substring so that we can get the UTF-16 representation
7191       of just the slice under consideration. */
7192    PyObject *substring;
7193
7194    assert(len > 0);
7195
7196    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7197        pusedDefaultChar = &usedDefaultChar;
7198    else
7199        pusedDefaultChar = NULL;
7200
7201    substring = PyUnicode_Substring(unicode, offset, offset+len);
7202    if (substring == NULL)
7203        return -1;
7204    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7205    if (p == NULL) {
7206        Py_DECREF(substring);
7207        return -1;
7208    }
7209    assert(size <= INT_MAX);
7210
7211    /* First get the size of the result */
7212    outsize = WideCharToMultiByte(code_page, flags,
7213                                  p, (int)size,
7214                                  NULL, 0,
7215                                  NULL, pusedDefaultChar);
7216    if (outsize <= 0)
7217        goto error;
7218    /* If we used a default char, then we failed! */
7219    if (pusedDefaultChar && *pusedDefaultChar) {
7220        Py_DECREF(substring);
7221        return -2;
7222    }
7223
7224    if (*outbytes == NULL) {
7225        /* Create string object */
7226        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7227        if (*outbytes == NULL) {
7228            Py_DECREF(substring);
7229            return -1;
7230        }
7231        out = PyBytes_AS_STRING(*outbytes);
7232    }
7233    else {
7234        /* Extend string object */
7235        const Py_ssize_t n = PyBytes_Size(*outbytes);
7236        if (outsize > PY_SSIZE_T_MAX - n) {
7237            PyErr_NoMemory();
7238            Py_DECREF(substring);
7239            return -1;
7240        }
7241        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7242            Py_DECREF(substring);
7243            return -1;
7244        }
7245        out = PyBytes_AS_STRING(*outbytes) + n;
7246    }
7247
7248    /* Do the conversion */
7249    outsize = WideCharToMultiByte(code_page, flags,
7250                                  p, (int)size,
7251                                  out, outsize,
7252                                  NULL, pusedDefaultChar);
7253    Py_CLEAR(substring);
7254    if (outsize <= 0)
7255        goto error;
7256    if (pusedDefaultChar && *pusedDefaultChar)
7257        return -2;
7258    return 0;
7259
7260error:
7261    Py_XDECREF(substring);
7262    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7263        return -2;
7264    PyErr_SetFromWindowsErr(0);
7265    return -1;
7266}
7267
7268/*
7269 * Encode a Unicode string to a Windows code page into a byte string using a
7270 * error handler.
7271 *
7272 * Returns consumed characters if succeed, or raise an OSError and returns
7273 * -1 on other error.
7274 */
7275static int
7276encode_code_page_errors(UINT code_page, PyObject **outbytes,
7277                        PyObject *unicode, Py_ssize_t unicode_offset,
7278                        Py_ssize_t insize, const char* errors)
7279{
7280    const DWORD flags = encode_code_page_flags(code_page, errors);
7281    Py_ssize_t pos = unicode_offset;
7282    Py_ssize_t endin = unicode_offset + insize;
7283    /* Ideally, we should get reason from FormatMessage. This is the Windows
7284       2000 English version of the message. */
7285    const char *reason = "invalid character";
7286    /* 4=maximum length of a UTF-8 sequence */
7287    char buffer[4];
7288    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7289    Py_ssize_t outsize;
7290    char *out;
7291    PyObject *errorHandler = NULL;
7292    PyObject *exc = NULL;
7293    PyObject *encoding_obj = NULL;
7294    char *encoding;
7295    Py_ssize_t newpos, newoutsize;
7296    PyObject *rep;
7297    int ret = -1;
7298
7299    assert(insize > 0);
7300
7301    encoding = code_page_name(code_page, &encoding_obj);
7302    if (encoding == NULL)
7303        return -1;
7304
7305    if (errors == NULL || strcmp(errors, "strict") == 0) {
7306        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7307           then we raise a UnicodeEncodeError. */
7308        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7309        if (exc != NULL) {
7310            PyCodec_StrictErrors(exc);
7311            Py_DECREF(exc);
7312        }
7313        Py_XDECREF(encoding_obj);
7314        return -1;
7315    }
7316
7317    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7318        pusedDefaultChar = &usedDefaultChar;
7319    else
7320        pusedDefaultChar = NULL;
7321
7322    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7323        PyErr_NoMemory();
7324        goto error;
7325    }
7326    outsize = insize * Py_ARRAY_LENGTH(buffer);
7327
7328    if (*outbytes == NULL) {
7329        /* Create string object */
7330        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7331        if (*outbytes == NULL)
7332            goto error;
7333        out = PyBytes_AS_STRING(*outbytes);
7334    }
7335    else {
7336        /* Extend string object */
7337        Py_ssize_t n = PyBytes_Size(*outbytes);
7338        if (n > PY_SSIZE_T_MAX - outsize) {
7339            PyErr_NoMemory();
7340            goto error;
7341        }
7342        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7343            goto error;
7344        out = PyBytes_AS_STRING(*outbytes) + n;
7345    }
7346
7347    /* Encode the string character per character */
7348    while (pos < endin)
7349    {
7350        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7351        wchar_t chars[2];
7352        int charsize;
7353        if (ch < 0x10000) {
7354            chars[0] = (wchar_t)ch;
7355            charsize = 1;
7356        }
7357        else {
7358            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7359            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7360            charsize = 2;
7361        }
7362
7363        outsize = WideCharToMultiByte(code_page, flags,
7364                                      chars, charsize,
7365                                      buffer, Py_ARRAY_LENGTH(buffer),
7366                                      NULL, pusedDefaultChar);
7367        if (outsize > 0) {
7368            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7369            {
7370                pos++;
7371                memcpy(out, buffer, outsize);
7372                out += outsize;
7373                continue;
7374            }
7375        }
7376        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7377            PyErr_SetFromWindowsErr(0);
7378            goto error;
7379        }
7380
7381        rep = unicode_encode_call_errorhandler(
7382                  errors, &errorHandler, encoding, reason,
7383                  unicode, &exc,
7384                  pos, pos + 1, &newpos);
7385        if (rep == NULL)
7386            goto error;
7387        pos = newpos;
7388
7389        if (PyBytes_Check(rep)) {
7390            outsize = PyBytes_GET_SIZE(rep);
7391            if (outsize != 1) {
7392                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7393                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7394                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7395                    Py_DECREF(rep);
7396                    goto error;
7397                }
7398                out = PyBytes_AS_STRING(*outbytes) + offset;
7399            }
7400            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7401            out += outsize;
7402        }
7403        else {
7404            Py_ssize_t i;
7405            enum PyUnicode_Kind kind;
7406            void *data;
7407
7408            if (PyUnicode_READY(rep) == -1) {
7409                Py_DECREF(rep);
7410                goto error;
7411            }
7412
7413            outsize = PyUnicode_GET_LENGTH(rep);
7414            if (outsize != 1) {
7415                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7416                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7417                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7418                    Py_DECREF(rep);
7419                    goto error;
7420                }
7421                out = PyBytes_AS_STRING(*outbytes) + offset;
7422            }
7423            kind = PyUnicode_KIND(rep);
7424            data = PyUnicode_DATA(rep);
7425            for (i=0; i < outsize; i++) {
7426                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7427                if (ch > 127) {
7428                    raise_encode_exception(&exc,
7429                        encoding, unicode,
7430                        pos, pos + 1,
7431                        "unable to encode error handler result to ASCII");
7432                    Py_DECREF(rep);
7433                    goto error;
7434                }
7435                *out = (unsigned char)ch;
7436                out++;
7437            }
7438        }
7439        Py_DECREF(rep);
7440    }
7441    /* write a NUL byte */
7442    *out = 0;
7443    outsize = out - PyBytes_AS_STRING(*outbytes);
7444    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7445    if (_PyBytes_Resize(outbytes, outsize) < 0)
7446        goto error;
7447    ret = 0;
7448
7449error:
7450    Py_XDECREF(encoding_obj);
7451    Py_XDECREF(errorHandler);
7452    Py_XDECREF(exc);
7453    return ret;
7454}
7455
7456static PyObject *
7457encode_code_page(int code_page,
7458                 PyObject *unicode,
7459                 const char *errors)
7460{
7461    Py_ssize_t len;
7462    PyObject *outbytes = NULL;
7463    Py_ssize_t offset;
7464    int chunk_len, ret, done;
7465
7466    if (PyUnicode_READY(unicode) == -1)
7467        return NULL;
7468    len = PyUnicode_GET_LENGTH(unicode);
7469
7470    if (code_page < 0) {
7471        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7472        return NULL;
7473    }
7474
7475    if (len == 0)
7476        return PyBytes_FromStringAndSize(NULL, 0);
7477
7478    offset = 0;
7479    do
7480    {
7481#ifdef NEED_RETRY
7482        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7483           chunks. */
7484        if (len > INT_MAX/2) {
7485            chunk_len = INT_MAX/2;
7486            done = 0;
7487        }
7488        else
7489#endif
7490        {
7491            chunk_len = (int)len;
7492            done = 1;
7493        }
7494
7495        ret = encode_code_page_strict(code_page, &outbytes,
7496                                      unicode, offset, chunk_len,
7497                                      errors);
7498        if (ret == -2)
7499            ret = encode_code_page_errors(code_page, &outbytes,
7500                                          unicode, offset,
7501                                          chunk_len, errors);
7502        if (ret < 0) {
7503            Py_XDECREF(outbytes);
7504            return NULL;
7505        }
7506
7507        offset += chunk_len;
7508        len -= chunk_len;
7509    } while (!done);
7510
7511    return outbytes;
7512}
7513
7514PyObject *
7515PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7516                     Py_ssize_t size,
7517                     const char *errors)
7518{
7519    PyObject *unicode, *res;
7520    unicode = PyUnicode_FromUnicode(p, size);
7521    if (unicode == NULL)
7522        return NULL;
7523    res = encode_code_page(CP_ACP, unicode, errors);
7524    Py_DECREF(unicode);
7525    return res;
7526}
7527
7528PyObject *
7529PyUnicode_EncodeCodePage(int code_page,
7530                         PyObject *unicode,
7531                         const char *errors)
7532{
7533    return encode_code_page(code_page, unicode, errors);
7534}
7535
7536PyObject *
7537PyUnicode_AsMBCSString(PyObject *unicode)
7538{
7539    if (!PyUnicode_Check(unicode)) {
7540        PyErr_BadArgument();
7541        return NULL;
7542    }
7543    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7544}
7545
7546#undef NEED_RETRY
7547
7548#endif /* HAVE_MBCS */
7549
7550/* --- Character Mapping Codec -------------------------------------------- */
7551
/* Decode the `size` bytes at `s` through a decoding table given as a str
   object: the character at index N of `mapping` is the decoded form of
   byte N.  Results are appended to `writer`.

   A table entry of 0xFFFE, or a byte value beyond the end of the table,
   counts as "undefined" and is passed to the error handler selected by
   `errors`.

   Returns 0 on success and -1 on failure. */
static int
charmap_decode_string(const char *s,
                      Py_ssize_t size,
                      PyObject *mapping,
                      const char *errors,
                      _PyUnicodeWriter *writer)
{
    const char *starts = s;
    const char *e;
    Py_ssize_t startinpos, endinpos;
    PyObject *errorHandler = NULL, *exc = NULL;
    Py_ssize_t maplen;
    enum PyUnicode_Kind mapkind;
    void *mapdata;
    Py_UCS4 x;
    unsigned char ch;

    if (PyUnicode_READY(mapping) == -1)
        return -1;

    maplen = PyUnicode_GET_LENGTH(mapping);
    mapdata = PyUnicode_DATA(mapping);
    mapkind = PyUnicode_KIND(mapping);

    e = s + size;

    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
         * is disabled in encoding aliases, latin1 is preferred because
         * its implementation is faster. */
        /* With at least 256 one-byte entries every input byte has an entry
           and no entry can hold 0xFFFE, so no error handling is needed. */
        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
        Py_UCS4 maxchar = writer->maxchar;

        assert (writer->kind == PyUnicode_1BYTE_KIND);
        while (s < e) {
            ch = *s;
            x = mapdata_ucs1[ch];
            if (x > maxchar) {
                /* raise the writer's maxchar to 0xff, then reload the
                   cached maxchar and data pointer */
                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
                    goto onError;
                maxchar = writer->maxchar;
                outdata = (Py_UCS1 *)writer->data;
            }
            outdata[writer->pos] = x;
            writer->pos++;
            ++s;
        }
        return 0;
    }

    while (s < e) {
        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
            /* Tight inner loops for a two-byte table; which one runs depends
               on the writer's current kind, re-read on every outer pass. */
            enum PyUnicode_Kind outkind = writer->kind;
            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
            if (outkind == PyUnicode_1BYTE_KIND) {
                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
                Py_UCS4 maxchar = writer->maxchar;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* does not fit the current buffer (this includes
                       0xFFFE): let the generic code below deal with x */
                    if (x > maxchar)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
            else if (outkind == PyUnicode_2BYTE_KIND) {
                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
                while (s < e) {
                    ch = *s;
                    x = mapdata_ucs2[ch];
                    /* only the "undefined" marker needs special handling */
                    if (x == 0xFFFE)
                        goto Error;
                    outdata[writer->pos] = x;
                    writer->pos++;
                    ++s;
                }
                break;
            }
        }
        /* generic path: one byte per iteration */
        ch = *s;

        if (ch < maplen)
            x = PyUnicode_READ(mapkind, mapdata, ch);
        else
            x = 0xfffe; /* invalid value */
Error:
        /* also entered from the fast loops above with x already set */
        if (x == 0xfffe)
        {
            /* undefined mapping */
            startinpos = s-starts;
            endinpos = startinpos+1;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "charmap", "character maps to <undefined>",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    writer)) {
                goto onError;
            }
            /* s is passed by address to the handler call above, so it is
               not advanced here */
            continue;
        }

        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
            goto onError;
        ++s;
    }
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return 0;

onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return -1;
}
7670
7671static int
7672charmap_decode_mapping(const char *s,
7673                       Py_ssize_t size,
7674                       PyObject *mapping,
7675                       const char *errors,
7676                       _PyUnicodeWriter *writer)
7677{
7678    const char *starts = s;
7679    const char *e;
7680    Py_ssize_t startinpos, endinpos;
7681    PyObject *errorHandler = NULL, *exc = NULL;
7682    unsigned char ch;
7683    PyObject *key, *item = NULL;
7684
7685    e = s + size;
7686
7687    while (s < e) {
7688        ch = *s;
7689
7690        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7691        key = PyLong_FromLong((long)ch);
7692        if (key == NULL)
7693            goto onError;
7694
7695        item = PyObject_GetItem(mapping, key);
7696        Py_DECREF(key);
7697        if (item == NULL) {
7698            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7699                /* No mapping found means: mapping is undefined. */
7700                PyErr_Clear();
7701                goto Undefined;
7702            } else
7703                goto onError;
7704        }
7705
7706        /* Apply mapping */
7707        if (item == Py_None)
7708            goto Undefined;
7709        if (PyLong_Check(item)) {
7710            long value = PyLong_AS_LONG(item);
7711            if (value == 0xFFFE)
7712                goto Undefined;
7713            if (value < 0 || value > MAX_UNICODE) {
7714                PyErr_Format(PyExc_TypeError,
7715                             "character mapping must be in range(0x%lx)",
7716                             (unsigned long)MAX_UNICODE + 1);
7717                goto onError;
7718            }
7719
7720            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7721                goto onError;
7722        }
7723        else if (PyUnicode_Check(item)) {
7724            if (PyUnicode_READY(item) == -1)
7725                goto onError;
7726            if (PyUnicode_GET_LENGTH(item) == 1) {
7727                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7728                if (value == 0xFFFE)
7729                    goto Undefined;
7730                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7731                    goto onError;
7732            }
7733            else {
7734                writer->overallocate = 1;
7735                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7736                    goto onError;
7737            }
7738        }
7739        else {
7740            /* wrong return value */
7741            PyErr_SetString(PyExc_TypeError,
7742                            "character mapping must return integer, None or str");
7743            goto onError;
7744        }
7745        Py_CLEAR(item);
7746        ++s;
7747        continue;
7748
7749Undefined:
7750        /* undefined mapping */
7751        Py_CLEAR(item);
7752        startinpos = s-starts;
7753        endinpos = startinpos+1;
7754        if (unicode_decode_call_errorhandler_writer(
7755                errors, &errorHandler,
7756                "charmap", "character maps to <undefined>",
7757                &starts, &e, &startinpos, &endinpos, &exc, &s,
7758                writer)) {
7759            goto onError;
7760        }
7761    }
7762    Py_XDECREF(errorHandler);
7763    Py_XDECREF(exc);
7764    return 0;
7765
7766onError:
7767    Py_XDECREF(item);
7768    Py_XDECREF(errorHandler);
7769    Py_XDECREF(exc);
7770    return -1;
7771}
7772
7773PyObject *
7774PyUnicode_DecodeCharmap(const char *s,
7775                        Py_ssize_t size,
7776                        PyObject *mapping,
7777                        const char *errors)
7778{
7779    _PyUnicodeWriter writer;
7780
7781    /* Default to Latin-1 */
7782    if (mapping == NULL)
7783        return PyUnicode_DecodeLatin1(s, size, errors);
7784
7785    if (size == 0)
7786        _Py_RETURN_UNICODE_EMPTY();
7787    _PyUnicodeWriter_Init(&writer);
7788    writer.min_length = size;
7789    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7790        goto onError;
7791
7792    if (PyUnicode_CheckExact(mapping)) {
7793        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7794            goto onError;
7795    }
7796    else {
7797        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7798            goto onError;
7799    }
7800    return _PyUnicodeWriter_Finish(&writer);
7801
7802  onError:
7803    _PyUnicodeWriter_Dealloc(&writer);
7804    return NULL;
7805}
7806
7807/* Charmap encoding: the lookup table */
7808
/* Three-level lookup table mapping a BMP code point to a byte value.
   Built by PyUnicode_BuildEncodingMap() and queried by
   encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* indexed by (c >> 11): number of the level-2 block, 0xFF = none */
    unsigned char level1[32];
    /* number of 16-entry level-2 blocks / 128-entry level-3 blocks */
    int count2, count3;
    /* variable-length tail, allocated larger than the declared one byte:
       16*count2 bytes of level-2 data followed by 128*count3 bytes of
       level-3 data */
    unsigned char level23[1];
};
7815
7816static PyObject*
7817encoding_map_size(PyObject *obj, PyObject* args)
7818{
7819    struct encoding_map *map = (struct encoding_map*)obj;
7820    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7821                           128*map->count3);
7822}
7823
7824static PyMethodDef encoding_map_methods[] = {
7825    {"size", encoding_map_size, METH_NOARGS,
7826     PyDoc_STR("Return the size (in bytes) of this object") },
7827    { 0 }
7828};
7829
7830static void
7831encoding_map_dealloc(PyObject* o)
7832{
7833    PyObject_FREE(o);
7834}
7835
/* Type object for struct encoding_map instances.  Objects of this type are
   initialised directly by PyUnicode_BuildEncodingMap() via PyObject_Init();
   the only non-empty slots are tp_dealloc, tp_flags and tp_methods. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7879
7880PyObject*
7881PyUnicode_BuildEncodingMap(PyObject* string)
7882{
7883    PyObject *result;
7884    struct encoding_map *mresult;
7885    int i;
7886    int need_dict = 0;
7887    unsigned char level1[32];
7888    unsigned char level2[512];
7889    unsigned char *mlevel1, *mlevel2, *mlevel3;
7890    int count2 = 0, count3 = 0;
7891    int kind;
7892    void *data;
7893    Py_ssize_t length;
7894    Py_UCS4 ch;
7895
7896    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7897        PyErr_BadArgument();
7898        return NULL;
7899    }
7900    kind = PyUnicode_KIND(string);
7901    data = PyUnicode_DATA(string);
7902    length = PyUnicode_GET_LENGTH(string);
7903    length = Py_MIN(length, 256);
7904    memset(level1, 0xFF, sizeof level1);
7905    memset(level2, 0xFF, sizeof level2);
7906
7907    /* If there isn't a one-to-one mapping of NULL to \0,
7908       or if there are non-BMP characters, we need to use
7909       a mapping dictionary. */
7910    if (PyUnicode_READ(kind, data, 0) != 0)
7911        need_dict = 1;
7912    for (i = 1; i < length; i++) {
7913        int l1, l2;
7914        ch = PyUnicode_READ(kind, data, i);
7915        if (ch == 0 || ch > 0xFFFF) {
7916            need_dict = 1;
7917            break;
7918        }
7919        if (ch == 0xFFFE)
7920            /* unmapped character */
7921            continue;
7922        l1 = ch >> 11;
7923        l2 = ch >> 7;
7924        if (level1[l1] == 0xFF)
7925            level1[l1] = count2++;
7926        if (level2[l2] == 0xFF)
7927            level2[l2] = count3++;
7928    }
7929
7930    if (count2 >= 0xFF || count3 >= 0xFF)
7931        need_dict = 1;
7932
7933    if (need_dict) {
7934        PyObject *result = PyDict_New();
7935        PyObject *key, *value;
7936        if (!result)
7937            return NULL;
7938        for (i = 0; i < length; i++) {
7939            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7940            value = PyLong_FromLong(i);
7941            if (!key || !value)
7942                goto failed1;
7943            if (PyDict_SetItem(result, key, value) == -1)
7944                goto failed1;
7945            Py_DECREF(key);
7946            Py_DECREF(value);
7947        }
7948        return result;
7949      failed1:
7950        Py_XDECREF(key);
7951        Py_XDECREF(value);
7952        Py_DECREF(result);
7953        return NULL;
7954    }
7955
7956    /* Create a three-level trie */
7957    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7958                             16*count2 + 128*count3 - 1);
7959    if (!result)
7960        return PyErr_NoMemory();
7961    PyObject_Init(result, &EncodingMapType);
7962    mresult = (struct encoding_map*)result;
7963    mresult->count2 = count2;
7964    mresult->count3 = count3;
7965    mlevel1 = mresult->level1;
7966    mlevel2 = mresult->level23;
7967    mlevel3 = mresult->level23 + 16*count2;
7968    memcpy(mlevel1, level1, 32);
7969    memset(mlevel2, 0xFF, 16*count2);
7970    memset(mlevel3, 0, 128*count3);
7971    count3 = 0;
7972    for (i = 1; i < length; i++) {
7973        int o1, o2, o3, i2, i3;
7974        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7975        if (ch == 0xFFFE)
7976            /* unmapped character */
7977            continue;
7978        o1 = ch>>11;
7979        o2 = (ch>>7) & 0xF;
7980        i2 = 16*mlevel1[o1] + o2;
7981        if (mlevel2[i2] == 0xFF)
7982            mlevel2[i2] = count3++;
7983        o3 = ch & 0x7F;
7984        i3 = 128*mlevel2[i2] + o3;
7985        mlevel3[i3] = i;
7986    }
7987    return result;
7988}
7989
7990static int
7991encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7992{
7993    struct encoding_map *map = (struct encoding_map*)mapping;
7994    int l1 = c>>11;
7995    int l2 = (c>>7) & 0xF;
7996    int l3 = c & 0x7F;
7997    int i;
7998
7999    if (c > 0xFFFF)
8000        return -1;
8001    if (c == 0)
8002        return 0;
8003    /* level 1*/
8004    i = map->level1[l1];
8005    if (i == 0xFF) {
8006        return -1;
8007    }
8008    /* level 2*/
8009    i = map->level23[16*i+l2];
8010    if (i == 0xFF) {
8011        return -1;
8012    }
8013    /* level 3 */
8014    i = map->level23[16*map->count2 + 128*i + l3];
8015    if (i == 0) {
8016        return -1;
8017    }
8018    return i;
8019}
8020
8021/* Lookup the character ch in the mapping. If the character
8022   can't be found, Py_None is returned (or NULL, if another
8023   error occurred). */
8024static PyObject *
8025charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8026{
8027    PyObject *w = PyLong_FromLong((long)c);
8028    PyObject *x;
8029
8030    if (w == NULL)
8031        return NULL;
8032    x = PyObject_GetItem(mapping, w);
8033    Py_DECREF(w);
8034    if (x == NULL) {
8035        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8036            /* No mapping found means: mapping is undefined. */
8037            PyErr_Clear();
8038            x = Py_None;
8039            Py_INCREF(x);
8040            return x;
8041        } else
8042            return NULL;
8043    }
8044    else if (x == Py_None)
8045        return x;
8046    else if (PyLong_Check(x)) {
8047        long value = PyLong_AS_LONG(x);
8048        if (value < 0 || value > 255) {
8049            PyErr_SetString(PyExc_TypeError,
8050                            "character mapping must be in range(256)");
8051            Py_DECREF(x);
8052            return NULL;
8053        }
8054        return x;
8055    }
8056    else if (PyBytes_Check(x))
8057        return x;
8058    else {
8059        /* wrong return value */
8060        PyErr_Format(PyExc_TypeError,
8061                     "character mapping must return integer, bytes or None, not %.400s",
8062                     x->ob_type->tp_name);
8063        Py_DECREF(x);
8064        return NULL;
8065    }
8066}
8067
8068static int
8069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8070{
8071    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8072    /* exponentially overallocate to minimize reallocations */
8073    if (requiredsize < 2*outsize)
8074        requiredsize = 2*outsize;
8075    if (_PyBytes_Resize(outobj, requiredsize))
8076        return -1;
8077    return 0;
8078}
8079
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no encoding for the character */
    enc_EXCEPTION   /* an exception was raised */
} charmapencode_result;
8083/* lookup the character, put the result in the output string and adjust
8084   various state variables. Resize the output bytes object if not enough
8085   space is available. Return a new reference to the object that
8086   was put in the output buffer, or Py_None, if the mapping was undefined
8087   (in which case no character was written) or NULL, if a
8088   reallocation error occurred. The caller must decref the result */
8089static charmapencode_result
8090charmapencode_output(Py_UCS4 c, PyObject *mapping,
8091                     PyObject **outobj, Py_ssize_t *outpos)
8092{
8093    PyObject *rep;
8094    char *outstart;
8095    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8096
8097    if (Py_TYPE(mapping) == &EncodingMapType) {
8098        int res = encoding_map_lookup(c, mapping);
8099        Py_ssize_t requiredsize = *outpos+1;
8100        if (res == -1)
8101            return enc_FAILED;
8102        if (outsize<requiredsize)
8103            if (charmapencode_resize(outobj, outpos, requiredsize))
8104                return enc_EXCEPTION;
8105        outstart = PyBytes_AS_STRING(*outobj);
8106        outstart[(*outpos)++] = (char)res;
8107        return enc_SUCCESS;
8108    }
8109
8110    rep = charmapencode_lookup(c, mapping);
8111    if (rep==NULL)
8112        return enc_EXCEPTION;
8113    else if (rep==Py_None) {
8114        Py_DECREF(rep);
8115        return enc_FAILED;
8116    } else {
8117        if (PyLong_Check(rep)) {
8118            Py_ssize_t requiredsize = *outpos+1;
8119            if (outsize<requiredsize)
8120                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8121                    Py_DECREF(rep);
8122                    return enc_EXCEPTION;
8123                }
8124            outstart = PyBytes_AS_STRING(*outobj);
8125            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8126        }
8127        else {
8128            const char *repchars = PyBytes_AS_STRING(rep);
8129            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8130            Py_ssize_t requiredsize = *outpos+repsize;
8131            if (outsize<requiredsize)
8132                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8133                    Py_DECREF(rep);
8134                    return enc_EXCEPTION;
8135                }
8136            outstart = PyBytes_AS_STRING(*outobj);
8137            memcpy(outstart + *outpos, repchars, repsize);
8138            *outpos += repsize;
8139        }
8140    }
8141    Py_DECREF(rep);
8142    return enc_SUCCESS;
8143}
8144
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   On entry *inpos is the index of a character of `unicode` that `mapping`
   cannot encode.  The run of consecutive unencodable characters starting
   there is collected and processed according to `errors`; replacement
   bytes are appended to *res at *respos.  On success *inpos is moved to
   the position where encoding resumes.

   *known_errorHandler caches the classification of `errors`
   (-1 = not classified yet, 0 = other/callback, 1 = strict, 2 = replace,
   3 = ignore, 4 = xmlcharrefreplace); *errorHandler and *exceptionObject
   are passed on to the callback and exception helpers. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* compact table: no object lookup needed */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* first encodable character ends the run */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; '?' itself has to be
           encodable through the mapping */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#NNN;" replacement for every collected character
           and encode it through the mapping, one character at a time */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            /* presumably sized as "&#" + digits + ";" + NUL */
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler name: call the registered error callback */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement  */
        /* str result: every replacement character is itself encoded
           through the mapping */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8298
8299PyObject *
8300_PyUnicode_EncodeCharmap(PyObject *unicode,
8301                         PyObject *mapping,
8302                         const char *errors)
8303{
8304    /* output object */
8305    PyObject *res = NULL;
8306    /* current input position */
8307    Py_ssize_t inpos = 0;
8308    Py_ssize_t size;
8309    /* current output position */
8310    Py_ssize_t respos = 0;
8311    PyObject *errorHandler = NULL;
8312    PyObject *exc = NULL;
8313    /* the following variable is used for caching string comparisons
8314     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8315     * 3=ignore, 4=xmlcharrefreplace */
8316    int known_errorHandler = -1;
8317    void *data;
8318    int kind;
8319
8320    if (PyUnicode_READY(unicode) == -1)
8321        return NULL;
8322    size = PyUnicode_GET_LENGTH(unicode);
8323    data = PyUnicode_DATA(unicode);
8324    kind = PyUnicode_KIND(unicode);
8325
8326    /* Default to Latin-1 */
8327    if (mapping == NULL)
8328        return unicode_encode_ucs1(unicode, errors, 256);
8329
8330    /* allocate enough for a simple encoding without
8331       replacements, if we need more, we'll resize */
8332    res = PyBytes_FromStringAndSize(NULL, size);
8333    if (res == NULL)
8334        goto onError;
8335    if (size == 0)
8336        return res;
8337
8338    while (inpos<size) {
8339        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8340        /* try to encode it */
8341        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8342        if (x==enc_EXCEPTION) /* error */
8343            goto onError;
8344        if (x==enc_FAILED) { /* unencodable character */
8345            if (charmap_encoding_error(unicode, &inpos, mapping,
8346                                       &exc,
8347                                       &known_errorHandler, &errorHandler, errors,
8348                                       &res, &respos)) {
8349                goto onError;
8350            }
8351        }
8352        else
8353            /* done with this character => adjust input position */
8354            ++inpos;
8355    }
8356
8357    /* Resize if we allocated to much */
8358    if (respos<PyBytes_GET_SIZE(res))
8359        if (_PyBytes_Resize(&res, respos) < 0)
8360            goto onError;
8361
8362    Py_XDECREF(exc);
8363    Py_XDECREF(errorHandler);
8364    return res;
8365
8366  onError:
8367    Py_XDECREF(res);
8368    Py_XDECREF(exc);
8369    Py_XDECREF(errorHandler);
8370    return NULL;
8371}
8372
8373/* Deprecated */
8374PyObject *
8375PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8376                        Py_ssize_t size,
8377                        PyObject *mapping,
8378                        const char *errors)
8379{
8380    PyObject *result;
8381    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8382    if (unicode == NULL)
8383        return NULL;
8384    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8385    Py_DECREF(unicode);
8386    return result;
8387}
8388
8389PyObject *
8390PyUnicode_AsCharmapString(PyObject *unicode,
8391                          PyObject *mapping)
8392{
8393    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8394        PyErr_BadArgument();
8395        return NULL;
8396    }
8397    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8398}
8399
8400/* create or adjust a UnicodeTranslateError */
8401static void
8402make_translate_exception(PyObject **exceptionObject,
8403                         PyObject *unicode,
8404                         Py_ssize_t startpos, Py_ssize_t endpos,
8405                         const char *reason)
8406{
8407    if (*exceptionObject == NULL) {
8408        *exceptionObject = _PyUnicodeTranslateError_Create(
8409            unicode, startpos, endpos, reason);
8410    }
8411    else {
8412        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8413            goto onError;
8414        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8415            goto onError;
8416        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8417            goto onError;
8418        return;
8419      onError:
8420        Py_CLEAR(*exceptionObject);
8421    }
8422}
8423
8424/* error handling callback helper:
8425   build arguments, call the callback and check the arguments,
8426   put the result into newpos and return the replacement string, which
8427   has to be freed by the caller */
8428static PyObject *
8429unicode_translate_call_errorhandler(const char *errors,
8430                                    PyObject **errorHandler,
8431                                    const char *reason,
8432                                    PyObject *unicode, PyObject **exceptionObject,
8433                                    Py_ssize_t startpos, Py_ssize_t endpos,
8434                                    Py_ssize_t *newpos)
8435{
8436    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8437
8438    Py_ssize_t i_newpos;
8439    PyObject *restuple;
8440    PyObject *resunicode;
8441
8442    if (*errorHandler == NULL) {
8443        *errorHandler = PyCodec_LookupError(errors);
8444        if (*errorHandler == NULL)
8445            return NULL;
8446    }
8447
8448    make_translate_exception(exceptionObject,
8449                             unicode, startpos, endpos, reason);
8450    if (*exceptionObject == NULL)
8451        return NULL;
8452
8453    restuple = PyObject_CallFunctionObjArgs(
8454        *errorHandler, *exceptionObject, NULL);
8455    if (restuple == NULL)
8456        return NULL;
8457    if (!PyTuple_Check(restuple)) {
8458        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8459        Py_DECREF(restuple);
8460        return NULL;
8461    }
8462    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8463                          &resunicode, &i_newpos)) {
8464        Py_DECREF(restuple);
8465        return NULL;
8466    }
8467    if (i_newpos<0)
8468        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8469    else
8470        *newpos = i_newpos;
8471    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8472        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8473        Py_DECREF(restuple);
8474        return NULL;
8475    }
8476    Py_INCREF(resunicode);
8477    Py_DECREF(restuple);
8478    return resunicode;
8479}
8480
8481/* Lookup the character ch in the mapping and put the result in result,
8482   which must be decrefed by the caller.
8483   Return 0 on success, -1 on error */
8484static int
8485charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8486{
8487    PyObject *w = PyLong_FromLong((long)c);
8488    PyObject *x;
8489
8490    if (w == NULL)
8491        return -1;
8492    x = PyObject_GetItem(mapping, w);
8493    Py_DECREF(w);
8494    if (x == NULL) {
8495        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8496            /* No mapping found means: use 1:1 mapping. */
8497            PyErr_Clear();
8498            *result = NULL;
8499            return 0;
8500        } else
8501            return -1;
8502    }
8503    else if (x == Py_None) {
8504        *result = x;
8505        return 0;
8506    }
8507    else if (PyLong_Check(x)) {
8508        long value = PyLong_AS_LONG(x);
8509        if (value < 0 || value > MAX_UNICODE) {
8510            PyErr_Format(PyExc_ValueError,
8511                         "character mapping must be in range(0x%x)",
8512                         MAX_UNICODE+1);
8513            Py_DECREF(x);
8514            return -1;
8515        }
8516        *result = x;
8517        return 0;
8518    }
8519    else if (PyUnicode_Check(x)) {
8520        *result = x;
8521        return 0;
8522    }
8523    else {
8524        /* wrong return value */
8525        PyErr_SetString(PyExc_TypeError,
8526                        "character mapping must return integer, None or str");
8527        Py_DECREF(x);
8528        return -1;
8529    }
8530}
8531
8532/* lookup the character, write the result into the writer.
8533   Return 1 if the result was written into the writer, return 0 if the mapping
8534   was undefined, raise an exception return -1 on error. */
8535static int
8536charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8537                        _PyUnicodeWriter *writer)
8538{
8539    PyObject *item;
8540
8541    if (charmaptranslate_lookup(ch, mapping, &item))
8542        return -1;
8543
8544    if (item == NULL) {
8545        /* not found => default to 1:1 mapping */
8546        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8547            return -1;
8548        }
8549        return 1;
8550    }
8551
8552    if (item == Py_None) {
8553        Py_DECREF(item);
8554        return 0;
8555    }
8556
8557    if (PyLong_Check(item)) {
8558        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8559        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8560           used it */
8561        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8562            Py_DECREF(item);
8563            return -1;
8564        }
8565        Py_DECREF(item);
8566        return 1;
8567    }
8568
8569    if (!PyUnicode_Check(item)) {
8570        Py_DECREF(item);
8571        return -1;
8572    }
8573
8574    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8575        Py_DECREF(item);
8576        return -1;
8577    }
8578
8579    Py_DECREF(item);
8580    return 1;
8581}
8582
8583static int
8584unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8585                              Py_UCS1 *translate)
8586{
8587    PyObject *item = NULL;
8588    int ret = 0;
8589
8590    if (charmaptranslate_lookup(ch, mapping, &item)) {
8591        return -1;
8592    }
8593
8594    if (item == Py_None) {
8595        /* deletion */
8596        translate[ch] = 0xfe;
8597    }
8598    else if (item == NULL) {
8599        /* not found => default to 1:1 mapping */
8600        translate[ch] = ch;
8601        return 1;
8602    }
8603    else if (PyLong_Check(item)) {
8604        long replace = PyLong_AS_LONG(item);
8605        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8606           used it */
8607        if (127 < replace) {
8608            /* invalid character or character outside ASCII:
8609               skip the fast translate */
8610            goto exit;
8611        }
8612        translate[ch] = (Py_UCS1)replace;
8613    }
8614    else if (PyUnicode_Check(item)) {
8615        Py_UCS4 replace;
8616
8617        if (PyUnicode_READY(item) == -1) {
8618            Py_DECREF(item);
8619            return -1;
8620        }
8621        if (PyUnicode_GET_LENGTH(item) != 1)
8622            goto exit;
8623
8624        replace = PyUnicode_READ_CHAR(item, 0);
8625        if (replace > 127)
8626            goto exit;
8627        translate[ch] = (Py_UCS1)replace;
8628    }
8629    else {
8630        /* not None, NULL, long or unicode */
8631        goto exit;
8632    }
8633    ret = 1;
8634
8635  exit:
8636    Py_DECREF(item);
8637    return ret;
8638}
8639
8640/* Fast path for ascii => ascii translation. Return 1 if the whole string
8641   was translated into writer, return 0 if the input string was partially
8642   translated into writer, raise an exception and return -1 on error. */
8643static int
8644unicode_fast_translate(PyObject *input, PyObject *mapping,
8645                       _PyUnicodeWriter *writer, int ignore)
8646{
8647    Py_UCS1 ascii_table[128], ch, ch2;
8648    Py_ssize_t len;
8649    Py_UCS1 *in, *end, *out;
8650    int res = 0;
8651
8652    if (PyUnicode_READY(input) == -1)
8653        return -1;
8654    if (!PyUnicode_IS_ASCII(input))
8655        return 0;
8656    len = PyUnicode_GET_LENGTH(input);
8657
8658    memset(ascii_table, 0xff, 128);
8659
8660    in = PyUnicode_1BYTE_DATA(input);
8661    end = in + len;
8662
8663    assert(PyUnicode_IS_ASCII(writer->buffer));
8664    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8665    out = PyUnicode_1BYTE_DATA(writer->buffer);
8666
8667    for (; in < end; in++) {
8668        ch = *in;
8669        ch2 = ascii_table[ch];
8670        if (ch2 == 0xff) {
8671            int translate = unicode_fast_translate_lookup(mapping, ch,
8672                                                          ascii_table);
8673            if (translate < 0)
8674                return -1;
8675            if (translate == 0)
8676                goto exit;
8677            ch2 = ascii_table[ch];
8678        }
8679        if (ch2 == 0xfe) {
8680            if (ignore)
8681                continue;
8682            goto exit;
8683        }
8684        assert(ch2 < 128);
8685        *out = ch2;
8686        out++;
8687    }
8688    res = 1;
8689
8690exit:
8691    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8692    return res;
8693}
8694
8695PyObject *
8696_PyUnicode_TranslateCharmap(PyObject *input,
8697                            PyObject *mapping,
8698                            const char *errors)
8699{
8700    /* input object */
8701    char *data;
8702    Py_ssize_t size, i;
8703    int kind;
8704    /* output buffer */
8705    _PyUnicodeWriter writer;
8706    /* error handler */
8707    char *reason = "character maps to <undefined>";
8708    PyObject *errorHandler = NULL;
8709    PyObject *exc = NULL;
8710    int ignore;
8711    int res;
8712
8713    if (mapping == NULL) {
8714        PyErr_BadArgument();
8715        return NULL;
8716    }
8717
8718    if (PyUnicode_READY(input) == -1)
8719        return NULL;
8720    data = (char*)PyUnicode_DATA(input);
8721    kind = PyUnicode_KIND(input);
8722    size = PyUnicode_GET_LENGTH(input);
8723
8724    if (size == 0) {
8725        Py_INCREF(input);
8726        return input;
8727    }
8728
8729    /* allocate enough for a simple 1:1 translation without
8730       replacements, if we need more, we'll resize */
8731    _PyUnicodeWriter_Init(&writer);
8732    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8733        goto onError;
8734
8735    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8736
8737    res = unicode_fast_translate(input, mapping, &writer, ignore);
8738    if (res < 0) {
8739        _PyUnicodeWriter_Dealloc(&writer);
8740        return NULL;
8741    }
8742    if (res == 1)
8743        return _PyUnicodeWriter_Finish(&writer);
8744
8745    i = writer.pos;
8746    while (i<size) {
8747        /* try to encode it */
8748        int translate;
8749        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8750        Py_ssize_t newpos;
8751        /* startpos for collecting untranslatable chars */
8752        Py_ssize_t collstart;
8753        Py_ssize_t collend;
8754        Py_UCS4 ch;
8755
8756        ch = PyUnicode_READ(kind, data, i);
8757        translate = charmaptranslate_output(ch, mapping, &writer);
8758        if (translate < 0)
8759            goto onError;
8760
8761        if (translate != 0) {
8762            /* it worked => adjust input pointer */
8763            ++i;
8764            continue;
8765        }
8766
8767        /* untranslatable character */
8768        collstart = i;
8769        collend = i+1;
8770
8771        /* find all untranslatable characters */
8772        while (collend < size) {
8773            PyObject *x;
8774            ch = PyUnicode_READ(kind, data, collend);
8775            if (charmaptranslate_lookup(ch, mapping, &x))
8776                goto onError;
8777            Py_XDECREF(x);
8778            if (x != Py_None)
8779                break;
8780            ++collend;
8781        }
8782
8783        if (ignore) {
8784            i = collend;
8785        }
8786        else {
8787            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8788                                                             reason, input, &exc,
8789                                                             collstart, collend, &newpos);
8790            if (repunicode == NULL)
8791                goto onError;
8792            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8793                Py_DECREF(repunicode);
8794                goto onError;
8795            }
8796            Py_DECREF(repunicode);
8797            i = newpos;
8798        }
8799    }
8800    Py_XDECREF(exc);
8801    Py_XDECREF(errorHandler);
8802    return _PyUnicodeWriter_Finish(&writer);
8803
8804  onError:
8805    _PyUnicodeWriter_Dealloc(&writer);
8806    Py_XDECREF(exc);
8807    Py_XDECREF(errorHandler);
8808    return NULL;
8809}
8810
8811/* Deprecated. Use PyUnicode_Translate instead. */
8812PyObject *
8813PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8814                           Py_ssize_t size,
8815                           PyObject *mapping,
8816                           const char *errors)
8817{
8818    PyObject *result;
8819    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8820    if (!unicode)
8821        return NULL;
8822    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8823    Py_DECREF(unicode);
8824    return result;
8825}
8826
8827PyObject *
8828PyUnicode_Translate(PyObject *str,
8829                    PyObject *mapping,
8830                    const char *errors)
8831{
8832    PyObject *result;
8833
8834    str = PyUnicode_FromObject(str);
8835    if (str == NULL)
8836        return NULL;
8837    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8838    Py_DECREF(str);
8839    return result;
8840}
8841
8842static Py_UCS4
8843fix_decimal_and_space_to_ascii(PyObject *self)
8844{
8845    /* No need to call PyUnicode_READY(self) because this function is only
8846       called as a callback from fixup() which does it already. */
8847    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8848    const int kind = PyUnicode_KIND(self);
8849    void *data = PyUnicode_DATA(self);
8850    Py_UCS4 maxchar = 127, ch, fixed;
8851    int modified = 0;
8852    Py_ssize_t i;
8853
8854    for (i = 0; i < len; ++i) {
8855        ch = PyUnicode_READ(kind, data, i);
8856        fixed = 0;
8857        if (ch > 127) {
8858            if (Py_UNICODE_ISSPACE(ch))
8859                fixed = ' ';
8860            else {
8861                const int decimal = Py_UNICODE_TODECIMAL(ch);
8862                if (decimal >= 0)
8863                    fixed = '0' + decimal;
8864            }
8865            if (fixed != 0) {
8866                modified = 1;
8867                maxchar = Py_MAX(maxchar, fixed);
8868                PyUnicode_WRITE(kind, data, i, fixed);
8869            }
8870            else
8871                maxchar = Py_MAX(maxchar, ch);
8872        }
8873    }
8874
8875    return (modified) ? maxchar : 0;
8876}
8877
8878PyObject *
8879_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8880{
8881    if (!PyUnicode_Check(unicode)) {
8882        PyErr_BadInternalCall();
8883        return NULL;
8884    }
8885    if (PyUnicode_READY(unicode) == -1)
8886        return NULL;
8887    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8888        /* If the string is already ASCII, just return the same string */
8889        Py_INCREF(unicode);
8890        return unicode;
8891    }
8892    return fixup(unicode, fix_decimal_and_space_to_ascii);
8893}
8894
8895PyObject *
8896PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8897                                  Py_ssize_t length)
8898{
8899    PyObject *decimal;
8900    Py_ssize_t i;
8901    Py_UCS4 maxchar;
8902    enum PyUnicode_Kind kind;
8903    void *data;
8904
8905    maxchar = 127;
8906    for (i = 0; i < length; i++) {
8907        Py_UCS4 ch = s[i];
8908        if (ch > 127) {
8909            int decimal = Py_UNICODE_TODECIMAL(ch);
8910            if (decimal >= 0)
8911                ch = '0' + decimal;
8912            maxchar = Py_MAX(maxchar, ch);
8913        }
8914    }
8915
8916    /* Copy to a new string */
8917    decimal = PyUnicode_New(length, maxchar);
8918    if (decimal == NULL)
8919        return decimal;
8920    kind = PyUnicode_KIND(decimal);
8921    data = PyUnicode_DATA(decimal);
8922    /* Iterate over code points */
8923    for (i = 0; i < length; i++) {
8924        Py_UCS4 ch = s[i];
8925        if (ch > 127) {
8926            int decimal = Py_UNICODE_TODECIMAL(ch);
8927            if (decimal >= 0)
8928                ch = '0' + decimal;
8929        }
8930        PyUnicode_WRITE(kind, data, i, ch);
8931    }
8932    return unicode_result(decimal);
8933}
8934/* --- Decimal Encoder ---------------------------------------------------- */
8935
8936int
8937PyUnicode_EncodeDecimal(Py_UNICODE *s,
8938                        Py_ssize_t length,
8939                        char *output,
8940                        const char *errors)
8941{
8942    PyObject *unicode;
8943    Py_ssize_t i;
8944    enum PyUnicode_Kind kind;
8945    void *data;
8946
8947    if (output == NULL) {
8948        PyErr_BadArgument();
8949        return -1;
8950    }
8951
8952    unicode = PyUnicode_FromUnicode(s, length);
8953    if (unicode == NULL)
8954        return -1;
8955
8956    if (PyUnicode_READY(unicode) == -1) {
8957        Py_DECREF(unicode);
8958        return -1;
8959    }
8960    kind = PyUnicode_KIND(unicode);
8961    data = PyUnicode_DATA(unicode);
8962
8963    for (i=0; i < length; ) {
8964        PyObject *exc;
8965        Py_UCS4 ch;
8966        int decimal;
8967        Py_ssize_t startpos;
8968
8969        ch = PyUnicode_READ(kind, data, i);
8970
8971        if (Py_UNICODE_ISSPACE(ch)) {
8972            *output++ = ' ';
8973            i++;
8974            continue;
8975        }
8976        decimal = Py_UNICODE_TODECIMAL(ch);
8977        if (decimal >= 0) {
8978            *output++ = '0' + decimal;
8979            i++;
8980            continue;
8981        }
8982        if (0 < ch && ch < 256) {
8983            *output++ = (char)ch;
8984            i++;
8985            continue;
8986        }
8987
8988        startpos = i;
8989        exc = NULL;
8990        raise_encode_exception(&exc, "decimal", unicode,
8991                               startpos, startpos+1,
8992                               "invalid decimal Unicode string");
8993        Py_XDECREF(exc);
8994        Py_DECREF(unicode);
8995        return -1;
8996    }
8997    /* 0-terminate the output string */
8998    *output++ = '\0';
8999    Py_DECREF(unicode);
9000    return 0;
9001}
9002
9003/* --- Helpers ------------------------------------------------------------ */
9004
9005static Py_ssize_t
9006any_find_slice(int direction, PyObject* s1, PyObject* s2,
9007               Py_ssize_t start,
9008               Py_ssize_t end)
9009{
9010    int kind1, kind2, kind;
9011    void *buf1, *buf2;
9012    Py_ssize_t len1, len2, result;
9013
9014    kind1 = PyUnicode_KIND(s1);
9015    kind2 = PyUnicode_KIND(s2);
9016    kind = kind1 > kind2 ? kind1 : kind2;
9017    buf1 = PyUnicode_DATA(s1);
9018    buf2 = PyUnicode_DATA(s2);
9019    if (kind1 != kind)
9020        buf1 = _PyUnicode_AsKind(s1, kind);
9021    if (!buf1)
9022        return -2;
9023    if (kind2 != kind)
9024        buf2 = _PyUnicode_AsKind(s2, kind);
9025    if (!buf2) {
9026        if (kind1 != kind) PyMem_Free(buf1);
9027        return -2;
9028    }
9029    len1 = PyUnicode_GET_LENGTH(s1);
9030    len2 = PyUnicode_GET_LENGTH(s2);
9031
9032    if (direction > 0) {
9033        switch (kind) {
9034        case PyUnicode_1BYTE_KIND:
9035            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9036                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9037            else
9038                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9039            break;
9040        case PyUnicode_2BYTE_KIND:
9041            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9042            break;
9043        case PyUnicode_4BYTE_KIND:
9044            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9045            break;
9046        default:
9047            assert(0); result = -2;
9048        }
9049    }
9050    else {
9051        switch (kind) {
9052        case PyUnicode_1BYTE_KIND:
9053            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9054                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055            else
9056                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9057            break;
9058        case PyUnicode_2BYTE_KIND:
9059            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9060            break;
9061        case PyUnicode_4BYTE_KIND:
9062            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9063            break;
9064        default:
9065            assert(0); result = -2;
9066        }
9067    }
9068
9069    if (kind1 != kind)
9070        PyMem_Free(buf1);
9071    if (kind2 != kind)
9072        PyMem_Free(buf2);
9073
9074    return result;
9075}
9076
9077Py_ssize_t
9078_PyUnicode_InsertThousandsGrouping(
9079    PyObject *unicode, Py_ssize_t index,
9080    Py_ssize_t n_buffer,
9081    void *digits, Py_ssize_t n_digits,
9082    Py_ssize_t min_width,
9083    const char *grouping, PyObject *thousands_sep,
9084    Py_UCS4 *maxchar)
9085{
9086    unsigned int kind, thousands_sep_kind;
9087    char *data, *thousands_sep_data;
9088    Py_ssize_t thousands_sep_len;
9089    Py_ssize_t len;
9090
9091    if (unicode != NULL) {
9092        kind = PyUnicode_KIND(unicode);
9093        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9094    }
9095    else {
9096        kind = PyUnicode_1BYTE_KIND;
9097        data = NULL;
9098    }
9099    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9100    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9101    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9102    if (unicode != NULL && thousands_sep_kind != kind) {
9103        if (thousands_sep_kind < kind) {
9104            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9105            if (!thousands_sep_data)
9106                return -1;
9107        }
9108        else {
9109            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9110            if (!data)
9111                return -1;
9112        }
9113    }
9114
9115    switch (kind) {
9116    case PyUnicode_1BYTE_KIND:
9117        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9118            len = asciilib_InsertThousandsGrouping(
9119                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9120                min_width, grouping,
9121                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9122        else
9123            len = ucs1lib_InsertThousandsGrouping(
9124                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9125                min_width, grouping,
9126                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9127        break;
9128    case PyUnicode_2BYTE_KIND:
9129        len = ucs2lib_InsertThousandsGrouping(
9130            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9131            min_width, grouping,
9132            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9133        break;
9134    case PyUnicode_4BYTE_KIND:
9135        len = ucs4lib_InsertThousandsGrouping(
9136            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9137            min_width, grouping,
9138            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9139        break;
9140    default:
9141        assert(0);
9142        return -1;
9143    }
9144    if (unicode != NULL && thousands_sep_kind != kind) {
9145        if (thousands_sep_kind < kind)
9146            PyMem_Free(thousands_sep_data);
9147        else
9148            PyMem_Free(data);
9149    }
9150    if (unicode == NULL) {
9151        *maxchar = 127;
9152        if (len != n_digits) {
9153            *maxchar = Py_MAX(*maxchar,
9154                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9155        }
9156    }
9157    return len;
9158}
9159
9160
/* helper macro to fixup start/end slice values:
   end is clipped to len, negative values of start and end are taken
   relative to len and clipped to 0.  start and end must be modifiable
   lvalues.  Every argument is parenthesized so that arbitrary
   expressions can be passed; note that they are evaluated several
   times. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
9175
9176Py_ssize_t
9177PyUnicode_Count(PyObject *str,
9178                PyObject *substr,
9179                Py_ssize_t start,
9180                Py_ssize_t end)
9181{
9182    Py_ssize_t result;
9183    PyObject* str_obj;
9184    PyObject* sub_obj;
9185    int kind1, kind2, kind;
9186    void *buf1 = NULL, *buf2 = NULL;
9187    Py_ssize_t len1, len2;
9188
9189    str_obj = PyUnicode_FromObject(str);
9190    if (!str_obj)
9191        return -1;
9192    sub_obj = PyUnicode_FromObject(substr);
9193    if (!sub_obj) {
9194        Py_DECREF(str_obj);
9195        return -1;
9196    }
9197    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9198        Py_DECREF(sub_obj);
9199        Py_DECREF(str_obj);
9200        return -1;
9201    }
9202
9203    kind1 = PyUnicode_KIND(str_obj);
9204    kind2 = PyUnicode_KIND(sub_obj);
9205    kind = kind1;
9206    buf1 = PyUnicode_DATA(str_obj);
9207    buf2 = PyUnicode_DATA(sub_obj);
9208    if (kind2 != kind) {
9209        if (kind2 > kind) {
9210            Py_DECREF(sub_obj);
9211            Py_DECREF(str_obj);
9212            return 0;
9213        }
9214        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9215    }
9216    if (!buf2)
9217        goto onError;
9218    len1 = PyUnicode_GET_LENGTH(str_obj);
9219    len2 = PyUnicode_GET_LENGTH(sub_obj);
9220
9221    ADJUST_INDICES(start, end, len1);
9222    switch (kind) {
9223    case PyUnicode_1BYTE_KIND:
9224        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9225            result = asciilib_count(
9226                ((Py_UCS1*)buf1) + start, end - start,
9227                buf2, len2, PY_SSIZE_T_MAX
9228                );
9229        else
9230            result = ucs1lib_count(
9231                ((Py_UCS1*)buf1) + start, end - start,
9232                buf2, len2, PY_SSIZE_T_MAX
9233                );
9234        break;
9235    case PyUnicode_2BYTE_KIND:
9236        result = ucs2lib_count(
9237            ((Py_UCS2*)buf1) + start, end - start,
9238            buf2, len2, PY_SSIZE_T_MAX
9239            );
9240        break;
9241    case PyUnicode_4BYTE_KIND:
9242        result = ucs4lib_count(
9243            ((Py_UCS4*)buf1) + start, end - start,
9244            buf2, len2, PY_SSIZE_T_MAX
9245            );
9246        break;
9247    default:
9248        assert(0); result = 0;
9249    }
9250
9251    Py_DECREF(sub_obj);
9252    Py_DECREF(str_obj);
9253
9254    if (kind2 != kind)
9255        PyMem_Free(buf2);
9256
9257    return result;
9258  onError:
9259    Py_DECREF(sub_obj);
9260    Py_DECREF(str_obj);
9261    if (kind2 != kind && buf2)
9262        PyMem_Free(buf2);
9263    return -1;
9264}
9265
9266Py_ssize_t
9267PyUnicode_Find(PyObject *str,
9268               PyObject *sub,
9269               Py_ssize_t start,
9270               Py_ssize_t end,
9271               int direction)
9272{
9273    Py_ssize_t result;
9274
9275    str = PyUnicode_FromObject(str);
9276    if (!str)
9277        return -2;
9278    sub = PyUnicode_FromObject(sub);
9279    if (!sub) {
9280        Py_DECREF(str);
9281        return -2;
9282    }
9283    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9284        Py_DECREF(sub);
9285        Py_DECREF(str);
9286        return -2;
9287    }
9288
9289    result = any_find_slice(direction,
9290        str, sub, start, end
9291        );
9292
9293    Py_DECREF(str);
9294    Py_DECREF(sub);
9295
9296    return result;
9297}
9298
9299Py_ssize_t
9300PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9301                   Py_ssize_t start, Py_ssize_t end,
9302                   int direction)
9303{
9304    int kind;
9305    Py_ssize_t result;
9306    if (PyUnicode_READY(str) == -1)
9307        return -2;
9308    if (start < 0 || end < 0) {
9309        PyErr_SetString(PyExc_IndexError, "string index out of range");
9310        return -2;
9311    }
9312    if (end > PyUnicode_GET_LENGTH(str))
9313        end = PyUnicode_GET_LENGTH(str);
9314    kind = PyUnicode_KIND(str);
9315    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9316                      kind, end-start, ch, direction);
9317    if (result == -1)
9318        return -1;
9319    else
9320        return start + result;
9321}
9322
9323static int
9324tailmatch(PyObject *self,
9325          PyObject *substring,
9326          Py_ssize_t start,
9327          Py_ssize_t end,
9328          int direction)
9329{
9330    int kind_self;
9331    int kind_sub;
9332    void *data_self;
9333    void *data_sub;
9334    Py_ssize_t offset;
9335    Py_ssize_t i;
9336    Py_ssize_t end_sub;
9337
9338    if (PyUnicode_READY(self) == -1 ||
9339        PyUnicode_READY(substring) == -1)
9340        return -1;
9341
9342    if (PyUnicode_GET_LENGTH(substring) == 0)
9343        return 1;
9344
9345    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9346    end -= PyUnicode_GET_LENGTH(substring);
9347    if (end < start)
9348        return 0;
9349
9350    kind_self = PyUnicode_KIND(self);
9351    data_self = PyUnicode_DATA(self);
9352    kind_sub = PyUnicode_KIND(substring);
9353    data_sub = PyUnicode_DATA(substring);
9354    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9355
9356    if (direction > 0)
9357        offset = end;
9358    else
9359        offset = start;
9360
9361    if (PyUnicode_READ(kind_self, data_self, offset) ==
9362        PyUnicode_READ(kind_sub, data_sub, 0) &&
9363        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9364        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9365        /* If both are of the same kind, memcmp is sufficient */
9366        if (kind_self == kind_sub) {
9367            return ! memcmp((char *)data_self +
9368                                (offset * PyUnicode_KIND(substring)),
9369                            data_sub,
9370                            PyUnicode_GET_LENGTH(substring) *
9371                                PyUnicode_KIND(substring));
9372        }
9373        /* otherwise we have to compare each character by first accesing it */
9374        else {
9375            /* We do not need to compare 0 and len(substring)-1 because
9376               the if statement above ensured already that they are equal
9377               when we end up here. */
9378            for (i = 1; i < end_sub; ++i) {
9379                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9380                    PyUnicode_READ(kind_sub, data_sub, i))
9381                    return 0;
9382            }
9383            return 1;
9384        }
9385    }
9386
9387    return 0;
9388}
9389
9390Py_ssize_t
9391PyUnicode_Tailmatch(PyObject *str,
9392                    PyObject *substr,
9393                    Py_ssize_t start,
9394                    Py_ssize_t end,
9395                    int direction)
9396{
9397    Py_ssize_t result;
9398
9399    str = PyUnicode_FromObject(str);
9400    if (str == NULL)
9401        return -1;
9402    substr = PyUnicode_FromObject(substr);
9403    if (substr == NULL) {
9404        Py_DECREF(str);
9405        return -1;
9406    }
9407
9408    result = tailmatch(str, substr,
9409                       start, end, direction);
9410    Py_DECREF(str);
9411    Py_DECREF(substr);
9412    return result;
9413}
9414
9415/* Apply fixfct filter to the Unicode object self and return a
9416   reference to the modified object */
9417
9418static PyObject *
9419fixup(PyObject *self,
9420      Py_UCS4 (*fixfct)(PyObject *s))
9421{
9422    PyObject *u;
9423    Py_UCS4 maxchar_old, maxchar_new = 0;
9424    PyObject *v;
9425
9426    u = _PyUnicode_Copy(self);
9427    if (u == NULL)
9428        return NULL;
9429    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9430
9431    /* fix functions return the new maximum character in a string,
9432       if the kind of the resulting unicode object does not change,
9433       everything is fine.  Otherwise we need to change the string kind
9434       and re-run the fix function. */
9435    maxchar_new = fixfct(u);
9436
9437    if (maxchar_new == 0) {
9438        /* no changes */;
9439        if (PyUnicode_CheckExact(self)) {
9440            Py_DECREF(u);
9441            Py_INCREF(self);
9442            return self;
9443        }
9444        else
9445            return u;
9446    }
9447
9448    maxchar_new = align_maxchar(maxchar_new);
9449
9450    if (maxchar_new == maxchar_old)
9451        return u;
9452
9453    /* In case the maximum character changed, we need to
9454       convert the string to the new category. */
9455    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9456    if (v == NULL) {
9457        Py_DECREF(u);
9458        return NULL;
9459    }
9460    if (maxchar_new > maxchar_old) {
9461        /* If the maxchar increased so that the kind changed, not all
9462           characters are representable anymore and we need to fix the
9463           string again. This only happens in very few cases. */
9464        _PyUnicode_FastCopyCharacters(v, 0,
9465                                      self, 0, PyUnicode_GET_LENGTH(self));
9466        maxchar_old = fixfct(v);
9467        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9468    }
9469    else {
9470        _PyUnicode_FastCopyCharacters(v, 0,
9471                                      u, 0, PyUnicode_GET_LENGTH(self));
9472    }
9473    Py_DECREF(u);
9474    assert(_PyUnicode_CheckConsistency(v, 1));
9475    return v;
9476}
9477
9478static PyObject *
9479ascii_upper_or_lower(PyObject *self, int lower)
9480{
9481    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9482    char *resdata, *data = PyUnicode_DATA(self);
9483    PyObject *res;
9484
9485    res = PyUnicode_New(len, 127);
9486    if (res == NULL)
9487        return NULL;
9488    resdata = PyUnicode_DATA(res);
9489    if (lower)
9490        _Py_bytes_lower(resdata, data, len);
9491    else
9492        _Py_bytes_upper(resdata, data, len);
9493    return res;
9494}
9495
9496static Py_UCS4
9497handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9498{
9499    Py_ssize_t j;
9500    int final_sigma;
9501    Py_UCS4 c;
9502    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9503
9504     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9505
9506    where ! is a negation and \p{xxx} is a character with property xxx.
9507    */
9508    for (j = i - 1; j >= 0; j--) {
9509        c = PyUnicode_READ(kind, data, j);
9510        if (!_PyUnicode_IsCaseIgnorable(c))
9511            break;
9512    }
9513    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9514    if (final_sigma) {
9515        for (j = i + 1; j < length; j++) {
9516            c = PyUnicode_READ(kind, data, j);
9517            if (!_PyUnicode_IsCaseIgnorable(c))
9518                break;
9519        }
9520        final_sigma = j == length || !_PyUnicode_IsCased(c);
9521    }
9522    return (final_sigma) ? 0x3C2 : 0x3C3;
9523}
9524
9525static int
9526lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9527           Py_UCS4 c, Py_UCS4 *mapped)
9528{
9529    /* Obscure special case. */
9530    if (c == 0x3A3) {
9531        mapped[0] = handle_capital_sigma(kind, data, length, i);
9532        return 1;
9533    }
9534    return _PyUnicode_ToLowerFull(c, mapped);
9535}
9536
9537static Py_ssize_t
9538do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9539{
9540    Py_ssize_t i, k = 0;
9541    int n_res, j;
9542    Py_UCS4 c, mapped[3];
9543
9544    c = PyUnicode_READ(kind, data, 0);
9545    n_res = _PyUnicode_ToUpperFull(c, mapped);
9546    for (j = 0; j < n_res; j++) {
9547        *maxchar = Py_MAX(*maxchar, mapped[j]);
9548        res[k++] = mapped[j];
9549    }
9550    for (i = 1; i < length; i++) {
9551        c = PyUnicode_READ(kind, data, i);
9552        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9553        for (j = 0; j < n_res; j++) {
9554            *maxchar = Py_MAX(*maxchar, mapped[j]);
9555            res[k++] = mapped[j];
9556        }
9557    }
9558    return k;
9559}
9560
9561static Py_ssize_t
9562do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9563    Py_ssize_t i, k = 0;
9564
9565    for (i = 0; i < length; i++) {
9566        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9567        int n_res, j;
9568        if (Py_UNICODE_ISUPPER(c)) {
9569            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9570        }
9571        else if (Py_UNICODE_ISLOWER(c)) {
9572            n_res = _PyUnicode_ToUpperFull(c, mapped);
9573        }
9574        else {
9575            n_res = 1;
9576            mapped[0] = c;
9577        }
9578        for (j = 0; j < n_res; j++) {
9579            *maxchar = Py_MAX(*maxchar, mapped[j]);
9580            res[k++] = mapped[j];
9581        }
9582    }
9583    return k;
9584}
9585
9586static Py_ssize_t
9587do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9588                  Py_UCS4 *maxchar, int lower)
9589{
9590    Py_ssize_t i, k = 0;
9591
9592    for (i = 0; i < length; i++) {
9593        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9594        int n_res, j;
9595        if (lower)
9596            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9597        else
9598            n_res = _PyUnicode_ToUpperFull(c, mapped);
9599        for (j = 0; j < n_res; j++) {
9600            *maxchar = Py_MAX(*maxchar, mapped[j]);
9601            res[k++] = mapped[j];
9602        }
9603    }
9604    return k;
9605}
9606
9607static Py_ssize_t
9608do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9609{
9610    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9611}
9612
9613static Py_ssize_t
9614do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9615{
9616    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9617}
9618
9619static Py_ssize_t
9620do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9621{
9622    Py_ssize_t i, k = 0;
9623
9624    for (i = 0; i < length; i++) {
9625        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9626        Py_UCS4 mapped[3];
9627        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9628        for (j = 0; j < n_res; j++) {
9629            *maxchar = Py_MAX(*maxchar, mapped[j]);
9630            res[k++] = mapped[j];
9631        }
9632    }
9633    return k;
9634}
9635
9636static Py_ssize_t
9637do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9638{
9639    Py_ssize_t i, k = 0;
9640    int previous_is_cased;
9641
9642    previous_is_cased = 0;
9643    for (i = 0; i < length; i++) {
9644        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9645        Py_UCS4 mapped[3];
9646        int n_res, j;
9647
9648        if (previous_is_cased)
9649            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9650        else
9651            n_res = _PyUnicode_ToTitleFull(c, mapped);
9652
9653        for (j = 0; j < n_res; j++) {
9654            *maxchar = Py_MAX(*maxchar, mapped[j]);
9655            res[k++] = mapped[j];
9656        }
9657
9658        previous_is_cased = _PyUnicode_IsCased(c);
9659    }
9660    return k;
9661}
9662
9663static PyObject *
9664case_operation(PyObject *self,
9665               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9666{
9667    PyObject *res = NULL;
9668    Py_ssize_t length, newlength = 0;
9669    int kind, outkind;
9670    void *data, *outdata;
9671    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9672
9673    assert(PyUnicode_IS_READY(self));
9674
9675    kind = PyUnicode_KIND(self);
9676    data = PyUnicode_DATA(self);
9677    length = PyUnicode_GET_LENGTH(self);
9678    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9679    if (tmp == NULL)
9680        return PyErr_NoMemory();
9681    newlength = perform(kind, data, length, tmp, &maxchar);
9682    res = PyUnicode_New(newlength, maxchar);
9683    if (res == NULL)
9684        goto leave;
9685    tmpend = tmp + newlength;
9686    outdata = PyUnicode_DATA(res);
9687    outkind = PyUnicode_KIND(res);
9688    switch (outkind) {
9689    case PyUnicode_1BYTE_KIND:
9690        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9691        break;
9692    case PyUnicode_2BYTE_KIND:
9693        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9694        break;
9695    case PyUnicode_4BYTE_KIND:
9696        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9697        break;
9698    default:
9699        assert(0);
9700        break;
9701    }
9702  leave:
9703    PyMem_FREE(tmp);
9704    return res;
9705}
9706
9707PyObject *
9708PyUnicode_Join(PyObject *separator, PyObject *seq)
9709{
9710    PyObject *sep = NULL;
9711    Py_ssize_t seplen;
9712    PyObject *res = NULL; /* the result */
9713    PyObject *fseq;          /* PySequence_Fast(seq) */
9714    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9715    PyObject **items;
9716    PyObject *item;
9717    Py_ssize_t sz, i, res_offset;
9718    Py_UCS4 maxchar;
9719    Py_UCS4 item_maxchar;
9720    int use_memcpy;
9721    unsigned char *res_data = NULL, *sep_data = NULL;
9722    PyObject *last_obj;
9723    unsigned int kind = 0;
9724
9725    fseq = PySequence_Fast(seq, "can only join an iterable");
9726    if (fseq == NULL) {
9727        return NULL;
9728    }
9729
9730    /* NOTE: the following code can't call back into Python code,
9731     * so we are sure that fseq won't be mutated.
9732     */
9733
9734    seqlen = PySequence_Fast_GET_SIZE(fseq);
9735    /* If empty sequence, return u"". */
9736    if (seqlen == 0) {
9737        Py_DECREF(fseq);
9738        _Py_RETURN_UNICODE_EMPTY();
9739    }
9740
9741    /* If singleton sequence with an exact Unicode, return that. */
9742    last_obj = NULL;
9743    items = PySequence_Fast_ITEMS(fseq);
9744    if (seqlen == 1) {
9745        if (PyUnicode_CheckExact(items[0])) {
9746            res = items[0];
9747            Py_INCREF(res);
9748            Py_DECREF(fseq);
9749            return res;
9750        }
9751        seplen = 0;
9752        maxchar = 0;
9753    }
9754    else {
9755        /* Set up sep and seplen */
9756        if (separator == NULL) {
9757            /* fall back to a blank space separator */
9758            sep = PyUnicode_FromOrdinal(' ');
9759            if (!sep)
9760                goto onError;
9761            seplen = 1;
9762            maxchar = 32;
9763        }
9764        else {
9765            if (!PyUnicode_Check(separator)) {
9766                PyErr_Format(PyExc_TypeError,
9767                             "separator: expected str instance,"
9768                             " %.80s found",
9769                             Py_TYPE(separator)->tp_name);
9770                goto onError;
9771            }
9772            if (PyUnicode_READY(separator))
9773                goto onError;
9774            sep = separator;
9775            seplen = PyUnicode_GET_LENGTH(separator);
9776            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9777            /* inc refcount to keep this code path symmetric with the
9778               above case of a blank separator */
9779            Py_INCREF(sep);
9780        }
9781        last_obj = sep;
9782    }
9783
9784    /* There are at least two things to join, or else we have a subclass
9785     * of str in the sequence.
9786     * Do a pre-pass to figure out the total amount of space we'll
9787     * need (sz), and see whether all argument are strings.
9788     */
9789    sz = 0;
9790#ifdef Py_DEBUG
9791    use_memcpy = 0;
9792#else
9793    use_memcpy = 1;
9794#endif
9795    for (i = 0; i < seqlen; i++) {
9796        const Py_ssize_t old_sz = sz;
9797        item = items[i];
9798        if (!PyUnicode_Check(item)) {
9799            PyErr_Format(PyExc_TypeError,
9800                         "sequence item %zd: expected str instance,"
9801                         " %.80s found",
9802                         i, Py_TYPE(item)->tp_name);
9803            goto onError;
9804        }
9805        if (PyUnicode_READY(item) == -1)
9806            goto onError;
9807        sz += PyUnicode_GET_LENGTH(item);
9808        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9809        maxchar = Py_MAX(maxchar, item_maxchar);
9810        if (i != 0)
9811            sz += seplen;
9812        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9813            PyErr_SetString(PyExc_OverflowError,
9814                            "join() result is too long for a Python string");
9815            goto onError;
9816        }
9817        if (use_memcpy && last_obj != NULL) {
9818            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9819                use_memcpy = 0;
9820        }
9821        last_obj = item;
9822    }
9823
9824    res = PyUnicode_New(sz, maxchar);
9825    if (res == NULL)
9826        goto onError;
9827
9828    /* Catenate everything. */
9829#ifdef Py_DEBUG
9830    use_memcpy = 0;
9831#else
9832    if (use_memcpy) {
9833        res_data = PyUnicode_1BYTE_DATA(res);
9834        kind = PyUnicode_KIND(res);
9835        if (seplen != 0)
9836            sep_data = PyUnicode_1BYTE_DATA(sep);
9837    }
9838#endif
9839    if (use_memcpy) {
9840        for (i = 0; i < seqlen; ++i) {
9841            Py_ssize_t itemlen;
9842            item = items[i];
9843
9844            /* Copy item, and maybe the separator. */
9845            if (i && seplen != 0) {
9846                Py_MEMCPY(res_data,
9847                          sep_data,
9848                          kind * seplen);
9849                res_data += kind * seplen;
9850            }
9851
9852            itemlen = PyUnicode_GET_LENGTH(item);
9853            if (itemlen != 0) {
9854                Py_MEMCPY(res_data,
9855                          PyUnicode_DATA(item),
9856                          kind * itemlen);
9857                res_data += kind * itemlen;
9858            }
9859        }
9860        assert(res_data == PyUnicode_1BYTE_DATA(res)
9861                           + kind * PyUnicode_GET_LENGTH(res));
9862    }
9863    else {
9864        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9865            Py_ssize_t itemlen;
9866            item = items[i];
9867
9868            /* Copy item, and maybe the separator. */
9869            if (i && seplen != 0) {
9870                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9871                res_offset += seplen;
9872            }
9873
9874            itemlen = PyUnicode_GET_LENGTH(item);
9875            if (itemlen != 0) {
9876                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9877                res_offset += itemlen;
9878            }
9879        }
9880        assert(res_offset == PyUnicode_GET_LENGTH(res));
9881    }
9882
9883    Py_DECREF(fseq);
9884    Py_XDECREF(sep);
9885    assert(_PyUnicode_CheckConsistency(res, 1));
9886    return res;
9887
9888  onError:
9889    Py_DECREF(fseq);
9890    Py_XDECREF(sep);
9891    Py_XDECREF(res);
9892    return NULL;
9893}
9894
/* Store `value` into `length` consecutive code units of the buffer
   `data`, starting at unit index `start`.  `kind` selects the unit
   width and must be one of the 1-, 2- or 4-byte kinds.

   This is a statement macro: `kind`, `value` and `length` may be
   evaluated more than once, so pass expressions without side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9918
9919void
9920_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9921                    Py_UCS4 fill_char)
9922{
9923    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9924    const void *data = PyUnicode_DATA(unicode);
9925    assert(PyUnicode_IS_READY(unicode));
9926    assert(unicode_modifiable(unicode));
9927    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9928    assert(start >= 0);
9929    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9930    FILL(kind, data, fill_char, start, length);
9931}
9932
9933Py_ssize_t
9934PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9935               Py_UCS4 fill_char)
9936{
9937    Py_ssize_t maxlen;
9938
9939    if (!PyUnicode_Check(unicode)) {
9940        PyErr_BadInternalCall();
9941        return -1;
9942    }
9943    if (PyUnicode_READY(unicode) == -1)
9944        return -1;
9945    if (unicode_check_modifiable(unicode))
9946        return -1;
9947
9948    if (start < 0) {
9949        PyErr_SetString(PyExc_IndexError, "string index out of range");
9950        return -1;
9951    }
9952    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9953        PyErr_SetString(PyExc_ValueError,
9954                         "fill character is bigger than "
9955                         "the string maximum character");
9956        return -1;
9957    }
9958
9959    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9960    length = Py_MIN(maxlen, length);
9961    if (length <= 0)
9962        return 0;
9963
9964    _PyUnicode_FastFill(unicode, start, length, fill_char);
9965    return length;
9966}
9967
9968static PyObject *
9969pad(PyObject *self,
9970    Py_ssize_t left,
9971    Py_ssize_t right,
9972    Py_UCS4 fill)
9973{
9974    PyObject *u;
9975    Py_UCS4 maxchar;
9976    int kind;
9977    void *data;
9978
9979    if (left < 0)
9980        left = 0;
9981    if (right < 0)
9982        right = 0;
9983
9984    if (left == 0 && right == 0)
9985        return unicode_result_unchanged(self);
9986
9987    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9988        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9989        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9990        return NULL;
9991    }
9992    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9993    maxchar = Py_MAX(maxchar, fill);
9994    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9995    if (!u)
9996        return NULL;
9997
9998    kind = PyUnicode_KIND(u);
9999    data = PyUnicode_DATA(u);
10000    if (left)
10001        FILL(kind, data, fill, 0, left);
10002    if (right)
10003        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10004    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10005    assert(_PyUnicode_CheckConsistency(u, 1));
10006    return u;
10007}
10008
10009PyObject *
10010PyUnicode_Splitlines(PyObject *string, int keepends)
10011{
10012    PyObject *list;
10013
10014    string = PyUnicode_FromObject(string);
10015    if (string == NULL)
10016        return NULL;
10017    if (PyUnicode_READY(string) == -1) {
10018        Py_DECREF(string);
10019        return NULL;
10020    }
10021
10022    switch (PyUnicode_KIND(string)) {
10023    case PyUnicode_1BYTE_KIND:
10024        if (PyUnicode_IS_ASCII(string))
10025            list = asciilib_splitlines(
10026                string, PyUnicode_1BYTE_DATA(string),
10027                PyUnicode_GET_LENGTH(string), keepends);
10028        else
10029            list = ucs1lib_splitlines(
10030                string, PyUnicode_1BYTE_DATA(string),
10031                PyUnicode_GET_LENGTH(string), keepends);
10032        break;
10033    case PyUnicode_2BYTE_KIND:
10034        list = ucs2lib_splitlines(
10035            string, PyUnicode_2BYTE_DATA(string),
10036            PyUnicode_GET_LENGTH(string), keepends);
10037        break;
10038    case PyUnicode_4BYTE_KIND:
10039        list = ucs4lib_splitlines(
10040            string, PyUnicode_4BYTE_DATA(string),
10041            PyUnicode_GET_LENGTH(string), keepends);
10042        break;
10043    default:
10044        assert(0);
10045        list = 0;
10046    }
10047    Py_DECREF(string);
10048    return list;
10049}
10050
10051static PyObject *
10052split(PyObject *self,
10053      PyObject *substring,
10054      Py_ssize_t maxcount)
10055{
10056    int kind1, kind2, kind;
10057    void *buf1, *buf2;
10058    Py_ssize_t len1, len2;
10059    PyObject* out;
10060
10061    if (maxcount < 0)
10062        maxcount = PY_SSIZE_T_MAX;
10063
10064    if (PyUnicode_READY(self) == -1)
10065        return NULL;
10066
10067    if (substring == NULL)
10068        switch (PyUnicode_KIND(self)) {
10069        case PyUnicode_1BYTE_KIND:
10070            if (PyUnicode_IS_ASCII(self))
10071                return asciilib_split_whitespace(
10072                    self,  PyUnicode_1BYTE_DATA(self),
10073                    PyUnicode_GET_LENGTH(self), maxcount
10074                    );
10075            else
10076                return ucs1lib_split_whitespace(
10077                    self,  PyUnicode_1BYTE_DATA(self),
10078                    PyUnicode_GET_LENGTH(self), maxcount
10079                    );
10080        case PyUnicode_2BYTE_KIND:
10081            return ucs2lib_split_whitespace(
10082                self,  PyUnicode_2BYTE_DATA(self),
10083                PyUnicode_GET_LENGTH(self), maxcount
10084                );
10085        case PyUnicode_4BYTE_KIND:
10086            return ucs4lib_split_whitespace(
10087                self,  PyUnicode_4BYTE_DATA(self),
10088                PyUnicode_GET_LENGTH(self), maxcount
10089                );
10090        default:
10091            assert(0);
10092            return NULL;
10093        }
10094
10095    if (PyUnicode_READY(substring) == -1)
10096        return NULL;
10097
10098    kind1 = PyUnicode_KIND(self);
10099    kind2 = PyUnicode_KIND(substring);
10100    kind = kind1 > kind2 ? kind1 : kind2;
10101    buf1 = PyUnicode_DATA(self);
10102    buf2 = PyUnicode_DATA(substring);
10103    if (kind1 != kind)
10104        buf1 = _PyUnicode_AsKind(self, kind);
10105    if (!buf1)
10106        return NULL;
10107    if (kind2 != kind)
10108        buf2 = _PyUnicode_AsKind(substring, kind);
10109    if (!buf2) {
10110        if (kind1 != kind) PyMem_Free(buf1);
10111        return NULL;
10112    }
10113    len1 = PyUnicode_GET_LENGTH(self);
10114    len2 = PyUnicode_GET_LENGTH(substring);
10115
10116    switch (kind) {
10117    case PyUnicode_1BYTE_KIND:
10118        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10119            out = asciilib_split(
10120                self,  buf1, len1, buf2, len2, maxcount);
10121        else
10122            out = ucs1lib_split(
10123                self,  buf1, len1, buf2, len2, maxcount);
10124        break;
10125    case PyUnicode_2BYTE_KIND:
10126        out = ucs2lib_split(
10127            self,  buf1, len1, buf2, len2, maxcount);
10128        break;
10129    case PyUnicode_4BYTE_KIND:
10130        out = ucs4lib_split(
10131            self,  buf1, len1, buf2, len2, maxcount);
10132        break;
10133    default:
10134        out = NULL;
10135    }
10136    if (kind1 != kind)
10137        PyMem_Free(buf1);
10138    if (kind2 != kind)
10139        PyMem_Free(buf2);
10140    return out;
10141}
10142
10143static PyObject *
10144rsplit(PyObject *self,
10145       PyObject *substring,
10146       Py_ssize_t maxcount)
10147{
10148    int kind1, kind2, kind;
10149    void *buf1, *buf2;
10150    Py_ssize_t len1, len2;
10151    PyObject* out;
10152
10153    if (maxcount < 0)
10154        maxcount = PY_SSIZE_T_MAX;
10155
10156    if (PyUnicode_READY(self) == -1)
10157        return NULL;
10158
10159    if (substring == NULL)
10160        switch (PyUnicode_KIND(self)) {
10161        case PyUnicode_1BYTE_KIND:
10162            if (PyUnicode_IS_ASCII(self))
10163                return asciilib_rsplit_whitespace(
10164                    self,  PyUnicode_1BYTE_DATA(self),
10165                    PyUnicode_GET_LENGTH(self), maxcount
10166                    );
10167            else
10168                return ucs1lib_rsplit_whitespace(
10169                    self,  PyUnicode_1BYTE_DATA(self),
10170                    PyUnicode_GET_LENGTH(self), maxcount
10171                    );
10172        case PyUnicode_2BYTE_KIND:
10173            return ucs2lib_rsplit_whitespace(
10174                self,  PyUnicode_2BYTE_DATA(self),
10175                PyUnicode_GET_LENGTH(self), maxcount
10176                );
10177        case PyUnicode_4BYTE_KIND:
10178            return ucs4lib_rsplit_whitespace(
10179                self,  PyUnicode_4BYTE_DATA(self),
10180                PyUnicode_GET_LENGTH(self), maxcount
10181                );
10182        default:
10183            assert(0);
10184            return NULL;
10185        }
10186
10187    if (PyUnicode_READY(substring) == -1)
10188        return NULL;
10189
10190    kind1 = PyUnicode_KIND(self);
10191    kind2 = PyUnicode_KIND(substring);
10192    kind = kind1 > kind2 ? kind1 : kind2;
10193    buf1 = PyUnicode_DATA(self);
10194    buf2 = PyUnicode_DATA(substring);
10195    if (kind1 != kind)
10196        buf1 = _PyUnicode_AsKind(self, kind);
10197    if (!buf1)
10198        return NULL;
10199    if (kind2 != kind)
10200        buf2 = _PyUnicode_AsKind(substring, kind);
10201    if (!buf2) {
10202        if (kind1 != kind) PyMem_Free(buf1);
10203        return NULL;
10204    }
10205    len1 = PyUnicode_GET_LENGTH(self);
10206    len2 = PyUnicode_GET_LENGTH(substring);
10207
10208    switch (kind) {
10209    case PyUnicode_1BYTE_KIND:
10210        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10211            out = asciilib_rsplit(
10212                self,  buf1, len1, buf2, len2, maxcount);
10213        else
10214            out = ucs1lib_rsplit(
10215                self,  buf1, len1, buf2, len2, maxcount);
10216        break;
10217    case PyUnicode_2BYTE_KIND:
10218        out = ucs2lib_rsplit(
10219            self,  buf1, len1, buf2, len2, maxcount);
10220        break;
10221    case PyUnicode_4BYTE_KIND:
10222        out = ucs4lib_rsplit(
10223            self,  buf1, len1, buf2, len2, maxcount);
10224        break;
10225    default:
10226        out = NULL;
10227    }
10228    if (kind1 != kind)
10229        PyMem_Free(buf1);
10230    if (kind2 != kind)
10231        PyMem_Free(buf2);
10232    return out;
10233}
10234
10235static Py_ssize_t
10236anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10237            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10238{
10239    switch (kind) {
10240    case PyUnicode_1BYTE_KIND:
10241        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10242            return asciilib_find(buf1, len1, buf2, len2, offset);
10243        else
10244            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10245    case PyUnicode_2BYTE_KIND:
10246        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10247    case PyUnicode_4BYTE_KIND:
10248        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10249    }
10250    assert(0);
10251    return -1;
10252}
10253
10254static Py_ssize_t
10255anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10256             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10257{
10258    switch (kind) {
10259    case PyUnicode_1BYTE_KIND:
10260        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10261            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10262        else
10263            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10264    case PyUnicode_2BYTE_KIND:
10265        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10266    case PyUnicode_4BYTE_KIND:
10267        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10268    }
10269    assert(0);
10270    return 0;
10271}
10272
10273static void
10274replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10275                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10276{
10277    int kind = PyUnicode_KIND(u);
10278    void *data = PyUnicode_DATA(u);
10279    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10280    if (kind == PyUnicode_1BYTE_KIND) {
10281        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10282                                      (Py_UCS1 *)data + len,
10283                                      u1, u2, maxcount);
10284    }
10285    else if (kind == PyUnicode_2BYTE_KIND) {
10286        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10287                                      (Py_UCS2 *)data + len,
10288                                      u1, u2, maxcount);
10289    }
10290    else {
10291        assert(kind == PyUnicode_4BYTE_KIND);
10292        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10293                                      (Py_UCS4 *)data + len,
10294                                      u1, u2, maxcount);
10295    }
10296}
10297
10298static PyObject *
10299replace(PyObject *self, PyObject *str1,
10300        PyObject *str2, Py_ssize_t maxcount)
10301{
10302    PyObject *u;
10303    char *sbuf = PyUnicode_DATA(self);
10304    char *buf1 = PyUnicode_DATA(str1);
10305    char *buf2 = PyUnicode_DATA(str2);
10306    int srelease = 0, release1 = 0, release2 = 0;
10307    int skind = PyUnicode_KIND(self);
10308    int kind1 = PyUnicode_KIND(str1);
10309    int kind2 = PyUnicode_KIND(str2);
10310    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10311    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10312    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10313    int mayshrink;
10314    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10315
10316    if (maxcount < 0)
10317        maxcount = PY_SSIZE_T_MAX;
10318    else if (maxcount == 0 || slen == 0)
10319        goto nothing;
10320
10321    if (str1 == str2)
10322        goto nothing;
10323
10324    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10325    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10326    if (maxchar < maxchar_str1)
10327        /* substring too wide to be present */
10328        goto nothing;
10329    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10330    /* Replacing str1 with str2 may cause a maxchar reduction in the
10331       result string. */
10332    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10333    maxchar = Py_MAX(maxchar, maxchar_str2);
10334
10335    if (len1 == len2) {
10336        /* same length */
10337        if (len1 == 0)
10338            goto nothing;
10339        if (len1 == 1) {
10340            /* replace characters */
10341            Py_UCS4 u1, u2;
10342            Py_ssize_t pos;
10343
10344            u1 = PyUnicode_READ(kind1, buf1, 0);
10345            pos = findchar(sbuf, skind, slen, u1, 1);
10346            if (pos < 0)
10347                goto nothing;
10348            u2 = PyUnicode_READ(kind2, buf2, 0);
10349            u = PyUnicode_New(slen, maxchar);
10350            if (!u)
10351                goto error;
10352
10353            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10354            replace_1char_inplace(u, pos, u1, u2, maxcount);
10355        }
10356        else {
10357            int rkind = skind;
10358            char *res;
10359            Py_ssize_t i;
10360
10361            if (kind1 < rkind) {
10362                /* widen substring */
10363                buf1 = _PyUnicode_AsKind(str1, rkind);
10364                if (!buf1) goto error;
10365                release1 = 1;
10366            }
10367            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10368            if (i < 0)
10369                goto nothing;
10370            if (rkind > kind2) {
10371                /* widen replacement */
10372                buf2 = _PyUnicode_AsKind(str2, rkind);
10373                if (!buf2) goto error;
10374                release2 = 1;
10375            }
10376            else if (rkind < kind2) {
10377                /* widen self and buf1 */
10378                rkind = kind2;
10379                if (release1) PyMem_Free(buf1);
10380                release1 = 0;
10381                sbuf = _PyUnicode_AsKind(self, rkind);
10382                if (!sbuf) goto error;
10383                srelease = 1;
10384                buf1 = _PyUnicode_AsKind(str1, rkind);
10385                if (!buf1) goto error;
10386                release1 = 1;
10387            }
10388            u = PyUnicode_New(slen, maxchar);
10389            if (!u)
10390                goto error;
10391            assert(PyUnicode_KIND(u) == rkind);
10392            res = PyUnicode_DATA(u);
10393
10394            memcpy(res, sbuf, rkind * slen);
10395            /* change everything in-place, starting with this one */
10396            memcpy(res + rkind * i,
10397                   buf2,
10398                   rkind * len2);
10399            i += len1;
10400
10401            while ( --maxcount > 0) {
10402                i = anylib_find(rkind, self,
10403                                sbuf+rkind*i, slen-i,
10404                                str1, buf1, len1, i);
10405                if (i == -1)
10406                    break;
10407                memcpy(res + rkind * i,
10408                       buf2,
10409                       rkind * len2);
10410                i += len1;
10411            }
10412        }
10413    }
10414    else {
10415        Py_ssize_t n, i, j, ires;
10416        Py_ssize_t new_size;
10417        int rkind = skind;
10418        char *res;
10419
10420        if (kind1 < rkind) {
10421            /* widen substring */
10422            buf1 = _PyUnicode_AsKind(str1, rkind);
10423            if (!buf1) goto error;
10424            release1 = 1;
10425        }
10426        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10427        if (n == 0)
10428            goto nothing;
10429        if (kind2 < rkind) {
10430            /* widen replacement */
10431            buf2 = _PyUnicode_AsKind(str2, rkind);
10432            if (!buf2) goto error;
10433            release2 = 1;
10434        }
10435        else if (kind2 > rkind) {
10436            /* widen self and buf1 */
10437            rkind = kind2;
10438            sbuf = _PyUnicode_AsKind(self, rkind);
10439            if (!sbuf) goto error;
10440            srelease = 1;
10441            if (release1) PyMem_Free(buf1);
10442            release1 = 0;
10443            buf1 = _PyUnicode_AsKind(str1, rkind);
10444            if (!buf1) goto error;
10445            release1 = 1;
10446        }
10447        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10448           PyUnicode_GET_LENGTH(str1))); */
10449        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10450                PyErr_SetString(PyExc_OverflowError,
10451                                "replace string is too long");
10452                goto error;
10453        }
10454        new_size = slen + n * (len2 - len1);
10455        if (new_size == 0) {
10456            _Py_INCREF_UNICODE_EMPTY();
10457            if (!unicode_empty)
10458                goto error;
10459            u = unicode_empty;
10460            goto done;
10461        }
10462        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10463            PyErr_SetString(PyExc_OverflowError,
10464                            "replace string is too long");
10465            goto error;
10466        }
10467        u = PyUnicode_New(new_size, maxchar);
10468        if (!u)
10469            goto error;
10470        assert(PyUnicode_KIND(u) == rkind);
10471        res = PyUnicode_DATA(u);
10472        ires = i = 0;
10473        if (len1 > 0) {
10474            while (n-- > 0) {
10475                /* look for next match */
10476                j = anylib_find(rkind, self,
10477                                sbuf + rkind * i, slen-i,
10478                                str1, buf1, len1, i);
10479                if (j == -1)
10480                    break;
10481                else if (j > i) {
10482                    /* copy unchanged part [i:j] */
10483                    memcpy(res + rkind * ires,
10484                           sbuf + rkind * i,
10485                           rkind * (j-i));
10486                    ires += j - i;
10487                }
10488                /* copy substitution string */
10489                if (len2 > 0) {
10490                    memcpy(res + rkind * ires,
10491                           buf2,
10492                           rkind * len2);
10493                    ires += len2;
10494                }
10495                i = j + len1;
10496            }
10497            if (i < slen)
10498                /* copy tail [i:] */
10499                memcpy(res + rkind * ires,
10500                       sbuf + rkind * i,
10501                       rkind * (slen-i));
10502        }
10503        else {
10504            /* interleave */
10505            while (n > 0) {
10506                memcpy(res + rkind * ires,
10507                       buf2,
10508                       rkind * len2);
10509                ires += len2;
10510                if (--n <= 0)
10511                    break;
10512                memcpy(res + rkind * ires,
10513                       sbuf + rkind * i,
10514                       rkind);
10515                ires++;
10516                i++;
10517            }
10518            memcpy(res + rkind * ires,
10519                   sbuf + rkind * i,
10520                   rkind * (slen-i));
10521        }
10522    }
10523
10524    if (mayshrink) {
10525        unicode_adjust_maxchar(&u);
10526        if (u == NULL)
10527            goto error;
10528    }
10529
10530  done:
10531    if (srelease)
10532        PyMem_FREE(sbuf);
10533    if (release1)
10534        PyMem_FREE(buf1);
10535    if (release2)
10536        PyMem_FREE(buf2);
10537    assert(_PyUnicode_CheckConsistency(u, 1));
10538    return u;
10539
10540  nothing:
10541    /* nothing to replace; return original string (when possible) */
10542    if (srelease)
10543        PyMem_FREE(sbuf);
10544    if (release1)
10545        PyMem_FREE(buf1);
10546    if (release2)
10547        PyMem_FREE(buf2);
10548    return unicode_result_unchanged(self);
10549
10550  error:
10551    if (srelease && sbuf)
10552        PyMem_FREE(sbuf);
10553    if (release1 && buf1)
10554        PyMem_FREE(buf1);
10555    if (release2 && buf2)
10556        PyMem_FREE(buf2);
10557    return NULL;
10558}
10559
10560/* --- Unicode Object Methods --------------------------------------------- */
10561
10562PyDoc_STRVAR(title__doc__,
10563             "S.title() -> str\n\
10564\n\
10565Return a titlecased version of S, i.e. words start with title case\n\
10566characters, all remaining cased characters have lower case.");
10567
10568static PyObject*
10569unicode_title(PyObject *self)
10570{
10571    if (PyUnicode_READY(self) == -1)
10572        return NULL;
10573    return case_operation(self, do_title);
10574}
10575
10576PyDoc_STRVAR(capitalize__doc__,
10577             "S.capitalize() -> str\n\
10578\n\
10579Return a capitalized version of S, i.e. make the first character\n\
10580have upper case and the rest lower case.");
10581
10582static PyObject*
10583unicode_capitalize(PyObject *self)
10584{
10585    if (PyUnicode_READY(self) == -1)
10586        return NULL;
10587    if (PyUnicode_GET_LENGTH(self) == 0)
10588        return unicode_result_unchanged(self);
10589    return case_operation(self, do_capitalize);
10590}
10591
10592PyDoc_STRVAR(casefold__doc__,
10593             "S.casefold() -> str\n\
10594\n\
10595Return a version of S suitable for caseless comparisons.");
10596
10597static PyObject *
10598unicode_casefold(PyObject *self)
10599{
10600    if (PyUnicode_READY(self) == -1)
10601        return NULL;
10602    if (PyUnicode_IS_ASCII(self))
10603        return ascii_upper_or_lower(self, 1);
10604    return case_operation(self, do_casefold);
10605}
10606
10607
10608/* Argument converter.  Coerces to a single unicode character */
10609
10610static int
10611convert_uc(PyObject *obj, void *addr)
10612{
10613    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10614    PyObject *uniobj;
10615
10616    uniobj = PyUnicode_FromObject(obj);
10617    if (uniobj == NULL) {
10618        PyErr_SetString(PyExc_TypeError,
10619                        "The fill character cannot be converted to Unicode");
10620        return 0;
10621    }
10622    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10623        PyErr_SetString(PyExc_TypeError,
10624                        "The fill character must be exactly one character long");
10625        Py_DECREF(uniobj);
10626        return 0;
10627    }
10628    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10629    Py_DECREF(uniobj);
10630    return 1;
10631}
10632
10633PyDoc_STRVAR(center__doc__,
10634             "S.center(width[, fillchar]) -> str\n\
10635\n\
10636Return S centered in a string of length width. Padding is\n\
10637done using the specified fill character (default is a space)");
10638
10639static PyObject *
10640unicode_center(PyObject *self, PyObject *args)
10641{
10642    Py_ssize_t marg, left;
10643    Py_ssize_t width;
10644    Py_UCS4 fillchar = ' ';
10645
10646    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10647        return NULL;
10648
10649    if (PyUnicode_READY(self) == -1)
10650        return NULL;
10651
10652    if (PyUnicode_GET_LENGTH(self) >= width)
10653        return unicode_result_unchanged(self);
10654
10655    marg = width - PyUnicode_GET_LENGTH(self);
10656    left = marg / 2 + (marg & width & 1);
10657
10658    return pad(self, left, marg - left, fillchar);
10659}
10660
10661/* This function assumes that str1 and str2 are readied by the caller. */
10662
10663static int
10664unicode_compare(PyObject *str1, PyObject *str2)
10665{
10666#define COMPARE(TYPE1, TYPE2) \
10667    do { \
10668        TYPE1* p1 = (TYPE1 *)data1; \
10669        TYPE2* p2 = (TYPE2 *)data2; \
10670        TYPE1* end = p1 + len; \
10671        Py_UCS4 c1, c2; \
10672        for (; p1 != end; p1++, p2++) { \
10673            c1 = *p1; \
10674            c2 = *p2; \
10675            if (c1 != c2) \
10676                return (c1 < c2) ? -1 : 1; \
10677        } \
10678    } \
10679    while (0)
10680
10681    int kind1, kind2;
10682    void *data1, *data2;
10683    Py_ssize_t len1, len2, len;
10684
10685    kind1 = PyUnicode_KIND(str1);
10686    kind2 = PyUnicode_KIND(str2);
10687    data1 = PyUnicode_DATA(str1);
10688    data2 = PyUnicode_DATA(str2);
10689    len1 = PyUnicode_GET_LENGTH(str1);
10690    len2 = PyUnicode_GET_LENGTH(str2);
10691    len = Py_MIN(len1, len2);
10692
10693    switch(kind1) {
10694    case PyUnicode_1BYTE_KIND:
10695    {
10696        switch(kind2) {
10697        case PyUnicode_1BYTE_KIND:
10698        {
10699            int cmp = memcmp(data1, data2, len);
10700            /* normalize result of memcmp() into the range [-1; 1] */
10701            if (cmp < 0)
10702                return -1;
10703            if (cmp > 0)
10704                return 1;
10705            break;
10706        }
10707        case PyUnicode_2BYTE_KIND:
10708            COMPARE(Py_UCS1, Py_UCS2);
10709            break;
10710        case PyUnicode_4BYTE_KIND:
10711            COMPARE(Py_UCS1, Py_UCS4);
10712            break;
10713        default:
10714            assert(0);
10715        }
10716        break;
10717    }
10718    case PyUnicode_2BYTE_KIND:
10719    {
10720        switch(kind2) {
10721        case PyUnicode_1BYTE_KIND:
10722            COMPARE(Py_UCS2, Py_UCS1);
10723            break;
10724        case PyUnicode_2BYTE_KIND:
10725        {
10726            COMPARE(Py_UCS2, Py_UCS2);
10727            break;
10728        }
10729        case PyUnicode_4BYTE_KIND:
10730            COMPARE(Py_UCS2, Py_UCS4);
10731            break;
10732        default:
10733            assert(0);
10734        }
10735        break;
10736    }
10737    case PyUnicode_4BYTE_KIND:
10738    {
10739        switch(kind2) {
10740        case PyUnicode_1BYTE_KIND:
10741            COMPARE(Py_UCS4, Py_UCS1);
10742            break;
10743        case PyUnicode_2BYTE_KIND:
10744            COMPARE(Py_UCS4, Py_UCS2);
10745            break;
10746        case PyUnicode_4BYTE_KIND:
10747        {
10748#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10749            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10750            /* normalize result of wmemcmp() into the range [-1; 1] */
10751            if (cmp < 0)
10752                return -1;
10753            if (cmp > 0)
10754                return 1;
10755#else
10756            COMPARE(Py_UCS4, Py_UCS4);
10757#endif
10758            break;
10759        }
10760        default:
10761            assert(0);
10762        }
10763        break;
10764    }
10765    default:
10766        assert(0);
10767    }
10768
10769    if (len1 == len2)
10770        return 0;
10771    if (len1 < len2)
10772        return -1;
10773    else
10774        return 1;
10775
10776#undef COMPARE
10777}
10778
10779Py_LOCAL(int)
10780unicode_compare_eq(PyObject *str1, PyObject *str2)
10781{
10782    int kind;
10783    void *data1, *data2;
10784    Py_ssize_t len;
10785    int cmp;
10786
10787    len = PyUnicode_GET_LENGTH(str1);
10788    if (PyUnicode_GET_LENGTH(str2) != len)
10789        return 0;
10790    kind = PyUnicode_KIND(str1);
10791    if (PyUnicode_KIND(str2) != kind)
10792        return 0;
10793    data1 = PyUnicode_DATA(str1);
10794    data2 = PyUnicode_DATA(str2);
10795
10796    cmp = memcmp(data1, data2, len * kind);
10797    return (cmp == 0);
10798}
10799
10800
10801int
10802PyUnicode_Compare(PyObject *left, PyObject *right)
10803{
10804    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10805        if (PyUnicode_READY(left) == -1 ||
10806            PyUnicode_READY(right) == -1)
10807            return -1;
10808
10809        /* a string is equal to itself */
10810        if (left == right)
10811            return 0;
10812
10813        return unicode_compare(left, right);
10814    }
10815    PyErr_Format(PyExc_TypeError,
10816                 "Can't compare %.100s and %.100s",
10817                 left->ob_type->tp_name,
10818                 right->ob_type->tp_name);
10819    return -1;
10820}
10821
10822int
10823_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10824{
10825    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10826    if (right_str == NULL)
10827        return -1;
10828    return PyUnicode_Compare(left, right_str);
10829}
10830
10831int
10832PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10833{
10834    Py_ssize_t i;
10835    int kind;
10836    Py_UCS4 chr;
10837
10838    assert(_PyUnicode_CHECK(uni));
10839    if (PyUnicode_READY(uni) == -1)
10840        return -1;
10841    kind = PyUnicode_KIND(uni);
10842    if (kind == PyUnicode_1BYTE_KIND) {
10843        const void *data = PyUnicode_1BYTE_DATA(uni);
10844        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10845        size_t len, len2 = strlen(str);
10846        int cmp;
10847
10848        len = Py_MIN(len1, len2);
10849        cmp = memcmp(data, str, len);
10850        if (cmp != 0) {
10851            if (cmp < 0)
10852                return -1;
10853            else
10854                return 1;
10855        }
10856        if (len1 > len2)
10857            return 1; /* uni is longer */
10858        if (len2 > len1)
10859            return -1; /* str is longer */
10860        return 0;
10861    }
10862    else {
10863        void *data = PyUnicode_DATA(uni);
10864        /* Compare Unicode string and source character set string */
10865        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10866            if (chr != (unsigned char)str[i])
10867                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10868        /* This check keeps Python strings that end in '\0' from comparing equal
10869         to C strings identical up to that point. */
10870        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10871            return 1; /* uni is longer */
10872        if (str[i])
10873            return -1; /* str is longer */
10874        return 0;
10875    }
10876}
10877
10878
10879#define TEST_COND(cond)                         \
10880    ((cond) ? Py_True : Py_False)
10881
10882PyObject *
10883PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10884{
10885    int result;
10886    PyObject *v;
10887
10888    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10889        Py_RETURN_NOTIMPLEMENTED;
10890
10891    if (PyUnicode_READY(left) == -1 ||
10892        PyUnicode_READY(right) == -1)
10893        return NULL;
10894
10895    if (left == right) {
10896        switch (op) {
10897        case Py_EQ:
10898        case Py_LE:
10899        case Py_GE:
10900            /* a string is equal to itself */
10901            v = Py_True;
10902            break;
10903        case Py_NE:
10904        case Py_LT:
10905        case Py_GT:
10906            v = Py_False;
10907            break;
10908        default:
10909            PyErr_BadArgument();
10910            return NULL;
10911        }
10912    }
10913    else if (op == Py_EQ || op == Py_NE) {
10914        result = unicode_compare_eq(left, right);
10915        result ^= (op == Py_NE);
10916        v = TEST_COND(result);
10917    }
10918    else {
10919        result = unicode_compare(left, right);
10920
10921        /* Convert the return value to a Boolean */
10922        switch (op) {
10923        case Py_LE:
10924            v = TEST_COND(result <= 0);
10925            break;
10926        case Py_GE:
10927            v = TEST_COND(result >= 0);
10928            break;
10929        case Py_LT:
10930            v = TEST_COND(result == -1);
10931            break;
10932        case Py_GT:
10933            v = TEST_COND(result == 1);
10934            break;
10935        default:
10936            PyErr_BadArgument();
10937            return NULL;
10938        }
10939    }
10940    Py_INCREF(v);
10941    return v;
10942}
10943
10944int
10945PyUnicode_Contains(PyObject *container, PyObject *element)
10946{
10947    PyObject *str, *sub;
10948    int kind1, kind2;
10949    void *buf1, *buf2;
10950    Py_ssize_t len1, len2;
10951    int result;
10952
10953    /* Coerce the two arguments */
10954    sub = PyUnicode_FromObject(element);
10955    if (!sub) {
10956        PyErr_Format(PyExc_TypeError,
10957                     "'in <string>' requires string as left operand, not %s",
10958                     element->ob_type->tp_name);
10959        return -1;
10960    }
10961
10962    str = PyUnicode_FromObject(container);
10963    if (!str) {
10964        Py_DECREF(sub);
10965        return -1;
10966    }
10967
10968    kind1 = PyUnicode_KIND(str);
10969    kind2 = PyUnicode_KIND(sub);
10970    buf1 = PyUnicode_DATA(str);
10971    buf2 = PyUnicode_DATA(sub);
10972    if (kind2 != kind1) {
10973        if (kind2 > kind1) {
10974            Py_DECREF(sub);
10975            Py_DECREF(str);
10976            return 0;
10977        }
10978        buf2 = _PyUnicode_AsKind(sub, kind1);
10979    }
10980    if (!buf2) {
10981        Py_DECREF(sub);
10982        Py_DECREF(str);
10983        return -1;
10984    }
10985    len1 = PyUnicode_GET_LENGTH(str);
10986    len2 = PyUnicode_GET_LENGTH(sub);
10987
10988    switch (kind1) {
10989    case PyUnicode_1BYTE_KIND:
10990        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10991        break;
10992    case PyUnicode_2BYTE_KIND:
10993        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10994        break;
10995    case PyUnicode_4BYTE_KIND:
10996        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10997        break;
10998    default:
10999        result = -1;
11000        assert(0);
11001    }
11002
11003    Py_DECREF(str);
11004    Py_DECREF(sub);
11005
11006    if (kind2 != kind1)
11007        PyMem_Free(buf2);
11008
11009    return result;
11010}
11011
11012/* Concat to string or Unicode object giving a new Unicode object. */
11013
11014PyObject *
11015PyUnicode_Concat(PyObject *left, PyObject *right)
11016{
11017    PyObject *u = NULL, *v = NULL, *w;
11018    Py_UCS4 maxchar, maxchar2;
11019    Py_ssize_t u_len, v_len, new_len;
11020
11021    /* Coerce the two arguments */
11022    u = PyUnicode_FromObject(left);
11023    if (u == NULL)
11024        goto onError;
11025    v = PyUnicode_FromObject(right);
11026    if (v == NULL)
11027        goto onError;
11028
11029    /* Shortcuts */
11030    if (v == unicode_empty) {
11031        Py_DECREF(v);
11032        return u;
11033    }
11034    if (u == unicode_empty) {
11035        Py_DECREF(u);
11036        return v;
11037    }
11038
11039    u_len = PyUnicode_GET_LENGTH(u);
11040    v_len = PyUnicode_GET_LENGTH(v);
11041    if (u_len > PY_SSIZE_T_MAX - v_len) {
11042        PyErr_SetString(PyExc_OverflowError,
11043                        "strings are too large to concat");
11044        goto onError;
11045    }
11046    new_len = u_len + v_len;
11047
11048    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11049    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11050    maxchar = Py_MAX(maxchar, maxchar2);
11051
11052    /* Concat the two Unicode strings */
11053    w = PyUnicode_New(new_len, maxchar);
11054    if (w == NULL)
11055        goto onError;
11056    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11057    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11058    Py_DECREF(u);
11059    Py_DECREF(v);
11060    assert(_PyUnicode_CheckConsistency(w, 1));
11061    return w;
11062
11063  onError:
11064    Py_XDECREF(u);
11065    Py_XDECREF(v);
11066    return NULL;
11067}
11068
11069void
11070PyUnicode_Append(PyObject **p_left, PyObject *right)
11071{
11072    PyObject *left, *res;
11073    Py_UCS4 maxchar, maxchar2;
11074    Py_ssize_t left_len, right_len, new_len;
11075
11076    if (p_left == NULL) {
11077        if (!PyErr_Occurred())
11078            PyErr_BadInternalCall();
11079        return;
11080    }
11081    left = *p_left;
11082    if (right == NULL || left == NULL
11083        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11084        if (!PyErr_Occurred())
11085            PyErr_BadInternalCall();
11086        goto error;
11087    }
11088
11089    if (PyUnicode_READY(left) == -1)
11090        goto error;
11091    if (PyUnicode_READY(right) == -1)
11092        goto error;
11093
11094    /* Shortcuts */
11095    if (left == unicode_empty) {
11096        Py_DECREF(left);
11097        Py_INCREF(right);
11098        *p_left = right;
11099        return;
11100    }
11101    if (right == unicode_empty)
11102        return;
11103
11104    left_len = PyUnicode_GET_LENGTH(left);
11105    right_len = PyUnicode_GET_LENGTH(right);
11106    if (left_len > PY_SSIZE_T_MAX - right_len) {
11107        PyErr_SetString(PyExc_OverflowError,
11108                        "strings are too large to concat");
11109        goto error;
11110    }
11111    new_len = left_len + right_len;
11112
11113    if (unicode_modifiable(left)
11114        && PyUnicode_CheckExact(right)
11115        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11116        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11117           to change the structure size, but characters are stored just after
11118           the structure, and so it requires to move all characters which is
11119           not so different than duplicating the string. */
11120        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11121    {
11122        /* append inplace */
11123        if (unicode_resize(p_left, new_len) != 0)
11124            goto error;
11125
11126        /* copy 'right' into the newly allocated area of 'left' */
11127        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11128    }
11129    else {
11130        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11131        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11132        maxchar = Py_MAX(maxchar, maxchar2);
11133
11134        /* Concat the two Unicode strings */
11135        res = PyUnicode_New(new_len, maxchar);
11136        if (res == NULL)
11137            goto error;
11138        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11139        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11140        Py_DECREF(left);
11141        *p_left = res;
11142    }
11143    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11144    return;
11145
11146error:
11147    Py_CLEAR(*p_left);
11148}
11149
11150void
11151PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11152{
11153    PyUnicode_Append(pleft, right);
11154    Py_XDECREF(right);
11155}
11156
11157PyDoc_STRVAR(count__doc__,
11158             "S.count(sub[, start[, end]]) -> int\n\
11159\n\
11160Return the number of non-overlapping occurrences of substring sub in\n\
11161string S[start:end].  Optional arguments start and end are\n\
11162interpreted as in slice notation.");
11163
11164static PyObject *
11165unicode_count(PyObject *self, PyObject *args)
11166{
11167    PyObject *substring;
11168    Py_ssize_t start = 0;
11169    Py_ssize_t end = PY_SSIZE_T_MAX;
11170    PyObject *result;
11171    int kind1, kind2, kind;
11172    void *buf1, *buf2;
11173    Py_ssize_t len1, len2, iresult;
11174
11175    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11176                                            &start, &end))
11177        return NULL;
11178
11179    kind1 = PyUnicode_KIND(self);
11180    kind2 = PyUnicode_KIND(substring);
11181    if (kind2 > kind1) {
11182        Py_DECREF(substring);
11183        return PyLong_FromLong(0);
11184    }
11185    kind = kind1;
11186    buf1 = PyUnicode_DATA(self);
11187    buf2 = PyUnicode_DATA(substring);
11188    if (kind2 != kind)
11189        buf2 = _PyUnicode_AsKind(substring, kind);
11190    if (!buf2) {
11191        Py_DECREF(substring);
11192        return NULL;
11193    }
11194    len1 = PyUnicode_GET_LENGTH(self);
11195    len2 = PyUnicode_GET_LENGTH(substring);
11196
11197    ADJUST_INDICES(start, end, len1);
11198    switch (kind) {
11199    case PyUnicode_1BYTE_KIND:
11200        iresult = ucs1lib_count(
11201            ((Py_UCS1*)buf1) + start, end - start,
11202            buf2, len2, PY_SSIZE_T_MAX
11203            );
11204        break;
11205    case PyUnicode_2BYTE_KIND:
11206        iresult = ucs2lib_count(
11207            ((Py_UCS2*)buf1) + start, end - start,
11208            buf2, len2, PY_SSIZE_T_MAX
11209            );
11210        break;
11211    case PyUnicode_4BYTE_KIND:
11212        iresult = ucs4lib_count(
11213            ((Py_UCS4*)buf1) + start, end - start,
11214            buf2, len2, PY_SSIZE_T_MAX
11215            );
11216        break;
11217    default:
11218        assert(0); iresult = 0;
11219    }
11220
11221    result = PyLong_FromSsize_t(iresult);
11222
11223    if (kind2 != kind)
11224        PyMem_Free(buf2);
11225
11226    Py_DECREF(substring);
11227
11228    return result;
11229}
11230
11231PyDoc_STRVAR(encode__doc__,
11232             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11233\n\
11234Encode S using the codec registered for encoding. Default encoding\n\
11235is 'utf-8'. errors may be given to set a different error\n\
11236handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11237a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11238'xmlcharrefreplace' as well as any other name registered with\n\
11239codecs.register_error that can handle UnicodeEncodeErrors.");
11240
11241static PyObject *
11242unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11243{
11244    static char *kwlist[] = {"encoding", "errors", 0};
11245    char *encoding = NULL;
11246    char *errors = NULL;
11247
11248    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11249                                     kwlist, &encoding, &errors))
11250        return NULL;
11251    return PyUnicode_AsEncodedString(self, encoding, errors);
11252}
11253
11254PyDoc_STRVAR(expandtabs__doc__,
11255             "S.expandtabs(tabsize=8) -> str\n\
11256\n\
11257Return a copy of S where all tab characters are expanded using spaces.\n\
11258If tabsize is not given, a tab size of 8 characters is assumed.");
11259
11260static PyObject*
11261unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11262{
11263    Py_ssize_t i, j, line_pos, src_len, incr;
11264    Py_UCS4 ch;
11265    PyObject *u;
11266    void *src_data, *dest_data;
11267    static char *kwlist[] = {"tabsize", 0};
11268    int tabsize = 8;
11269    int kind;
11270    int found;
11271
11272    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11273                                     kwlist, &tabsize))
11274        return NULL;
11275
11276    if (PyUnicode_READY(self) == -1)
11277        return NULL;
11278
11279    /* First pass: determine size of output string */
11280    src_len = PyUnicode_GET_LENGTH(self);
11281    i = j = line_pos = 0;
11282    kind = PyUnicode_KIND(self);
11283    src_data = PyUnicode_DATA(self);
11284    found = 0;
11285    for (; i < src_len; i++) {
11286        ch = PyUnicode_READ(kind, src_data, i);
11287        if (ch == '\t') {
11288            found = 1;
11289            if (tabsize > 0) {
11290                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11291                if (j > PY_SSIZE_T_MAX - incr)
11292                    goto overflow;
11293                line_pos += incr;
11294                j += incr;
11295            }
11296        }
11297        else {
11298            if (j > PY_SSIZE_T_MAX - 1)
11299                goto overflow;
11300            line_pos++;
11301            j++;
11302            if (ch == '\n' || ch == '\r')
11303                line_pos = 0;
11304        }
11305    }
11306    if (!found)
11307        return unicode_result_unchanged(self);
11308
11309    /* Second pass: create output string and fill it */
11310    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11311    if (!u)
11312        return NULL;
11313    dest_data = PyUnicode_DATA(u);
11314
11315    i = j = line_pos = 0;
11316
11317    for (; i < src_len; i++) {
11318        ch = PyUnicode_READ(kind, src_data, i);
11319        if (ch == '\t') {
11320            if (tabsize > 0) {
11321                incr = tabsize - (line_pos % tabsize);
11322                line_pos += incr;
11323                FILL(kind, dest_data, ' ', j, incr);
11324                j += incr;
11325            }
11326        }
11327        else {
11328            line_pos++;
11329            PyUnicode_WRITE(kind, dest_data, j, ch);
11330            j++;
11331            if (ch == '\n' || ch == '\r')
11332                line_pos = 0;
11333        }
11334    }
11335    assert (j == PyUnicode_GET_LENGTH(u));
11336    return unicode_result(u);
11337
11338  overflow:
11339    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11340    return NULL;
11341}
11342
11343PyDoc_STRVAR(find__doc__,
11344             "S.find(sub[, start[, end]]) -> int\n\
11345\n\
11346Return the lowest index in S where substring sub is found,\n\
11347such that sub is contained within S[start:end].  Optional\n\
11348arguments start and end are interpreted as in slice notation.\n\
11349\n\
11350Return -1 on failure.");
11351
11352static PyObject *
11353unicode_find(PyObject *self, PyObject *args)
11354{
11355    PyObject *substring;
11356    Py_ssize_t start;
11357    Py_ssize_t end;
11358    Py_ssize_t result;
11359
11360    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11361                                            &start, &end))
11362        return NULL;
11363
11364    if (PyUnicode_READY(self) == -1) {
11365        Py_DECREF(substring);
11366        return NULL;
11367    }
11368    if (PyUnicode_READY(substring) == -1) {
11369        Py_DECREF(substring);
11370        return NULL;
11371    }
11372
11373    result = any_find_slice(1, self, substring, start, end);
11374
11375    Py_DECREF(substring);
11376
11377    if (result == -2)
11378        return NULL;
11379
11380    return PyLong_FromSsize_t(result);
11381}
11382
11383static PyObject *
11384unicode_getitem(PyObject *self, Py_ssize_t index)
11385{
11386    void *data;
11387    enum PyUnicode_Kind kind;
11388    Py_UCS4 ch;
11389
11390    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11391        PyErr_BadArgument();
11392        return NULL;
11393    }
11394    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11395        PyErr_SetString(PyExc_IndexError, "string index out of range");
11396        return NULL;
11397    }
11398    kind = PyUnicode_KIND(self);
11399    data = PyUnicode_DATA(self);
11400    ch = PyUnicode_READ(kind, data, index);
11401    return unicode_char(ch);
11402}
11403
11404/* Believe it or not, this produces the same value for ASCII strings
11405   as bytes_hash(). */
11406static Py_hash_t
11407unicode_hash(PyObject *self)
11408{
11409    Py_ssize_t len;
11410    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11411
11412#ifdef Py_DEBUG
11413    assert(_Py_HashSecret_Initialized);
11414#endif
11415    if (_PyUnicode_HASH(self) != -1)
11416        return _PyUnicode_HASH(self);
11417    if (PyUnicode_READY(self) == -1)
11418        return -1;
11419    len = PyUnicode_GET_LENGTH(self);
11420    /*
11421      We make the hash of the empty string be 0, rather than using
11422      (prefix ^ suffix), since this slightly obfuscates the hash secret
11423    */
11424    if (len == 0) {
11425        _PyUnicode_HASH(self) = 0;
11426        return 0;
11427    }
11428    x = _Py_HashBytes(PyUnicode_DATA(self),
11429                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11430    _PyUnicode_HASH(self) = x;
11431    return x;
11432}
11433
11434PyDoc_STRVAR(index__doc__,
11435             "S.index(sub[, start[, end]]) -> int\n\
11436\n\
11437Like S.find() but raise ValueError when the substring is not found.");
11438
11439static PyObject *
11440unicode_index(PyObject *self, PyObject *args)
11441{
11442    Py_ssize_t result;
11443    PyObject *substring;
11444    Py_ssize_t start;
11445    Py_ssize_t end;
11446
11447    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11448                                            &start, &end))
11449        return NULL;
11450
11451    if (PyUnicode_READY(self) == -1) {
11452        Py_DECREF(substring);
11453        return NULL;
11454    }
11455    if (PyUnicode_READY(substring) == -1) {
11456        Py_DECREF(substring);
11457        return NULL;
11458    }
11459
11460    result = any_find_slice(1, self, substring, start, end);
11461
11462    Py_DECREF(substring);
11463
11464    if (result == -2)
11465        return NULL;
11466
11467    if (result < 0) {
11468        PyErr_SetString(PyExc_ValueError, "substring not found");
11469        return NULL;
11470    }
11471
11472    return PyLong_FromSsize_t(result);
11473}
11474
11475PyDoc_STRVAR(islower__doc__,
11476             "S.islower() -> bool\n\
11477\n\
11478Return True if all cased characters in S are lowercase and there is\n\
11479at least one cased character in S, False otherwise.");
11480
11481static PyObject*
11482unicode_islower(PyObject *self)
11483{
11484    Py_ssize_t i, length;
11485    int kind;
11486    void *data;
11487    int cased;
11488
11489    if (PyUnicode_READY(self) == -1)
11490        return NULL;
11491    length = PyUnicode_GET_LENGTH(self);
11492    kind = PyUnicode_KIND(self);
11493    data = PyUnicode_DATA(self);
11494
11495    /* Shortcut for single character strings */
11496    if (length == 1)
11497        return PyBool_FromLong(
11498            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11499
11500    /* Special case for empty strings */
11501    if (length == 0)
11502        return PyBool_FromLong(0);
11503
11504    cased = 0;
11505    for (i = 0; i < length; i++) {
11506        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11507
11508        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11509            return PyBool_FromLong(0);
11510        else if (!cased && Py_UNICODE_ISLOWER(ch))
11511            cased = 1;
11512    }
11513    return PyBool_FromLong(cased);
11514}
11515
11516PyDoc_STRVAR(isupper__doc__,
11517             "S.isupper() -> bool\n\
11518\n\
11519Return True if all cased characters in S are uppercase and there is\n\
11520at least one cased character in S, False otherwise.");
11521
11522static PyObject*
11523unicode_isupper(PyObject *self)
11524{
11525    Py_ssize_t i, length;
11526    int kind;
11527    void *data;
11528    int cased;
11529
11530    if (PyUnicode_READY(self) == -1)
11531        return NULL;
11532    length = PyUnicode_GET_LENGTH(self);
11533    kind = PyUnicode_KIND(self);
11534    data = PyUnicode_DATA(self);
11535
11536    /* Shortcut for single character strings */
11537    if (length == 1)
11538        return PyBool_FromLong(
11539            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11540
11541    /* Special case for empty strings */
11542    if (length == 0)
11543        return PyBool_FromLong(0);
11544
11545    cased = 0;
11546    for (i = 0; i < length; i++) {
11547        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11548
11549        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11550            return PyBool_FromLong(0);
11551        else if (!cased && Py_UNICODE_ISUPPER(ch))
11552            cased = 1;
11553    }
11554    return PyBool_FromLong(cased);
11555}
11556
11557PyDoc_STRVAR(istitle__doc__,
11558             "S.istitle() -> bool\n\
11559\n\
11560Return True if S is a titlecased string and there is at least one\n\
11561character in S, i.e. upper- and titlecase characters may only\n\
11562follow uncased characters and lowercase characters only cased ones.\n\
11563Return False otherwise.");
11564
11565static PyObject*
11566unicode_istitle(PyObject *self)
11567{
11568    Py_ssize_t i, length;
11569    int kind;
11570    void *data;
11571    int cased, previous_is_cased;
11572
11573    if (PyUnicode_READY(self) == -1)
11574        return NULL;
11575    length = PyUnicode_GET_LENGTH(self);
11576    kind = PyUnicode_KIND(self);
11577    data = PyUnicode_DATA(self);
11578
11579    /* Shortcut for single character strings */
11580    if (length == 1) {
11581        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11582        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11583                               (Py_UNICODE_ISUPPER(ch) != 0));
11584    }
11585
11586    /* Special case for empty strings */
11587    if (length == 0)
11588        return PyBool_FromLong(0);
11589
11590    cased = 0;
11591    previous_is_cased = 0;
11592    for (i = 0; i < length; i++) {
11593        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11594
11595        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11596            if (previous_is_cased)
11597                return PyBool_FromLong(0);
11598            previous_is_cased = 1;
11599            cased = 1;
11600        }
11601        else if (Py_UNICODE_ISLOWER(ch)) {
11602            if (!previous_is_cased)
11603                return PyBool_FromLong(0);
11604            previous_is_cased = 1;
11605            cased = 1;
11606        }
11607        else
11608            previous_is_cased = 0;
11609    }
11610    return PyBool_FromLong(cased);
11611}
11612
11613PyDoc_STRVAR(isspace__doc__,
11614             "S.isspace() -> bool\n\
11615\n\
11616Return True if all characters in S are whitespace\n\
11617and there is at least one character in S, False otherwise.");
11618
11619static PyObject*
11620unicode_isspace(PyObject *self)
11621{
11622    Py_ssize_t i, length;
11623    int kind;
11624    void *data;
11625
11626    if (PyUnicode_READY(self) == -1)
11627        return NULL;
11628    length = PyUnicode_GET_LENGTH(self);
11629    kind = PyUnicode_KIND(self);
11630    data = PyUnicode_DATA(self);
11631
11632    /* Shortcut for single character strings */
11633    if (length == 1)
11634        return PyBool_FromLong(
11635            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11636
11637    /* Special case for empty strings */
11638    if (length == 0)
11639        return PyBool_FromLong(0);
11640
11641    for (i = 0; i < length; i++) {
11642        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11643        if (!Py_UNICODE_ISSPACE(ch))
11644            return PyBool_FromLong(0);
11645    }
11646    return PyBool_FromLong(1);
11647}
11648
11649PyDoc_STRVAR(isalpha__doc__,
11650             "S.isalpha() -> bool\n\
11651\n\
11652Return True if all characters in S are alphabetic\n\
11653and there is at least one character in S, False otherwise.");
11654
11655static PyObject*
11656unicode_isalpha(PyObject *self)
11657{
11658    Py_ssize_t i, length;
11659    int kind;
11660    void *data;
11661
11662    if (PyUnicode_READY(self) == -1)
11663        return NULL;
11664    length = PyUnicode_GET_LENGTH(self);
11665    kind = PyUnicode_KIND(self);
11666    data = PyUnicode_DATA(self);
11667
11668    /* Shortcut for single character strings */
11669    if (length == 1)
11670        return PyBool_FromLong(
11671            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11672
11673    /* Special case for empty strings */
11674    if (length == 0)
11675        return PyBool_FromLong(0);
11676
11677    for (i = 0; i < length; i++) {
11678        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11679            return PyBool_FromLong(0);
11680    }
11681    return PyBool_FromLong(1);
11682}
11683
11684PyDoc_STRVAR(isalnum__doc__,
11685             "S.isalnum() -> bool\n\
11686\n\
11687Return True if all characters in S are alphanumeric\n\
11688and there is at least one character in S, False otherwise.");
11689
11690static PyObject*
11691unicode_isalnum(PyObject *self)
11692{
11693    int kind;
11694    void *data;
11695    Py_ssize_t len, i;
11696
11697    if (PyUnicode_READY(self) == -1)
11698        return NULL;
11699
11700    kind = PyUnicode_KIND(self);
11701    data = PyUnicode_DATA(self);
11702    len = PyUnicode_GET_LENGTH(self);
11703
11704    /* Shortcut for single character strings */
11705    if (len == 1) {
11706        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11707        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11708    }
11709
11710    /* Special case for empty strings */
11711    if (len == 0)
11712        return PyBool_FromLong(0);
11713
11714    for (i = 0; i < len; i++) {
11715        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11716        if (!Py_UNICODE_ISALNUM(ch))
11717            return PyBool_FromLong(0);
11718    }
11719    return PyBool_FromLong(1);
11720}
11721
11722PyDoc_STRVAR(isdecimal__doc__,
11723             "S.isdecimal() -> bool\n\
11724\n\
11725Return True if there are only decimal characters in S,\n\
11726False otherwise.");
11727
11728static PyObject*
11729unicode_isdecimal(PyObject *self)
11730{
11731    Py_ssize_t i, length;
11732    int kind;
11733    void *data;
11734
11735    if (PyUnicode_READY(self) == -1)
11736        return NULL;
11737    length = PyUnicode_GET_LENGTH(self);
11738    kind = PyUnicode_KIND(self);
11739    data = PyUnicode_DATA(self);
11740
11741    /* Shortcut for single character strings */
11742    if (length == 1)
11743        return PyBool_FromLong(
11744            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11745
11746    /* Special case for empty strings */
11747    if (length == 0)
11748        return PyBool_FromLong(0);
11749
11750    for (i = 0; i < length; i++) {
11751        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11752            return PyBool_FromLong(0);
11753    }
11754    return PyBool_FromLong(1);
11755}
11756
11757PyDoc_STRVAR(isdigit__doc__,
11758             "S.isdigit() -> bool\n\
11759\n\
11760Return True if all characters in S are digits\n\
11761and there is at least one character in S, False otherwise.");
11762
11763static PyObject*
11764unicode_isdigit(PyObject *self)
11765{
11766    Py_ssize_t i, length;
11767    int kind;
11768    void *data;
11769
11770    if (PyUnicode_READY(self) == -1)
11771        return NULL;
11772    length = PyUnicode_GET_LENGTH(self);
11773    kind = PyUnicode_KIND(self);
11774    data = PyUnicode_DATA(self);
11775
11776    /* Shortcut for single character strings */
11777    if (length == 1) {
11778        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11779        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11780    }
11781
11782    /* Special case for empty strings */
11783    if (length == 0)
11784        return PyBool_FromLong(0);
11785
11786    for (i = 0; i < length; i++) {
11787        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11788            return PyBool_FromLong(0);
11789    }
11790    return PyBool_FromLong(1);
11791}
11792
11793PyDoc_STRVAR(isnumeric__doc__,
11794             "S.isnumeric() -> bool\n\
11795\n\
11796Return True if there are only numeric characters in S,\n\
11797False otherwise.");
11798
11799static PyObject*
11800unicode_isnumeric(PyObject *self)
11801{
11802    Py_ssize_t i, length;
11803    int kind;
11804    void *data;
11805
11806    if (PyUnicode_READY(self) == -1)
11807        return NULL;
11808    length = PyUnicode_GET_LENGTH(self);
11809    kind = PyUnicode_KIND(self);
11810    data = PyUnicode_DATA(self);
11811
11812    /* Shortcut for single character strings */
11813    if (length == 1)
11814        return PyBool_FromLong(
11815            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11816
11817    /* Special case for empty strings */
11818    if (length == 0)
11819        return PyBool_FromLong(0);
11820
11821    for (i = 0; i < length; i++) {
11822        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11823            return PyBool_FromLong(0);
11824    }
11825    return PyBool_FromLong(1);
11826}
11827
11828int
11829PyUnicode_IsIdentifier(PyObject *self)
11830{
11831    int kind;
11832    void *data;
11833    Py_ssize_t i;
11834    Py_UCS4 first;
11835
11836    if (PyUnicode_READY(self) == -1) {
11837        Py_FatalError("identifier not ready");
11838        return 0;
11839    }
11840
11841    /* Special case for empty strings */
11842    if (PyUnicode_GET_LENGTH(self) == 0)
11843        return 0;
11844    kind = PyUnicode_KIND(self);
11845    data = PyUnicode_DATA(self);
11846
11847    /* PEP 3131 says that the first character must be in
11848       XID_Start and subsequent characters in XID_Continue,
11849       and for the ASCII range, the 2.x rules apply (i.e
11850       start with letters and underscore, continue with
11851       letters, digits, underscore). However, given the current
11852       definition of XID_Start and XID_Continue, it is sufficient
11853       to check just for these, except that _ must be allowed
11854       as starting an identifier.  */
11855    first = PyUnicode_READ(kind, data, 0);
11856    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11857        return 0;
11858
11859    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11860        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11861            return 0;
11862    return 1;
11863}
11864
11865PyDoc_STRVAR(isidentifier__doc__,
11866             "S.isidentifier() -> bool\n\
11867\n\
11868Return True if S is a valid identifier according\n\
11869to the language definition.\n\
11870\n\
11871Use keyword.iskeyword() to test for reserved identifiers\n\
11872such as \"def\" and \"class\".\n");
11873
11874static PyObject*
11875unicode_isidentifier(PyObject *self)
11876{
11877    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11878}
11879
11880PyDoc_STRVAR(isprintable__doc__,
11881             "S.isprintable() -> bool\n\
11882\n\
11883Return True if all characters in S are considered\n\
11884printable in repr() or S is empty, False otherwise.");
11885
11886static PyObject*
11887unicode_isprintable(PyObject *self)
11888{
11889    Py_ssize_t i, length;
11890    int kind;
11891    void *data;
11892
11893    if (PyUnicode_READY(self) == -1)
11894        return NULL;
11895    length = PyUnicode_GET_LENGTH(self);
11896    kind = PyUnicode_KIND(self);
11897    data = PyUnicode_DATA(self);
11898
11899    /* Shortcut for single character strings */
11900    if (length == 1)
11901        return PyBool_FromLong(
11902            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11903
11904    for (i = 0; i < length; i++) {
11905        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11906            Py_RETURN_FALSE;
11907        }
11908    }
11909    Py_RETURN_TRUE;
11910}
11911
11912PyDoc_STRVAR(join__doc__,
11913             "S.join(iterable) -> str\n\
11914\n\
11915Return a string which is the concatenation of the strings in the\n\
11916iterable.  The separator between elements is S.");
11917
11918static PyObject*
11919unicode_join(PyObject *self, PyObject *data)
11920{
11921    return PyUnicode_Join(self, data);
11922}
11923
11924static Py_ssize_t
11925unicode_length(PyObject *self)
11926{
11927    if (PyUnicode_READY(self) == -1)
11928        return -1;
11929    return PyUnicode_GET_LENGTH(self);
11930}
11931
11932PyDoc_STRVAR(ljust__doc__,
11933             "S.ljust(width[, fillchar]) -> str\n\
11934\n\
11935Return S left-justified in a Unicode string of length width. Padding is\n\
11936done using the specified fill character (default is a space).");
11937
11938static PyObject *
11939unicode_ljust(PyObject *self, PyObject *args)
11940{
11941    Py_ssize_t width;
11942    Py_UCS4 fillchar = ' ';
11943
11944    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11945        return NULL;
11946
11947    if (PyUnicode_READY(self) == -1)
11948        return NULL;
11949
11950    if (PyUnicode_GET_LENGTH(self) >= width)
11951        return unicode_result_unchanged(self);
11952
11953    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11954}
11955
11956PyDoc_STRVAR(lower__doc__,
11957             "S.lower() -> str\n\
11958\n\
11959Return a copy of the string S converted to lowercase.");
11960
11961static PyObject*
11962unicode_lower(PyObject *self)
11963{
11964    if (PyUnicode_READY(self) == -1)
11965        return NULL;
11966    if (PyUnicode_IS_ASCII(self))
11967        return ascii_upper_or_lower(self, 1);
11968    return case_operation(self, do_lower);
11969}
11970
/* Strip modes accepted by _PyUnicode_XStrip(), do_strip() and
   do_argstrip().  The values double as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple() format strings, one per strip mode: a single
   optional object argument followed by the method name used in error
   messages. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Bare method name for strip mode i: skips the three-character "|O:"
   prefix of the matching format string. */
#define STRIPNAME(i) (stripformat[i]+3)
11979
11980/* externally visible for str.strip(unicode) */
11981PyObject *
11982_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11983{
11984    void *data;
11985    int kind;
11986    Py_ssize_t i, j, len;
11987    BLOOM_MASK sepmask;
11988    Py_ssize_t seplen;
11989
11990    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11991        return NULL;
11992
11993    kind = PyUnicode_KIND(self);
11994    data = PyUnicode_DATA(self);
11995    len = PyUnicode_GET_LENGTH(self);
11996    seplen = PyUnicode_GET_LENGTH(sepobj);
11997    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11998                              PyUnicode_DATA(sepobj),
11999                              seplen);
12000
12001    i = 0;
12002    if (striptype != RIGHTSTRIP) {
12003        while (i < len) {
12004            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12005            if (!BLOOM(sepmask, ch))
12006                break;
12007            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12008                break;
12009            i++;
12010        }
12011    }
12012
12013    j = len;
12014    if (striptype != LEFTSTRIP) {
12015        j--;
12016        while (j >= i) {
12017            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12018            if (!BLOOM(sepmask, ch))
12019                break;
12020            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12021                break;
12022            j--;
12023        }
12024
12025        j++;
12026    }
12027
12028    return PyUnicode_Substring(self, i, j);
12029}
12030
12031PyObject*
12032PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12033{
12034    unsigned char *data;
12035    int kind;
12036    Py_ssize_t length;
12037
12038    if (PyUnicode_READY(self) == -1)
12039        return NULL;
12040
12041    length = PyUnicode_GET_LENGTH(self);
12042    end = Py_MIN(end, length);
12043
12044    if (start == 0 && end == length)
12045        return unicode_result_unchanged(self);
12046
12047    if (start < 0 || end < 0) {
12048        PyErr_SetString(PyExc_IndexError, "string index out of range");
12049        return NULL;
12050    }
12051    if (start >= length || end < start)
12052        _Py_RETURN_UNICODE_EMPTY();
12053
12054    length = end - start;
12055    if (PyUnicode_IS_ASCII(self)) {
12056        data = PyUnicode_1BYTE_DATA(self);
12057        return _PyUnicode_FromASCII((char*)(data + start), length);
12058    }
12059    else {
12060        kind = PyUnicode_KIND(self);
12061        data = PyUnicode_1BYTE_DATA(self);
12062        return PyUnicode_FromKindAndData(kind,
12063                                         data + kind * start,
12064                                         length);
12065    }
12066}
12067
12068static PyObject *
12069do_strip(PyObject *self, int striptype)
12070{
12071    Py_ssize_t len, i, j;
12072
12073    if (PyUnicode_READY(self) == -1)
12074        return NULL;
12075
12076    len = PyUnicode_GET_LENGTH(self);
12077
12078    if (PyUnicode_IS_ASCII(self)) {
12079        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12080
12081        i = 0;
12082        if (striptype != RIGHTSTRIP) {
12083            while (i < len) {
12084                Py_UCS1 ch = data[i];
12085                if (!_Py_ascii_whitespace[ch])
12086                    break;
12087                i++;
12088            }
12089        }
12090
12091        j = len;
12092        if (striptype != LEFTSTRIP) {
12093            j--;
12094            while (j >= i) {
12095                Py_UCS1 ch = data[j];
12096                if (!_Py_ascii_whitespace[ch])
12097                    break;
12098                j--;
12099            }
12100            j++;
12101        }
12102    }
12103    else {
12104        int kind = PyUnicode_KIND(self);
12105        void *data = PyUnicode_DATA(self);
12106
12107        i = 0;
12108        if (striptype != RIGHTSTRIP) {
12109            while (i < len) {
12110                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12111                if (!Py_UNICODE_ISSPACE(ch))
12112                    break;
12113                i++;
12114            }
12115        }
12116
12117        j = len;
12118        if (striptype != LEFTSTRIP) {
12119            j--;
12120            while (j >= i) {
12121                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12122                if (!Py_UNICODE_ISSPACE(ch))
12123                    break;
12124                j--;
12125            }
12126            j++;
12127        }
12128    }
12129
12130    return PyUnicode_Substring(self, i, j);
12131}
12132
12133
12134static PyObject *
12135do_argstrip(PyObject *self, int striptype, PyObject *args)
12136{
12137    PyObject *sep = NULL;
12138
12139    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12140        return NULL;
12141
12142    if (sep != NULL && sep != Py_None) {
12143        if (PyUnicode_Check(sep))
12144            return _PyUnicode_XStrip(self, striptype, sep);
12145        else {
12146            PyErr_Format(PyExc_TypeError,
12147                         "%s arg must be None or str",
12148                         STRIPNAME(striptype));
12149            return NULL;
12150        }
12151    }
12152
12153    return do_strip(self, striptype);
12154}
12155
12156
12157PyDoc_STRVAR(strip__doc__,
12158             "S.strip([chars]) -> str\n\
12159\n\
12160Return a copy of the string S with leading and trailing\n\
12161whitespace removed.\n\
12162If chars is given and not None, remove characters in chars instead.");
12163
12164static PyObject *
12165unicode_strip(PyObject *self, PyObject *args)
12166{
12167    if (PyTuple_GET_SIZE(args) == 0)
12168        return do_strip(self, BOTHSTRIP); /* Common case */
12169    else
12170        return do_argstrip(self, BOTHSTRIP, args);
12171}
12172
12173
12174PyDoc_STRVAR(lstrip__doc__,
12175             "S.lstrip([chars]) -> str\n\
12176\n\
12177Return a copy of the string S with leading whitespace removed.\n\
12178If chars is given and not None, remove characters in chars instead.");
12179
12180static PyObject *
12181unicode_lstrip(PyObject *self, PyObject *args)
12182{
12183    if (PyTuple_GET_SIZE(args) == 0)
12184        return do_strip(self, LEFTSTRIP); /* Common case */
12185    else
12186        return do_argstrip(self, LEFTSTRIP, args);
12187}
12188
12189
12190PyDoc_STRVAR(rstrip__doc__,
12191             "S.rstrip([chars]) -> str\n\
12192\n\
12193Return a copy of the string S with trailing whitespace removed.\n\
12194If chars is given and not None, remove characters in chars instead.");
12195
12196static PyObject *
12197unicode_rstrip(PyObject *self, PyObject *args)
12198{
12199    if (PyTuple_GET_SIZE(args) == 0)
12200        return do_strip(self, RIGHTSTRIP); /* Common case */
12201    else
12202        return do_argstrip(self, RIGHTSTRIP, args);
12203}
12204
12205
12206static PyObject*
12207unicode_repeat(PyObject *str, Py_ssize_t len)
12208{
12209    PyObject *u;
12210    Py_ssize_t nchars, n;
12211
12212    if (len < 1)
12213        _Py_RETURN_UNICODE_EMPTY();
12214
12215    /* no repeat, return original string */
12216    if (len == 1)
12217        return unicode_result_unchanged(str);
12218
12219    if (PyUnicode_READY(str) == -1)
12220        return NULL;
12221
12222    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12223        PyErr_SetString(PyExc_OverflowError,
12224                        "repeated string is too long");
12225        return NULL;
12226    }
12227    nchars = len * PyUnicode_GET_LENGTH(str);
12228
12229    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12230    if (!u)
12231        return NULL;
12232    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12233
12234    if (PyUnicode_GET_LENGTH(str) == 1) {
12235        const int kind = PyUnicode_KIND(str);
12236        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12237        if (kind == PyUnicode_1BYTE_KIND) {
12238            void *to = PyUnicode_DATA(u);
12239            memset(to, (unsigned char)fill_char, len);
12240        }
12241        else if (kind == PyUnicode_2BYTE_KIND) {
12242            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12243            for (n = 0; n < len; ++n)
12244                ucs2[n] = fill_char;
12245        } else {
12246            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12247            assert(kind == PyUnicode_4BYTE_KIND);
12248            for (n = 0; n < len; ++n)
12249                ucs4[n] = fill_char;
12250        }
12251    }
12252    else {
12253        /* number of characters copied this far */
12254        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12255        const Py_ssize_t char_size = PyUnicode_KIND(str);
12256        char *to = (char *) PyUnicode_DATA(u);
12257        Py_MEMCPY(to, PyUnicode_DATA(str),
12258                  PyUnicode_GET_LENGTH(str) * char_size);
12259        while (done < nchars) {
12260            n = (done <= nchars-done) ? done : nchars-done;
12261            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12262            done += n;
12263        }
12264    }
12265
12266    assert(_PyUnicode_CheckConsistency(u, 1));
12267    return u;
12268}
12269
12270PyObject *
12271PyUnicode_Replace(PyObject *obj,
12272                  PyObject *subobj,
12273                  PyObject *replobj,
12274                  Py_ssize_t maxcount)
12275{
12276    PyObject *self;
12277    PyObject *str1;
12278    PyObject *str2;
12279    PyObject *result;
12280
12281    self = PyUnicode_FromObject(obj);
12282    if (self == NULL)
12283        return NULL;
12284    str1 = PyUnicode_FromObject(subobj);
12285    if (str1 == NULL) {
12286        Py_DECREF(self);
12287        return NULL;
12288    }
12289    str2 = PyUnicode_FromObject(replobj);
12290    if (str2 == NULL) {
12291        Py_DECREF(self);
12292        Py_DECREF(str1);
12293        return NULL;
12294    }
12295    if (PyUnicode_READY(self) == -1 ||
12296        PyUnicode_READY(str1) == -1 ||
12297        PyUnicode_READY(str2) == -1)
12298        result = NULL;
12299    else
12300        result = replace(self, str1, str2, maxcount);
12301    Py_DECREF(self);
12302    Py_DECREF(str1);
12303    Py_DECREF(str2);
12304    return result;
12305}
12306
12307PyDoc_STRVAR(replace__doc__,
12308             "S.replace(old, new[, count]) -> str\n\
12309\n\
12310Return a copy of S with all occurrences of substring\n\
12311old replaced by new.  If the optional argument count is\n\
12312given, only the first count occurrences are replaced.");
12313
12314static PyObject*
12315unicode_replace(PyObject *self, PyObject *args)
12316{
12317    PyObject *str1;
12318    PyObject *str2;
12319    Py_ssize_t maxcount = -1;
12320    PyObject *result;
12321
12322    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12323        return NULL;
12324    if (PyUnicode_READY(self) == -1)
12325        return NULL;
12326    str1 = PyUnicode_FromObject(str1);
12327    if (str1 == NULL)
12328        return NULL;
12329    str2 = PyUnicode_FromObject(str2);
12330    if (str2 == NULL) {
12331        Py_DECREF(str1);
12332        return NULL;
12333    }
12334    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12335        result = NULL;
12336    else
12337        result = replace(self, str1, str2, maxcount);
12338
12339    Py_DECREF(str1);
12340    Py_DECREF(str2);
12341    return result;
12342}
12343
12344static PyObject *
12345unicode_repr(PyObject *unicode)
12346{
12347    PyObject *repr;
12348    Py_ssize_t isize;
12349    Py_ssize_t osize, squote, dquote, i, o;
12350    Py_UCS4 max, quote;
12351    int ikind, okind, unchanged;
12352    void *idata, *odata;
12353
12354    if (PyUnicode_READY(unicode) == -1)
12355        return NULL;
12356
12357    isize = PyUnicode_GET_LENGTH(unicode);
12358    idata = PyUnicode_DATA(unicode);
12359
12360    /* Compute length of output, quote characters, and
12361       maximum character */
12362    osize = 0;
12363    max = 127;
12364    squote = dquote = 0;
12365    ikind = PyUnicode_KIND(unicode);
12366    for (i = 0; i < isize; i++) {
12367        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12368        Py_ssize_t incr = 1;
12369        switch (ch) {
12370        case '\'': squote++; break;
12371        case '"':  dquote++; break;
12372        case '\\': case '\t': case '\r': case '\n':
12373            incr = 2;
12374            break;
12375        default:
12376            /* Fast-path ASCII */
12377            if (ch < ' ' || ch == 0x7f)
12378                incr = 4; /* \xHH */
12379            else if (ch < 0x7f)
12380                ;
12381            else if (Py_UNICODE_ISPRINTABLE(ch))
12382                max = ch > max ? ch : max;
12383            else if (ch < 0x100)
12384                incr = 4; /* \xHH */
12385            else if (ch < 0x10000)
12386                incr = 6; /* \uHHHH */
12387            else
12388                incr = 10; /* \uHHHHHHHH */
12389        }
12390        if (osize > PY_SSIZE_T_MAX - incr) {
12391            PyErr_SetString(PyExc_OverflowError,
12392                            "string is too long to generate repr");
12393            return NULL;
12394        }
12395        osize += incr;
12396    }
12397
12398    quote = '\'';
12399    unchanged = (osize == isize);
12400    if (squote) {
12401        unchanged = 0;
12402        if (dquote)
12403            /* Both squote and dquote present. Use squote,
12404               and escape them */
12405            osize += squote;
12406        else
12407            quote = '"';
12408    }
12409    osize += 2;   /* quotes */
12410
12411    repr = PyUnicode_New(osize, max);
12412    if (repr == NULL)
12413        return NULL;
12414    okind = PyUnicode_KIND(repr);
12415    odata = PyUnicode_DATA(repr);
12416
12417    PyUnicode_WRITE(okind, odata, 0, quote);
12418    PyUnicode_WRITE(okind, odata, osize-1, quote);
12419    if (unchanged) {
12420        _PyUnicode_FastCopyCharacters(repr, 1,
12421                                      unicode, 0,
12422                                      isize);
12423    }
12424    else {
12425        for (i = 0, o = 1; i < isize; i++) {
12426            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12427
12428            /* Escape quotes and backslashes */
12429            if ((ch == quote) || (ch == '\\')) {
12430                PyUnicode_WRITE(okind, odata, o++, '\\');
12431                PyUnicode_WRITE(okind, odata, o++, ch);
12432                continue;
12433            }
12434
12435            /* Map special whitespace to '\t', \n', '\r' */
12436            if (ch == '\t') {
12437                PyUnicode_WRITE(okind, odata, o++, '\\');
12438                PyUnicode_WRITE(okind, odata, o++, 't');
12439            }
12440            else if (ch == '\n') {
12441                PyUnicode_WRITE(okind, odata, o++, '\\');
12442                PyUnicode_WRITE(okind, odata, o++, 'n');
12443            }
12444            else if (ch == '\r') {
12445                PyUnicode_WRITE(okind, odata, o++, '\\');
12446                PyUnicode_WRITE(okind, odata, o++, 'r');
12447            }
12448
12449            /* Map non-printable US ASCII to '\xhh' */
12450            else if (ch < ' ' || ch == 0x7F) {
12451                PyUnicode_WRITE(okind, odata, o++, '\\');
12452                PyUnicode_WRITE(okind, odata, o++, 'x');
12453                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12454                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12455            }
12456
12457            /* Copy ASCII characters as-is */
12458            else if (ch < 0x7F) {
12459                PyUnicode_WRITE(okind, odata, o++, ch);
12460            }
12461
12462            /* Non-ASCII characters */
12463            else {
12464                /* Map Unicode whitespace and control characters
12465                   (categories Z* and C* except ASCII space)
12466                */
12467                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12468                    PyUnicode_WRITE(okind, odata, o++, '\\');
12469                    /* Map 8-bit characters to '\xhh' */
12470                    if (ch <= 0xff) {
12471                        PyUnicode_WRITE(okind, odata, o++, 'x');
12472                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12473                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12474                    }
12475                    /* Map 16-bit characters to '\uxxxx' */
12476                    else if (ch <= 0xffff) {
12477                        PyUnicode_WRITE(okind, odata, o++, 'u');
12478                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12479                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12480                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12481                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12482                    }
12483                    /* Map 21-bit characters to '\U00xxxxxx' */
12484                    else {
12485                        PyUnicode_WRITE(okind, odata, o++, 'U');
12486                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12487                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12488                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12489                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12490                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12491                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12492                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12493                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12494                    }
12495                }
12496                /* Copy characters as-is */
12497                else {
12498                    PyUnicode_WRITE(okind, odata, o++, ch);
12499                }
12500            }
12501        }
12502    }
12503    /* Closing quote already added at the beginning */
12504    assert(_PyUnicode_CheckConsistency(repr, 1));
12505    return repr;
12506}
12507
12508PyDoc_STRVAR(rfind__doc__,
12509             "S.rfind(sub[, start[, end]]) -> int\n\
12510\n\
12511Return the highest index in S where substring sub is found,\n\
12512such that sub is contained within S[start:end].  Optional\n\
12513arguments start and end are interpreted as in slice notation.\n\
12514\n\
12515Return -1 on failure.");
12516
12517static PyObject *
12518unicode_rfind(PyObject *self, PyObject *args)
12519{
12520    PyObject *substring;
12521    Py_ssize_t start;
12522    Py_ssize_t end;
12523    Py_ssize_t result;
12524
12525    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12526                                            &start, &end))
12527        return NULL;
12528
12529    if (PyUnicode_READY(self) == -1) {
12530        Py_DECREF(substring);
12531        return NULL;
12532    }
12533    if (PyUnicode_READY(substring) == -1) {
12534        Py_DECREF(substring);
12535        return NULL;
12536    }
12537
12538    result = any_find_slice(-1, self, substring, start, end);
12539
12540    Py_DECREF(substring);
12541
12542    if (result == -2)
12543        return NULL;
12544
12545    return PyLong_FromSsize_t(result);
12546}
12547
12548PyDoc_STRVAR(rindex__doc__,
12549             "S.rindex(sub[, start[, end]]) -> int\n\
12550\n\
12551Like S.rfind() but raise ValueError when the substring is not found.");
12552
12553static PyObject *
12554unicode_rindex(PyObject *self, PyObject *args)
12555{
12556    PyObject *substring;
12557    Py_ssize_t start;
12558    Py_ssize_t end;
12559    Py_ssize_t result;
12560
12561    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12562                                            &start, &end))
12563        return NULL;
12564
12565    if (PyUnicode_READY(self) == -1) {
12566        Py_DECREF(substring);
12567        return NULL;
12568    }
12569    if (PyUnicode_READY(substring) == -1) {
12570        Py_DECREF(substring);
12571        return NULL;
12572    }
12573
12574    result = any_find_slice(-1, self, substring, start, end);
12575
12576    Py_DECREF(substring);
12577
12578    if (result == -2)
12579        return NULL;
12580
12581    if (result < 0) {
12582        PyErr_SetString(PyExc_ValueError, "substring not found");
12583        return NULL;
12584    }
12585
12586    return PyLong_FromSsize_t(result);
12587}
12588
12589PyDoc_STRVAR(rjust__doc__,
12590             "S.rjust(width[, fillchar]) -> str\n\
12591\n\
12592Return S right-justified in a string of length width. Padding is\n\
12593done using the specified fill character (default is a space).");
12594
12595static PyObject *
12596unicode_rjust(PyObject *self, PyObject *args)
12597{
12598    Py_ssize_t width;
12599    Py_UCS4 fillchar = ' ';
12600
12601    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12602        return NULL;
12603
12604    if (PyUnicode_READY(self) == -1)
12605        return NULL;
12606
12607    if (PyUnicode_GET_LENGTH(self) >= width)
12608        return unicode_result_unchanged(self);
12609
12610    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12611}
12612
12613PyObject *
12614PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12615{
12616    PyObject *result;
12617
12618    s = PyUnicode_FromObject(s);
12619    if (s == NULL)
12620        return NULL;
12621    if (sep != NULL) {
12622        sep = PyUnicode_FromObject(sep);
12623        if (sep == NULL) {
12624            Py_DECREF(s);
12625            return NULL;
12626        }
12627    }
12628
12629    result = split(s, sep, maxsplit);
12630
12631    Py_DECREF(s);
12632    Py_XDECREF(sep);
12633    return result;
12634}
12635
12636PyDoc_STRVAR(split__doc__,
12637             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12638\n\
12639Return a list of the words in S, using sep as the\n\
12640delimiter string.  If maxsplit is given, at most maxsplit\n\
12641splits are done. If sep is not specified or is None, any\n\
12642whitespace string is a separator and empty strings are\n\
12643removed from the result.");
12644
12645static PyObject*
12646unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12647{
12648    static char *kwlist[] = {"sep", "maxsplit", 0};
12649    PyObject *substring = Py_None;
12650    Py_ssize_t maxcount = -1;
12651
12652    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12653                                     kwlist, &substring, &maxcount))
12654        return NULL;
12655
12656    if (substring == Py_None)
12657        return split(self, NULL, maxcount);
12658    else if (PyUnicode_Check(substring))
12659        return split(self, substring, maxcount);
12660    else
12661        return PyUnicode_Split(self, substring, maxcount);
12662}
12663
12664PyObject *
12665PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12666{
12667    PyObject* str_obj;
12668    PyObject* sep_obj;
12669    PyObject* out;
12670    int kind1, kind2, kind;
12671    void *buf1 = NULL, *buf2 = NULL;
12672    Py_ssize_t len1, len2;
12673
12674    str_obj = PyUnicode_FromObject(str_in);
12675    if (!str_obj)
12676        return NULL;
12677    sep_obj = PyUnicode_FromObject(sep_in);
12678    if (!sep_obj) {
12679        Py_DECREF(str_obj);
12680        return NULL;
12681    }
12682    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12683        Py_DECREF(sep_obj);
12684        Py_DECREF(str_obj);
12685        return NULL;
12686    }
12687
12688    kind1 = PyUnicode_KIND(str_obj);
12689    kind2 = PyUnicode_KIND(sep_obj);
12690    kind = Py_MAX(kind1, kind2);
12691    buf1 = PyUnicode_DATA(str_obj);
12692    if (kind1 != kind)
12693        buf1 = _PyUnicode_AsKind(str_obj, kind);
12694    if (!buf1)
12695        goto onError;
12696    buf2 = PyUnicode_DATA(sep_obj);
12697    if (kind2 != kind)
12698        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12699    if (!buf2)
12700        goto onError;
12701    len1 = PyUnicode_GET_LENGTH(str_obj);
12702    len2 = PyUnicode_GET_LENGTH(sep_obj);
12703
12704    switch (PyUnicode_KIND(str_obj)) {
12705    case PyUnicode_1BYTE_KIND:
12706        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12707            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12708        else
12709            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12710        break;
12711    case PyUnicode_2BYTE_KIND:
12712        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12713        break;
12714    case PyUnicode_4BYTE_KIND:
12715        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12716        break;
12717    default:
12718        assert(0);
12719        out = 0;
12720    }
12721
12722    Py_DECREF(sep_obj);
12723    Py_DECREF(str_obj);
12724    if (kind1 != kind)
12725        PyMem_Free(buf1);
12726    if (kind2 != kind)
12727        PyMem_Free(buf2);
12728
12729    return out;
12730  onError:
12731    Py_DECREF(sep_obj);
12732    Py_DECREF(str_obj);
12733    if (kind1 != kind && buf1)
12734        PyMem_Free(buf1);
12735    if (kind2 != kind && buf2)
12736        PyMem_Free(buf2);
12737    return NULL;
12738}
12739
12740
12741PyObject *
12742PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12743{
12744    PyObject* str_obj;
12745    PyObject* sep_obj;
12746    PyObject* out;
12747    int kind1, kind2, kind;
12748    void *buf1 = NULL, *buf2 = NULL;
12749    Py_ssize_t len1, len2;
12750
12751    str_obj = PyUnicode_FromObject(str_in);
12752    if (!str_obj)
12753        return NULL;
12754    sep_obj = PyUnicode_FromObject(sep_in);
12755    if (!sep_obj) {
12756        Py_DECREF(str_obj);
12757        return NULL;
12758    }
12759
12760    kind1 = PyUnicode_KIND(str_in);
12761    kind2 = PyUnicode_KIND(sep_obj);
12762    kind = Py_MAX(kind1, kind2);
12763    buf1 = PyUnicode_DATA(str_in);
12764    if (kind1 != kind)
12765        buf1 = _PyUnicode_AsKind(str_in, kind);
12766    if (!buf1)
12767        goto onError;
12768    buf2 = PyUnicode_DATA(sep_obj);
12769    if (kind2 != kind)
12770        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12771    if (!buf2)
12772        goto onError;
12773    len1 = PyUnicode_GET_LENGTH(str_obj);
12774    len2 = PyUnicode_GET_LENGTH(sep_obj);
12775
12776    switch (PyUnicode_KIND(str_in)) {
12777    case PyUnicode_1BYTE_KIND:
12778        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12779            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12780        else
12781            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12782        break;
12783    case PyUnicode_2BYTE_KIND:
12784        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12785        break;
12786    case PyUnicode_4BYTE_KIND:
12787        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12788        break;
12789    default:
12790        assert(0);
12791        out = 0;
12792    }
12793
12794    Py_DECREF(sep_obj);
12795    Py_DECREF(str_obj);
12796    if (kind1 != kind)
12797        PyMem_Free(buf1);
12798    if (kind2 != kind)
12799        PyMem_Free(buf2);
12800
12801    return out;
12802  onError:
12803    Py_DECREF(sep_obj);
12804    Py_DECREF(str_obj);
12805    if (kind1 != kind && buf1)
12806        PyMem_Free(buf1);
12807    if (kind2 != kind && buf2)
12808        PyMem_Free(buf2);
12809    return NULL;
12810}
12811
12812PyDoc_STRVAR(partition__doc__,
12813             "S.partition(sep) -> (head, sep, tail)\n\
12814\n\
12815Search for the separator sep in S, and return the part before it,\n\
12816the separator itself, and the part after it.  If the separator is not\n\
12817found, return S and two empty strings.");
12818
12819static PyObject*
12820unicode_partition(PyObject *self, PyObject *separator)
12821{
12822    return PyUnicode_Partition(self, separator);
12823}
12824
12825PyDoc_STRVAR(rpartition__doc__,
12826             "S.rpartition(sep) -> (head, sep, tail)\n\
12827\n\
12828Search for the separator sep in S, starting at the end of S, and return\n\
12829the part before it, the separator itself, and the part after it.  If the\n\
12830separator is not found, return two empty strings and S.");
12831
12832static PyObject*
12833unicode_rpartition(PyObject *self, PyObject *separator)
12834{
12835    return PyUnicode_RPartition(self, separator);
12836}
12837
12838PyObject *
12839PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12840{
12841    PyObject *result;
12842
12843    s = PyUnicode_FromObject(s);
12844    if (s == NULL)
12845        return NULL;
12846    if (sep != NULL) {
12847        sep = PyUnicode_FromObject(sep);
12848        if (sep == NULL) {
12849            Py_DECREF(s);
12850            return NULL;
12851        }
12852    }
12853
12854    result = rsplit(s, sep, maxsplit);
12855
12856    Py_DECREF(s);
12857    Py_XDECREF(sep);
12858    return result;
12859}
12860
12861PyDoc_STRVAR(rsplit__doc__,
12862             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12863\n\
12864Return a list of the words in S, using sep as the\n\
12865delimiter string, starting at the end of the string and\n\
12866working to the front.  If maxsplit is given, at most maxsplit\n\
12867splits are done. If sep is not specified, any whitespace string\n\
12868is a separator.");
12869
12870static PyObject*
12871unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12872{
12873    static char *kwlist[] = {"sep", "maxsplit", 0};
12874    PyObject *substring = Py_None;
12875    Py_ssize_t maxcount = -1;
12876
12877    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12878                                     kwlist, &substring, &maxcount))
12879        return NULL;
12880
12881    if (substring == Py_None)
12882        return rsplit(self, NULL, maxcount);
12883    else if (PyUnicode_Check(substring))
12884        return rsplit(self, substring, maxcount);
12885    else
12886        return PyUnicode_RSplit(self, substring, maxcount);
12887}
12888
12889PyDoc_STRVAR(splitlines__doc__,
12890             "S.splitlines([keepends]) -> list of strings\n\
12891\n\
12892Return a list of the lines in S, breaking at line boundaries.\n\
12893Line breaks are not included in the resulting list unless keepends\n\
12894is given and true.");
12895
12896static PyObject*
12897unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12898{
12899    static char *kwlist[] = {"keepends", 0};
12900    int keepends = 0;
12901
12902    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12903                                     kwlist, &keepends))
12904        return NULL;
12905
12906    return PyUnicode_Splitlines(self, keepends);
12907}
12908
12909static
12910PyObject *unicode_str(PyObject *self)
12911{
12912    return unicode_result_unchanged(self);
12913}
12914
12915PyDoc_STRVAR(swapcase__doc__,
12916             "S.swapcase() -> str\n\
12917\n\
12918Return a copy of S with uppercase characters converted to lowercase\n\
12919and vice versa.");
12920
12921static PyObject*
12922unicode_swapcase(PyObject *self)
12923{
12924    if (PyUnicode_READY(self) == -1)
12925        return NULL;
12926    return case_operation(self, do_swapcase);
12927}
12928
/* NOTE(review): everything from the "[clinic input]" marker through the
   "[clinic end generated code ...]" marker further down is delimited by
   clinic markers that carry output/input checksums, so it appears to be
   tool-managed.  Change the input section and regenerate instead of
   hand-editing the docstring, method-def macro or argument parser below. */
/*[clinic input]

@staticmethod
str.maketrans as unicode_maketrans

  x: object

  y: unicode=NULL

  z: unicode=NULL

  /

Return a translation table usable for str.translate().

If there is only one argument, it must be a dictionary mapping Unicode
ordinals (integers) or characters to Unicode ordinals, strings or None.
Character keys will be then converted to ordinals.
If there are two arguments, they must be strings of equal length, and
in the resulting dictionary, each character in x will be mapped to the
character at the same position in y. If there is a third argument, it
must be a string, whose characters will be mapped to None in the result.
[clinic start generated code]*/

PyDoc_STRVAR(unicode_maketrans__doc__,
"maketrans(x, y=None, z=None, /)\n"
"--\n"
"\n"
"Return a translation table usable for str.translate().\n"
"\n"
"If there is only one argument, it must be a dictionary mapping Unicode\n"
"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
"Character keys will be then converted to ordinals.\n"
"If there are two arguments, they must be strings of equal length, and\n"
"in the resulting dictionary, each character in x will be mapped to the\n"
"character at the same position in y. If there is a third argument, it\n"
"must be a string, whose characters will be mapped to None in the result.");

#define UNICODE_MAKETRANS_METHODDEF    \
    {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},

static PyObject *
unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);

static PyObject *
unicode_maketrans(void *null, PyObject *args)
{
    PyObject *return_value = NULL;
    PyObject *x;
    PyObject *y = NULL;
    PyObject *z = NULL;

    if (!PyArg_ParseTuple(args,
        "O|UU:maketrans",
        &x, &y, &z))
        goto exit;
    return_value = unicode_maketrans_impl(x, y, z);

exit:
    return return_value;
}
12990
12991static PyObject *
12992unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12993/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
12994{
12995    PyObject *new = NULL, *key, *value;
12996    Py_ssize_t i = 0;
12997    int res;
12998
12999    new = PyDict_New();
13000    if (!new)
13001        return NULL;
13002    if (y != NULL) {
13003        int x_kind, y_kind, z_kind;
13004        void *x_data, *y_data, *z_data;
13005
13006        /* x must be a string too, of equal length */
13007        if (!PyUnicode_Check(x)) {
13008            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13009                            "be a string if there is a second argument");
13010            goto err;
13011        }
13012        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13013            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13014                            "arguments must have equal length");
13015            goto err;
13016        }
13017        /* create entries for translating chars in x to those in y */
13018        x_kind = PyUnicode_KIND(x);
13019        y_kind = PyUnicode_KIND(y);
13020        x_data = PyUnicode_DATA(x);
13021        y_data = PyUnicode_DATA(y);
13022        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13023            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13024            if (!key)
13025                goto err;
13026            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13027            if (!value) {
13028                Py_DECREF(key);
13029                goto err;
13030            }
13031            res = PyDict_SetItem(new, key, value);
13032            Py_DECREF(key);
13033            Py_DECREF(value);
13034            if (res < 0)
13035                goto err;
13036        }
13037        /* create entries for deleting chars in z */
13038        if (z != NULL) {
13039            z_kind = PyUnicode_KIND(z);
13040            z_data = PyUnicode_DATA(z);
13041            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13042                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13043                if (!key)
13044                    goto err;
13045                res = PyDict_SetItem(new, key, Py_None);
13046                Py_DECREF(key);
13047                if (res < 0)
13048                    goto err;
13049            }
13050        }
13051    } else {
13052        int kind;
13053        void *data;
13054
13055        /* x must be a dict */
13056        if (!PyDict_CheckExact(x)) {
13057            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13058                            "to maketrans it must be a dict");
13059            goto err;
13060        }
13061        /* copy entries into the new dict, converting string keys to int keys */
13062        while (PyDict_Next(x, &i, &key, &value)) {
13063            if (PyUnicode_Check(key)) {
13064                /* convert string keys to integer keys */
13065                PyObject *newkey;
13066                if (PyUnicode_GET_LENGTH(key) != 1) {
13067                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13068                                    "table must be of length 1");
13069                    goto err;
13070                }
13071                kind = PyUnicode_KIND(key);
13072                data = PyUnicode_DATA(key);
13073                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13074                if (!newkey)
13075                    goto err;
13076                res = PyDict_SetItem(new, newkey, value);
13077                Py_DECREF(newkey);
13078                if (res < 0)
13079                    goto err;
13080            } else if (PyLong_Check(key)) {
13081                /* just keep integer keys */
13082                if (PyDict_SetItem(new, key, value) < 0)
13083                    goto err;
13084            } else {
13085                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13086                                "be strings or integers");
13087                goto err;
13088            }
13089        }
13090    }
13091    return new;
13092  err:
13093    Py_DECREF(new);
13094    return NULL;
13095}
13096
13097PyDoc_STRVAR(translate__doc__,
13098             "S.translate(table) -> str\n\
13099\n\
13100Return a copy of the string S, where all characters have been mapped\n\
13101through the given translation table, which must be a mapping of\n\
13102Unicode ordinals to Unicode ordinals, strings, or None.\n\
13103Unmapped characters are left untouched. Characters mapped to None\n\
13104are deleted.");
13105
13106static PyObject*
13107unicode_translate(PyObject *self, PyObject *table)
13108{
13109    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13110}
13111
13112PyDoc_STRVAR(upper__doc__,
13113             "S.upper() -> str\n\
13114\n\
13115Return a copy of S converted to uppercase.");
13116
13117static PyObject*
13118unicode_upper(PyObject *self)
13119{
13120    if (PyUnicode_READY(self) == -1)
13121        return NULL;
13122    if (PyUnicode_IS_ASCII(self))
13123        return ascii_upper_or_lower(self, 0);
13124    return case_operation(self, do_upper);
13125}
13126
13127PyDoc_STRVAR(zfill__doc__,
13128             "S.zfill(width) -> str\n\
13129\n\
13130Pad a numeric string S with zeros on the left, to fill a field\n\
13131of the specified width. The string S is never truncated.");
13132
13133static PyObject *
13134unicode_zfill(PyObject *self, PyObject *args)
13135{
13136    Py_ssize_t fill;
13137    PyObject *u;
13138    Py_ssize_t width;
13139    int kind;
13140    void *data;
13141    Py_UCS4 chr;
13142
13143    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13144        return NULL;
13145
13146    if (PyUnicode_READY(self) == -1)
13147        return NULL;
13148
13149    if (PyUnicode_GET_LENGTH(self) >= width)
13150        return unicode_result_unchanged(self);
13151
13152    fill = width - PyUnicode_GET_LENGTH(self);
13153
13154    u = pad(self, fill, 0, '0');
13155
13156    if (u == NULL)
13157        return NULL;
13158
13159    kind = PyUnicode_KIND(u);
13160    data = PyUnicode_DATA(u);
13161    chr = PyUnicode_READ(kind, data, fill);
13162
13163    if (chr == '+' || chr == '-') {
13164        /* move sign to beginning of string */
13165        PyUnicode_WRITE(kind, data, 0, chr);
13166        PyUnicode_WRITE(kind, data, fill, '0');
13167    }
13168
13169    assert(_PyUnicode_CheckConsistency(u, 1));
13170    return u;
13171}
13172
/* Compiled out by the "#if 0" guard: a wrapper that would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
#if 0
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalAndSpaceToASCII(self);
}
#endif
13180
13181PyDoc_STRVAR(startswith__doc__,
13182             "S.startswith(prefix[, start[, end]]) -> bool\n\
13183\n\
13184Return True if S starts with the specified prefix, False otherwise.\n\
13185With optional start, test S beginning at that position.\n\
13186With optional end, stop comparing S at that position.\n\
13187prefix can also be a tuple of strings to try.");
13188
13189static PyObject *
13190unicode_startswith(PyObject *self,
13191                   PyObject *args)
13192{
13193    PyObject *subobj;
13194    PyObject *substring;
13195    Py_ssize_t start = 0;
13196    Py_ssize_t end = PY_SSIZE_T_MAX;
13197    int result;
13198
13199    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13200        return NULL;
13201    if (PyTuple_Check(subobj)) {
13202        Py_ssize_t i;
13203        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13204            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13205            if (substring == NULL)
13206                return NULL;
13207            result = tailmatch(self, substring, start, end, -1);
13208            Py_DECREF(substring);
13209            if (result == -1)
13210                return NULL;
13211            if (result) {
13212                Py_RETURN_TRUE;
13213            }
13214        }
13215        /* nothing matched */
13216        Py_RETURN_FALSE;
13217    }
13218    substring = PyUnicode_FromObject(subobj);
13219    if (substring == NULL) {
13220        if (PyErr_ExceptionMatches(PyExc_TypeError))
13221            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13222                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13223        return NULL;
13224    }
13225    result = tailmatch(self, substring, start, end, -1);
13226    Py_DECREF(substring);
13227    if (result == -1)
13228        return NULL;
13229    return PyBool_FromLong(result);
13230}
13231
13232
13233PyDoc_STRVAR(endswith__doc__,
13234             "S.endswith(suffix[, start[, end]]) -> bool\n\
13235\n\
13236Return True if S ends with the specified suffix, False otherwise.\n\
13237With optional start, test S beginning at that position.\n\
13238With optional end, stop comparing S at that position.\n\
13239suffix can also be a tuple of strings to try.");
13240
13241static PyObject *
13242unicode_endswith(PyObject *self,
13243                 PyObject *args)
13244{
13245    PyObject *subobj;
13246    PyObject *substring;
13247    Py_ssize_t start = 0;
13248    Py_ssize_t end = PY_SSIZE_T_MAX;
13249    int result;
13250
13251    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13252        return NULL;
13253    if (PyTuple_Check(subobj)) {
13254        Py_ssize_t i;
13255        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13256            substring = PyUnicode_FromObject(
13257                PyTuple_GET_ITEM(subobj, i));
13258            if (substring == NULL)
13259                return NULL;
13260            result = tailmatch(self, substring, start, end, +1);
13261            Py_DECREF(substring);
13262            if (result == -1)
13263                return NULL;
13264            if (result) {
13265                Py_RETURN_TRUE;
13266            }
13267        }
13268        Py_RETURN_FALSE;
13269    }
13270    substring = PyUnicode_FromObject(subobj);
13271    if (substring == NULL) {
13272        if (PyErr_ExceptionMatches(PyExc_TypeError))
13273            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13274                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13275        return NULL;
13276    }
13277    result = tailmatch(self, substring, start, end, +1);
13278    Py_DECREF(substring);
13279    if (result == -1)
13280        return NULL;
13281    return PyBool_FromLong(result);
13282}
13283
13284Py_LOCAL_INLINE(void)
13285_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13286{
13287    if (!writer->readonly)
13288        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13289    else {
13290        /* Copy-on-write mode: set buffer size to 0 so
13291         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13292         * next write. */
13293        writer->size = 0;
13294    }
13295    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13296    writer->data = PyUnicode_DATA(writer->buffer);
13297    writer->kind = PyUnicode_KIND(writer->buffer);
13298}
13299
13300void
13301_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13302{
13303    memset(writer, 0, sizeof(*writer));
13304#ifdef Py_DEBUG
13305    writer->kind = 5;    /* invalid kind */
13306#endif
13307    writer->min_char = 127;
13308}
13309
13310int
13311_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13312                                 Py_ssize_t length, Py_UCS4 maxchar)
13313{
13314#ifdef MS_WINDOWS
13315   /* On Windows, overallocate by 50% is the best factor */
13316#  define OVERALLOCATE_FACTOR 2
13317#else
13318   /* On Linux, overallocate by 25% is the best factor */
13319#  define OVERALLOCATE_FACTOR 4
13320#endif
13321    Py_ssize_t newlen;
13322    PyObject *newbuffer;
13323
13324    assert(length > 0);
13325
13326    if (length > PY_SSIZE_T_MAX - writer->pos) {
13327        PyErr_NoMemory();
13328        return -1;
13329    }
13330    newlen = writer->pos + length;
13331
13332    maxchar = Py_MAX(maxchar, writer->min_char);
13333
13334    if (writer->buffer == NULL) {
13335        assert(!writer->readonly);
13336        if (writer->overallocate
13337            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13338            /* overallocate to limit the number of realloc() */
13339            newlen += newlen / OVERALLOCATE_FACTOR;
13340        }
13341        if (newlen < writer->min_length)
13342            newlen = writer->min_length;
13343
13344        writer->buffer = PyUnicode_New(newlen, maxchar);
13345        if (writer->buffer == NULL)
13346            return -1;
13347    }
13348    else if (newlen > writer->size) {
13349        if (writer->overallocate
13350            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13351            /* overallocate to limit the number of realloc() */
13352            newlen += newlen / OVERALLOCATE_FACTOR;
13353        }
13354        if (newlen < writer->min_length)
13355            newlen = writer->min_length;
13356
13357        if (maxchar > writer->maxchar || writer->readonly) {
13358            /* resize + widen */
13359            newbuffer = PyUnicode_New(newlen, maxchar);
13360            if (newbuffer == NULL)
13361                return -1;
13362            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13363                                          writer->buffer, 0, writer->pos);
13364            Py_DECREF(writer->buffer);
13365            writer->readonly = 0;
13366        }
13367        else {
13368            newbuffer = resize_compact(writer->buffer, newlen);
13369            if (newbuffer == NULL)
13370                return -1;
13371        }
13372        writer->buffer = newbuffer;
13373    }
13374    else if (maxchar > writer->maxchar) {
13375        assert(!writer->readonly);
13376        newbuffer = PyUnicode_New(writer->size, maxchar);
13377        if (newbuffer == NULL)
13378            return -1;
13379        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13380                                      writer->buffer, 0, writer->pos);
13381        Py_DECREF(writer->buffer);
13382        writer->buffer = newbuffer;
13383    }
13384    _PyUnicodeWriter_Update(writer);
13385    return 0;
13386
13387#undef OVERALLOCATE_FACTOR
13388}
13389
13390Py_LOCAL_INLINE(int)
13391_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13392{
13393    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13394        return -1;
13395    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13396    writer->pos++;
13397    return 0;
13398}
13399
13400int
13401_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13402{
13403    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13404}
13405
13406int
13407_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13408{
13409    Py_UCS4 maxchar;
13410    Py_ssize_t len;
13411
13412    if (PyUnicode_READY(str) == -1)
13413        return -1;
13414    len = PyUnicode_GET_LENGTH(str);
13415    if (len == 0)
13416        return 0;
13417    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13418    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13419        if (writer->buffer == NULL && !writer->overallocate) {
13420            writer->readonly = 1;
13421            Py_INCREF(str);
13422            writer->buffer = str;
13423            _PyUnicodeWriter_Update(writer);
13424            writer->pos += len;
13425            return 0;
13426        }
13427        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13428            return -1;
13429    }
13430    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13431                                  str, 0, len);
13432    writer->pos += len;
13433    return 0;
13434}
13435
13436int
13437_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13438                                Py_ssize_t start, Py_ssize_t end)
13439{
13440    Py_UCS4 maxchar;
13441    Py_ssize_t len;
13442
13443    if (PyUnicode_READY(str) == -1)
13444        return -1;
13445
13446    assert(0 <= start);
13447    assert(end <= PyUnicode_GET_LENGTH(str));
13448    assert(start <= end);
13449
13450    if (end == 0)
13451        return 0;
13452
13453    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13454        return _PyUnicodeWriter_WriteStr(writer, str);
13455
13456    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13457        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13458    else
13459        maxchar = writer->maxchar;
13460    len = end - start;
13461
13462    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13463        return -1;
13464
13465    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13466                                  str, start, len);
13467    writer->pos += len;
13468    return 0;
13469}
13470
13471int
13472_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13473                                  const char *ascii, Py_ssize_t len)
13474{
13475    if (len == -1)
13476        len = strlen(ascii);
13477
13478    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13479
13480    if (writer->buffer == NULL && !writer->overallocate) {
13481        PyObject *str;
13482
13483        str = _PyUnicode_FromASCII(ascii, len);
13484        if (str == NULL)
13485            return -1;
13486
13487        writer->readonly = 1;
13488        writer->buffer = str;
13489        _PyUnicodeWriter_Update(writer);
13490        writer->pos += len;
13491        return 0;
13492    }
13493
13494    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13495        return -1;
13496
13497    switch (writer->kind)
13498    {
13499    case PyUnicode_1BYTE_KIND:
13500    {
13501        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13502        Py_UCS1 *data = writer->data;
13503
13504        Py_MEMCPY(data + writer->pos, str, len);
13505        break;
13506    }
13507    case PyUnicode_2BYTE_KIND:
13508    {
13509        _PyUnicode_CONVERT_BYTES(
13510            Py_UCS1, Py_UCS2,
13511            ascii, ascii + len,
13512            (Py_UCS2 *)writer->data + writer->pos);
13513        break;
13514    }
13515    case PyUnicode_4BYTE_KIND:
13516    {
13517        _PyUnicode_CONVERT_BYTES(
13518            Py_UCS1, Py_UCS4,
13519            ascii, ascii + len,
13520            (Py_UCS4 *)writer->data + writer->pos);
13521        break;
13522    }
13523    default:
13524        assert(0);
13525    }
13526
13527    writer->pos += len;
13528    return 0;
13529}
13530
13531int
13532_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13533                                   const char *str, Py_ssize_t len)
13534{
13535    Py_UCS4 maxchar;
13536
13537    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13538    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13539        return -1;
13540    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13541    writer->pos += len;
13542    return 0;
13543}
13544
13545PyObject *
13546_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13547{
13548    PyObject *str;
13549    if (writer->pos == 0) {
13550        Py_CLEAR(writer->buffer);
13551        _Py_RETURN_UNICODE_EMPTY();
13552    }
13553    if (writer->readonly) {
13554        str = writer->buffer;
13555        writer->buffer = NULL;
13556        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13557        return str;
13558    }
13559    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13560        PyObject *newbuffer;
13561        newbuffer = resize_compact(writer->buffer, writer->pos);
13562        if (newbuffer == NULL) {
13563            Py_CLEAR(writer->buffer);
13564            return NULL;
13565        }
13566        writer->buffer = newbuffer;
13567    }
13568    str = writer->buffer;
13569    writer->buffer = NULL;
13570    assert(_PyUnicode_CheckConsistency(str, 1));
13571    return unicode_result_ready(str);
13572}
13573
13574void
13575_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13576{
13577    Py_CLEAR(writer->buffer);
13578}
13579
13580#include "stringlib/unicode_format.h"
13581
13582PyDoc_STRVAR(format__doc__,
13583             "S.format(*args, **kwargs) -> str\n\
13584\n\
13585Return a formatted version of S, using substitutions from args and kwargs.\n\
13586The substitutions are identified by braces ('{' and '}').");
13587
13588PyDoc_STRVAR(format_map__doc__,
13589             "S.format_map(mapping) -> str\n\
13590\n\
13591Return a formatted version of S, using substitutions from mapping.\n\
13592The substitutions are identified by braces ('{' and '}').");
13593
13594static PyObject *
13595unicode__format__(PyObject* self, PyObject* args)
13596{
13597    PyObject *format_spec;
13598    _PyUnicodeWriter writer;
13599    int ret;
13600
13601    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13602        return NULL;
13603
13604    if (PyUnicode_READY(self) == -1)
13605        return NULL;
13606    _PyUnicodeWriter_Init(&writer);
13607    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13608                                          self, format_spec, 0,
13609                                          PyUnicode_GET_LENGTH(format_spec));
13610    if (ret == -1) {
13611        _PyUnicodeWriter_Dealloc(&writer);
13612        return NULL;
13613    }
13614    return _PyUnicodeWriter_Finish(&writer);
13615}
13616
13617PyDoc_STRVAR(p_format__doc__,
13618             "S.__format__(format_spec) -> str\n\
13619\n\
13620Return a formatted version of S as described by format_spec.");
13621
13622static PyObject *
13623unicode__sizeof__(PyObject *v)
13624{
13625    Py_ssize_t size;
13626
13627    /* If it's a compact object, account for base structure +
13628       character data. */
13629    if (PyUnicode_IS_COMPACT_ASCII(v))
13630        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13631    else if (PyUnicode_IS_COMPACT(v))
13632        size = sizeof(PyCompactUnicodeObject) +
13633            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13634    else {
13635        /* If it is a two-block object, account for base object, and
13636           for character block if present. */
13637        size = sizeof(PyUnicodeObject);
13638        if (_PyUnicode_DATA_ANY(v))
13639            size += (PyUnicode_GET_LENGTH(v) + 1) *
13640                PyUnicode_KIND(v);
13641    }
13642    /* If the wstr pointer is present, account for it unless it is shared
13643       with the data pointer. Check if the data is not shared. */
13644    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13645        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13646    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13647        size += PyUnicode_UTF8_LENGTH(v) + 1;
13648
13649    return PyLong_FromSsize_t(size);
13650}
13651
13652PyDoc_STRVAR(sizeof__doc__,
13653             "S.__sizeof__() -> size of S in memory, in bytes");
13654
13655static PyObject *
13656unicode_getnewargs(PyObject *v)
13657{
13658    PyObject *copy = _PyUnicode_Copy(v);
13659    if (!copy)
13660        return NULL;
13661    return Py_BuildValue("(N)", copy);
13662}
13663
13664static PyMethodDef unicode_methods[] = {
13665    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13666    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13667    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13668    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13669    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13670    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13671    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13672    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13673    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13674    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13675    {"expandtabs", (PyCFunction) unicode_expandtabs,
13676     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13677    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13678    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13679    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13680    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13681    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13682    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13683    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13684    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13685    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13686    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13687    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13688    {"splitlines", (PyCFunction) unicode_splitlines,
13689     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13690    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13691    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13692    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13693    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13694    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13695    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13696    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13697    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13698    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13699    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13700    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13701    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13702    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13703    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13704    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13705    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13706    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13707    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13708    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13709    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13710    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13711    UNICODE_MAKETRANS_METHODDEF
13712    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13713#if 0
13714    /* These methods are just used for debugging the implementation. */
13715    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13716#endif
13717
13718    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13719    {NULL, NULL}
13720};
13721
13722static PyObject *
13723unicode_mod(PyObject *v, PyObject *w)
13724{
13725    if (!PyUnicode_Check(v))
13726        Py_RETURN_NOTIMPLEMENTED;
13727    return PyUnicode_Format(v, w);
13728}
13729
13730static PyNumberMethods unicode_as_number = {
13731    0,              /*nb_add*/
13732    0,              /*nb_subtract*/
13733    0,              /*nb_multiply*/
13734    unicode_mod,            /*nb_remainder*/
13735};
13736
13737static PySequenceMethods unicode_as_sequence = {
13738    (lenfunc) unicode_length,       /* sq_length */
13739    PyUnicode_Concat,           /* sq_concat */
13740    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13741    (ssizeargfunc) unicode_getitem,     /* sq_item */
13742    0,                  /* sq_slice */
13743    0,                  /* sq_ass_item */
13744    0,                  /* sq_ass_slice */
13745    PyUnicode_Contains,         /* sq_contains */
13746};
13747
13748static PyObject*
13749unicode_subscript(PyObject* self, PyObject* item)
13750{
13751    if (PyUnicode_READY(self) == -1)
13752        return NULL;
13753
13754    if (PyIndex_Check(item)) {
13755        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13756        if (i == -1 && PyErr_Occurred())
13757            return NULL;
13758        if (i < 0)
13759            i += PyUnicode_GET_LENGTH(self);
13760        return unicode_getitem(self, i);
13761    } else if (PySlice_Check(item)) {
13762        Py_ssize_t start, stop, step, slicelength, cur, i;
13763        PyObject *result;
13764        void *src_data, *dest_data;
13765        int src_kind, dest_kind;
13766        Py_UCS4 ch, max_char, kind_limit;
13767
13768        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13769                                 &start, &stop, &step, &slicelength) < 0) {
13770            return NULL;
13771        }
13772
13773        if (slicelength <= 0) {
13774            _Py_RETURN_UNICODE_EMPTY();
13775        } else if (start == 0 && step == 1 &&
13776                   slicelength == PyUnicode_GET_LENGTH(self)) {
13777            return unicode_result_unchanged(self);
13778        } else if (step == 1) {
13779            return PyUnicode_Substring(self,
13780                                       start, start + slicelength);
13781        }
13782        /* General case */
13783        src_kind = PyUnicode_KIND(self);
13784        src_data = PyUnicode_DATA(self);
13785        if (!PyUnicode_IS_ASCII(self)) {
13786            kind_limit = kind_maxchar_limit(src_kind);
13787            max_char = 0;
13788            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13789                ch = PyUnicode_READ(src_kind, src_data, cur);
13790                if (ch > max_char) {
13791                    max_char = ch;
13792                    if (max_char >= kind_limit)
13793                        break;
13794                }
13795            }
13796        }
13797        else
13798            max_char = 127;
13799        result = PyUnicode_New(slicelength, max_char);
13800        if (result == NULL)
13801            return NULL;
13802        dest_kind = PyUnicode_KIND(result);
13803        dest_data = PyUnicode_DATA(result);
13804
13805        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13806            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13807            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13808        }
13809        assert(_PyUnicode_CheckConsistency(result, 1));
13810        return result;
13811    } else {
13812        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13813        return NULL;
13814    }
13815}
13816
13817static PyMappingMethods unicode_as_mapping = {
13818    (lenfunc)unicode_length,        /* mp_length */
13819    (binaryfunc)unicode_subscript,  /* mp_subscript */
13820    (objobjargproc)0,           /* mp_ass_subscript */
13821};
13822
13823
13824/* Helpers for PyUnicode_Format() */
13825
13826struct unicode_formatter_t {
13827    PyObject *args;
13828    int args_owned;
13829    Py_ssize_t arglen, argidx;
13830    PyObject *dict;
13831
13832    enum PyUnicode_Kind fmtkind;
13833    Py_ssize_t fmtcnt, fmtpos;
13834    void *fmtdata;
13835    PyObject *fmtstr;
13836
13837    _PyUnicodeWriter writer;
13838};
13839
13840struct unicode_format_arg_t {
13841    Py_UCS4 ch;
13842    int flags;
13843    Py_ssize_t width;
13844    int prec;
13845    int sign;
13846};
13847
13848static PyObject *
13849unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13850{
13851    Py_ssize_t argidx = ctx->argidx;
13852
13853    if (argidx < ctx->arglen) {
13854        ctx->argidx++;
13855        if (ctx->arglen < 0)
13856            return ctx->args;
13857        else
13858            return PyTuple_GetItem(ctx->args, argidx);
13859    }
13860    PyErr_SetString(PyExc_TypeError,
13861                    "not enough arguments for format string");
13862    return NULL;
13863}
13864
13865/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13866
13867/* Format a float into the writer if the writer is not NULL, or into *p_output
13868   otherwise.
13869
13870   Return 0 on success, raise an exception and return -1 on error. */
13871static int
13872formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13873            PyObject **p_output,
13874            _PyUnicodeWriter *writer)
13875{
13876    char *p;
13877    double x;
13878    Py_ssize_t len;
13879    int prec;
13880    int dtoa_flags;
13881
13882    x = PyFloat_AsDouble(v);
13883    if (x == -1.0 && PyErr_Occurred())
13884        return -1;
13885
13886    prec = arg->prec;
13887    if (prec < 0)
13888        prec = 6;
13889
13890    if (arg->flags & F_ALT)
13891        dtoa_flags = Py_DTSF_ALT;
13892    else
13893        dtoa_flags = 0;
13894    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13895    if (p == NULL)
13896        return -1;
13897    len = strlen(p);
13898    if (writer) {
13899        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13900            PyMem_Free(p);
13901            return -1;
13902        }
13903    }
13904    else
13905        *p_output = _PyUnicode_FromASCII(p, len);
13906    PyMem_Free(p);
13907    return 0;
13908}
13909
13910/* formatlong() emulates the format codes d, u, o, x and X, and
13911 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13912 * Python's regular ints.
13913 * Return value:  a new PyUnicodeObject*, or NULL if error.
13914 *     The output string is of the form
13915 *         "-"? ("0x" | "0X")? digit+
13916 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13917 *         set in flags.  The case of hex digits will be correct,
13918 *     There will be at least prec digits, zero-filled on the left if
13919 *         necessary to get that many.
13920 * val          object to be converted
13921 * flags        bitmask of format flags; only F_ALT is looked at
13922 * prec         minimum number of digits; 0-fill on left if needed
13923 * type         a character in [duoxX]; u acts the same as d
13924 *
13925 * CAUTION:  o, x and X conversions on regular ints can never
13926 * produce a '-' sign, but can for Python's unbounded ints.
13927 */
13928static PyObject*
13929formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13930{
13931    PyObject *result = NULL;
13932    char *buf;
13933    Py_ssize_t i;
13934    int sign;           /* 1 if '-', else 0 */
13935    int len;            /* number of characters */
13936    Py_ssize_t llen;
13937    int numdigits;      /* len == numnondigits + numdigits */
13938    int numnondigits = 0;
13939    int prec = arg->prec;
13940    int type = arg->ch;
13941
13942    /* Avoid exceeding SSIZE_T_MAX */
13943    if (prec > INT_MAX-3) {
13944        PyErr_SetString(PyExc_OverflowError,
13945                        "precision too large");
13946        return NULL;
13947    }
13948
13949    assert(PyLong_Check(val));
13950
13951    switch (type) {
13952    default:
13953        assert(!"'type' not in [diuoxX]");
13954    case 'd':
13955    case 'i':
13956    case 'u':
13957        /* int and int subclasses should print numerically when a numeric */
13958        /* format code is used (see issue18780) */
13959        result = PyNumber_ToBase(val, 10);
13960        break;
13961    case 'o':
13962        numnondigits = 2;
13963        result = PyNumber_ToBase(val, 8);
13964        break;
13965    case 'x':
13966    case 'X':
13967        numnondigits = 2;
13968        result = PyNumber_ToBase(val, 16);
13969        break;
13970    }
13971    if (!result)
13972        return NULL;
13973
13974    assert(unicode_modifiable(result));
13975    assert(PyUnicode_IS_READY(result));
13976    assert(PyUnicode_IS_ASCII(result));
13977
13978    /* To modify the string in-place, there can only be one reference. */
13979    if (Py_REFCNT(result) != 1) {
13980        Py_DECREF(result);
13981        PyErr_BadInternalCall();
13982        return NULL;
13983    }
13984    buf = PyUnicode_DATA(result);
13985    llen = PyUnicode_GET_LENGTH(result);
13986    if (llen > INT_MAX) {
13987        Py_DECREF(result);
13988        PyErr_SetString(PyExc_ValueError,
13989                        "string too large in _PyBytes_FormatLong");
13990        return NULL;
13991    }
13992    len = (int)llen;
13993    sign = buf[0] == '-';
13994    numnondigits += sign;
13995    numdigits = len - numnondigits;
13996    assert(numdigits > 0);
13997
13998    /* Get rid of base marker unless F_ALT */
13999    if (((arg->flags & F_ALT) == 0 &&
14000        (type == 'o' || type == 'x' || type == 'X'))) {
14001        assert(buf[sign] == '0');
14002        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14003               buf[sign+1] == 'o');
14004        numnondigits -= 2;
14005        buf += 2;
14006        len -= 2;
14007        if (sign)
14008            buf[0] = '-';
14009        assert(len == numnondigits + numdigits);
14010        assert(numdigits > 0);
14011    }
14012
14013    /* Fill with leading zeroes to meet minimum width. */
14014    if (prec > numdigits) {
14015        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14016                                numnondigits + prec);
14017        char *b1;
14018        if (!r1) {
14019            Py_DECREF(result);
14020            return NULL;
14021        }
14022        b1 = PyBytes_AS_STRING(r1);
14023        for (i = 0; i < numnondigits; ++i)
14024            *b1++ = *buf++;
14025        for (i = 0; i < prec - numdigits; i++)
14026            *b1++ = '0';
14027        for (i = 0; i < numdigits; i++)
14028            *b1++ = *buf++;
14029        *b1 = '\0';
14030        Py_DECREF(result);
14031        result = r1;
14032        buf = PyBytes_AS_STRING(result);
14033        len = numnondigits + prec;
14034    }
14035
14036    /* Fix up case for hex conversions. */
14037    if (type == 'X') {
14038        /* Need to convert all lower case letters to upper case.
14039           and need to convert 0x to 0X (and -0x to -0X). */
14040        for (i = 0; i < len; i++)
14041            if (buf[i] >= 'a' && buf[i] <= 'x')
14042                buf[i] -= 'a'-'A';
14043    }
14044    if (!PyUnicode_Check(result)
14045        || buf != PyUnicode_DATA(result)) {
14046        PyObject *unicode;
14047        unicode = _PyUnicode_FromASCII(buf, len);
14048        Py_DECREF(result);
14049        result = unicode;
14050    }
14051    else if (len != PyUnicode_GET_LENGTH(result)) {
14052        if (PyUnicode_Resize(&result, len) < 0)
14053            Py_CLEAR(result);
14054    }
14055    return result;
14056}
14057
14058/* Format an integer or a float as an integer.
14059 * Return 1 if the number has been formatted into the writer,
14060 *        0 if the number has been formatted into *p_output
14061 *       -1 and raise an exception on error */
14062static int
14063mainformatlong(PyObject *v,
14064               struct unicode_format_arg_t *arg,
14065               PyObject **p_output,
14066               _PyUnicodeWriter *writer)
14067{
14068    PyObject *iobj, *res;
14069    char type = (char)arg->ch;
14070
14071    if (!PyNumber_Check(v))
14072        goto wrongtype;
14073
14074    /* make sure number is a type of integer for o, x, and X */
14075    if (!PyLong_Check(v)) {
14076        if (type == 'o' || type == 'x' || type == 'X') {
14077            iobj = PyNumber_Index(v);
14078            if (iobj == NULL) {
14079                if (PyErr_ExceptionMatches(PyExc_TypeError))
14080                    goto wrongtype;
14081                return -1;
14082            }
14083        }
14084        else {
14085            iobj = PyNumber_Long(v);
14086            if (iobj == NULL ) {
14087                if (PyErr_ExceptionMatches(PyExc_TypeError))
14088                    goto wrongtype;
14089                return -1;
14090            }
14091        }
14092        assert(PyLong_Check(iobj));
14093    }
14094    else {
14095        iobj = v;
14096        Py_INCREF(iobj);
14097    }
14098
14099    if (PyLong_CheckExact(v)
14100        && arg->width == -1 && arg->prec == -1
14101        && !(arg->flags & (F_SIGN | F_BLANK))
14102        && type != 'X')
14103    {
14104        /* Fast path */
14105        int alternate = arg->flags & F_ALT;
14106        int base;
14107
14108        switch(type)
14109        {
14110            default:
14111                assert(0 && "'type' not in [diuoxX]");
14112            case 'd':
14113            case 'i':
14114            case 'u':
14115                base = 10;
14116                break;
14117            case 'o':
14118                base = 8;
14119                break;
14120            case 'x':
14121            case 'X':
14122                base = 16;
14123                break;
14124        }
14125
14126        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14127            Py_DECREF(iobj);
14128            return -1;
14129        }
14130        Py_DECREF(iobj);
14131        return 1;
14132    }
14133
14134    res = formatlong(iobj, arg);
14135    Py_DECREF(iobj);
14136    if (res == NULL)
14137        return -1;
14138    *p_output = res;
14139    return 0;
14140
14141wrongtype:
14142    switch(type)
14143    {
14144        case 'o':
14145        case 'x':
14146        case 'X':
14147            PyErr_Format(PyExc_TypeError,
14148                    "%%%c format: an integer is required, "
14149                    "not %.200s",
14150                    type, Py_TYPE(v)->tp_name);
14151            break;
14152        default:
14153            PyErr_Format(PyExc_TypeError,
14154                    "%%%c format: a number is required, "
14155                    "not %.200s",
14156                    type, Py_TYPE(v)->tp_name);
14157            break;
14158    }
14159    return -1;
14160}
14161
14162static Py_UCS4
14163formatchar(PyObject *v)
14164{
14165    /* presume that the buffer is at least 3 characters long */
14166    if (PyUnicode_Check(v)) {
14167        if (PyUnicode_GET_LENGTH(v) == 1) {
14168            return PyUnicode_READ_CHAR(v, 0);
14169        }
14170        goto onError;
14171    }
14172    else {
14173        PyObject *iobj;
14174        long x;
14175        /* make sure number is a type of integer */
14176        if (!PyLong_Check(v)) {
14177            iobj = PyNumber_Index(v);
14178            if (iobj == NULL) {
14179                goto onError;
14180            }
14181            v = iobj;
14182            Py_DECREF(iobj);
14183        }
14184        /* Integer input truncated to a character */
14185        x = PyLong_AsLong(v);
14186        if (x == -1 && PyErr_Occurred())
14187            goto onError;
14188
14189        if (x < 0 || x > MAX_UNICODE) {
14190            PyErr_SetString(PyExc_OverflowError,
14191                            "%c arg not in range(0x110000)");
14192            return (Py_UCS4) -1;
14193        }
14194
14195        return (Py_UCS4) x;
14196    }
14197
14198  onError:
14199    PyErr_SetString(PyExc_TypeError,
14200                    "%c requires int or char");
14201    return (Py_UCS4) -1;
14202}
14203
14204/* Parse options of an argument: flags, width, precision.
14205   Handle also "%(name)" syntax.
14206
14207   Return 0 if the argument has been formatted into arg->str.
14208   Return 1 if the argument has been written into ctx->writer,
14209   Raise an exception and return -1 on error. */
14210static int
14211unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14212                         struct unicode_format_arg_t *arg)
14213{
14214#define FORMAT_READ(ctx) \
14215        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14216
14217    PyObject *v;
14218
14219    if (arg->ch == '(') {
14220        /* Get argument value from a dictionary. Example: "%(name)s". */
14221        Py_ssize_t keystart;
14222        Py_ssize_t keylen;
14223        PyObject *key;
14224        int pcount = 1;
14225
14226        if (ctx->dict == NULL) {
14227            PyErr_SetString(PyExc_TypeError,
14228                            "format requires a mapping");
14229            return -1;
14230        }
14231        ++ctx->fmtpos;
14232        --ctx->fmtcnt;
14233        keystart = ctx->fmtpos;
14234        /* Skip over balanced parentheses */
14235        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14236            arg->ch = FORMAT_READ(ctx);
14237            if (arg->ch == ')')
14238                --pcount;
14239            else if (arg->ch == '(')
14240                ++pcount;
14241            ctx->fmtpos++;
14242        }
14243        keylen = ctx->fmtpos - keystart - 1;
14244        if (ctx->fmtcnt < 0 || pcount > 0) {
14245            PyErr_SetString(PyExc_ValueError,
14246                            "incomplete format key");
14247            return -1;
14248        }
14249        key = PyUnicode_Substring(ctx->fmtstr,
14250                                  keystart, keystart + keylen);
14251        if (key == NULL)
14252            return -1;
14253        if (ctx->args_owned) {
14254            Py_DECREF(ctx->args);
14255            ctx->args_owned = 0;
14256        }
14257        ctx->args = PyObject_GetItem(ctx->dict, key);
14258        Py_DECREF(key);
14259        if (ctx->args == NULL)
14260            return -1;
14261        ctx->args_owned = 1;
14262        ctx->arglen = -1;
14263        ctx->argidx = -2;
14264    }
14265
14266    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14267    while (--ctx->fmtcnt >= 0) {
14268        arg->ch = FORMAT_READ(ctx);
14269        ctx->fmtpos++;
14270        switch (arg->ch) {
14271        case '-': arg->flags |= F_LJUST; continue;
14272        case '+': arg->flags |= F_SIGN; continue;
14273        case ' ': arg->flags |= F_BLANK; continue;
14274        case '#': arg->flags |= F_ALT; continue;
14275        case '0': arg->flags |= F_ZERO; continue;
14276        }
14277        break;
14278    }
14279
14280    /* Parse width. Example: "%10s" => width=10 */
14281    if (arg->ch == '*') {
14282        v = unicode_format_getnextarg(ctx);
14283        if (v == NULL)
14284            return -1;
14285        if (!PyLong_Check(v)) {
14286            PyErr_SetString(PyExc_TypeError,
14287                            "* wants int");
14288            return -1;
14289        }
14290        arg->width = PyLong_AsSsize_t(v);
14291        if (arg->width == -1 && PyErr_Occurred())
14292            return -1;
14293        if (arg->width < 0) {
14294            arg->flags |= F_LJUST;
14295            arg->width = -arg->width;
14296        }
14297        if (--ctx->fmtcnt >= 0) {
14298            arg->ch = FORMAT_READ(ctx);
14299            ctx->fmtpos++;
14300        }
14301    }
14302    else if (arg->ch >= '0' && arg->ch <= '9') {
14303        arg->width = arg->ch - '0';
14304        while (--ctx->fmtcnt >= 0) {
14305            arg->ch = FORMAT_READ(ctx);
14306            ctx->fmtpos++;
14307            if (arg->ch < '0' || arg->ch > '9')
14308                break;
14309            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14310               mixing signed and unsigned comparison. Since arg->ch is between
14311               '0' and '9', casting to int is safe. */
14312            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14313                PyErr_SetString(PyExc_ValueError,
14314                                "width too big");
14315                return -1;
14316            }
14317            arg->width = arg->width*10 + (arg->ch - '0');
14318        }
14319    }
14320
14321    /* Parse precision. Example: "%.3f" => prec=3 */
14322    if (arg->ch == '.') {
14323        arg->prec = 0;
14324        if (--ctx->fmtcnt >= 0) {
14325            arg->ch = FORMAT_READ(ctx);
14326            ctx->fmtpos++;
14327        }
14328        if (arg->ch == '*') {
14329            v = unicode_format_getnextarg(ctx);
14330            if (v == NULL)
14331                return -1;
14332            if (!PyLong_Check(v)) {
14333                PyErr_SetString(PyExc_TypeError,
14334                                "* wants int");
14335                return -1;
14336            }
14337            arg->prec = _PyLong_AsInt(v);
14338            if (arg->prec == -1 && PyErr_Occurred())
14339                return -1;
14340            if (arg->prec < 0)
14341                arg->prec = 0;
14342            if (--ctx->fmtcnt >= 0) {
14343                arg->ch = FORMAT_READ(ctx);
14344                ctx->fmtpos++;
14345            }
14346        }
14347        else if (arg->ch >= '0' && arg->ch <= '9') {
14348            arg->prec = arg->ch - '0';
14349            while (--ctx->fmtcnt >= 0) {
14350                arg->ch = FORMAT_READ(ctx);
14351                ctx->fmtpos++;
14352                if (arg->ch < '0' || arg->ch > '9')
14353                    break;
14354                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14355                    PyErr_SetString(PyExc_ValueError,
14356                                    "precision too big");
14357                    return -1;
14358                }
14359                arg->prec = arg->prec*10 + (arg->ch - '0');
14360            }
14361        }
14362    }
14363
14364    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14365    if (ctx->fmtcnt >= 0) {
14366        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14367            if (--ctx->fmtcnt >= 0) {
14368                arg->ch = FORMAT_READ(ctx);
14369                ctx->fmtpos++;
14370            }
14371        }
14372    }
14373    if (ctx->fmtcnt < 0) {
14374        PyErr_SetString(PyExc_ValueError,
14375                        "incomplete format");
14376        return -1;
14377    }
14378    return 0;
14379
14380#undef FORMAT_READ
14381}
14382
14383/* Format one argument. Supported conversion specifiers:
14384
14385   - "s", "r", "a": any type
14386   - "i", "d", "u": int or float
14387   - "o", "x", "X": int
14388   - "e", "E", "f", "F", "g", "G": float
14389   - "c": int or str (1 character)
14390
14391   When possible, the output is written directly into the Unicode writer
14392   (ctx->writer). A string is created when padding is required.
14393
14394   Return 0 if the argument has been formatted into *p_str,
14395          1 if the argument has been written into ctx->writer,
14396         -1 on error. */
14397static int
14398unicode_format_arg_format(struct unicode_formatter_t *ctx,
14399                          struct unicode_format_arg_t *arg,
14400                          PyObject **p_str)
14401{
14402    PyObject *v;
14403    _PyUnicodeWriter *writer = &ctx->writer;
14404
14405    if (ctx->fmtcnt == 0)
14406        ctx->writer.overallocate = 0;
14407
14408    if (arg->ch == '%') {
14409        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14410            return -1;
14411        return 1;
14412    }
14413
14414    v = unicode_format_getnextarg(ctx);
14415    if (v == NULL)
14416        return -1;
14417
14418
14419    switch (arg->ch) {
14420    case 's':
14421    case 'r':
14422    case 'a':
14423        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14424            /* Fast path */
14425            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14426                return -1;
14427            return 1;
14428        }
14429
14430        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14431            *p_str = v;
14432            Py_INCREF(*p_str);
14433        }
14434        else {
14435            if (arg->ch == 's')
14436                *p_str = PyObject_Str(v);
14437            else if (arg->ch == 'r')
14438                *p_str = PyObject_Repr(v);
14439            else
14440                *p_str = PyObject_ASCII(v);
14441        }
14442        break;
14443
14444    case 'i':
14445    case 'd':
14446    case 'u':
14447    case 'o':
14448    case 'x':
14449    case 'X':
14450    {
14451        int ret = mainformatlong(v, arg, p_str, writer);
14452        if (ret != 0)
14453            return ret;
14454        arg->sign = 1;
14455        break;
14456    }
14457
14458    case 'e':
14459    case 'E':
14460    case 'f':
14461    case 'F':
14462    case 'g':
14463    case 'G':
14464        if (arg->width == -1 && arg->prec == -1
14465            && !(arg->flags & (F_SIGN | F_BLANK)))
14466        {
14467            /* Fast path */
14468            if (formatfloat(v, arg, NULL, writer) == -1)
14469                return -1;
14470            return 1;
14471        }
14472
14473        arg->sign = 1;
14474        if (formatfloat(v, arg, p_str, NULL) == -1)
14475            return -1;
14476        break;
14477
14478    case 'c':
14479    {
14480        Py_UCS4 ch = formatchar(v);
14481        if (ch == (Py_UCS4) -1)
14482            return -1;
14483        if (arg->width == -1 && arg->prec == -1) {
14484            /* Fast path */
14485            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14486                return -1;
14487            return 1;
14488        }
14489        *p_str = PyUnicode_FromOrdinal(ch);
14490        break;
14491    }
14492
14493    default:
14494        PyErr_Format(PyExc_ValueError,
14495                     "unsupported format character '%c' (0x%x) "
14496                     "at index %zd",
14497                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14498                     (int)arg->ch,
14499                     ctx->fmtpos - 1);
14500        return -1;
14501    }
14502    if (*p_str == NULL)
14503        return -1;
14504    assert (PyUnicode_Check(*p_str));
14505    return 0;
14506}
14507
/* Write the already formatted string str into ctx->writer, applying the
   width, precision, sign and alternate-form ("#") options stored in arg.

   arg->width and arg->sign are modified while the output is laid out.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with '0' only when arg->sign is set and the "0" flag was given;
       everything else is padded with spaces. */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    /* No padding, no truncation and no forced sign: copy str unchanged */
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* str already starts with a sign: take it out of the body so
               that it can be placed before the padding if needed */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* no sign character will be written */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer */
    maxchar = writer->maxchar;
    /* The fill character only reaches the output when left padding is
       actually written */
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    buflen = arg->width;
    /* width was raised to len above and len does not count the sign
       character: reserve one more slot for it */
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed */
    if (arg->sign) {
        /* With '0' fill the sign goes before the padding; with ' ' fill it
           is written after the padding, further down */
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* the sign uses one column of the requested width */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* the two prefix characters are accounted for separately from the
           digits in both cases */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* nothing is left for the right padding step */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right with the fill character if needed */
    /* (right padding always uses spaces, whatever fill is) */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14662
14663/* Helper of PyUnicode_Format(): format one arg.
14664   Return 0 on success, raise an exception and return -1 on error. */
14665static int
14666unicode_format_arg(struct unicode_formatter_t *ctx)
14667{
14668    struct unicode_format_arg_t arg;
14669    PyObject *str;
14670    int ret;
14671
14672    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14673    arg.flags = 0;
14674    arg.width = -1;
14675    arg.prec = -1;
14676    arg.sign = 0;
14677    str = NULL;
14678
14679    ret = unicode_format_arg_parse(ctx, &arg);
14680    if (ret == -1)
14681        return -1;
14682
14683    ret = unicode_format_arg_format(ctx, &arg, &str);
14684    if (ret == -1)
14685        return -1;
14686
14687    if (ret != 1) {
14688        ret = unicode_format_arg_output(ctx, &arg, str);
14689        Py_DECREF(str);
14690        if (ret == -1)
14691            return -1;
14692    }
14693
14694    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14695        PyErr_SetString(PyExc_TypeError,
14696                        "not all arguments converted during string formatting");
14697        return -1;
14698    }
14699    return 0;
14700}
14701
14702PyObject *
14703PyUnicode_Format(PyObject *format, PyObject *args)
14704{
14705    struct unicode_formatter_t ctx;
14706
14707    if (format == NULL || args == NULL) {
14708        PyErr_BadInternalCall();
14709        return NULL;
14710    }
14711
14712    ctx.fmtstr = PyUnicode_FromObject(format);
14713    if (ctx.fmtstr == NULL)
14714        return NULL;
14715    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14716        Py_DECREF(ctx.fmtstr);
14717        return NULL;
14718    }
14719    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14720    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14721    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14722    ctx.fmtpos = 0;
14723
14724    _PyUnicodeWriter_Init(&ctx.writer);
14725    ctx.writer.min_length = ctx.fmtcnt + 100;
14726    ctx.writer.overallocate = 1;
14727
14728    if (PyTuple_Check(args)) {
14729        ctx.arglen = PyTuple_Size(args);
14730        ctx.argidx = 0;
14731    }
14732    else {
14733        ctx.arglen = -1;
14734        ctx.argidx = -2;
14735    }
14736    ctx.args_owned = 0;
14737    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14738        ctx.dict = args;
14739    else
14740        ctx.dict = NULL;
14741    ctx.args = args;
14742
14743    while (--ctx.fmtcnt >= 0) {
14744        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14745            Py_ssize_t nonfmtpos;
14746
14747            nonfmtpos = ctx.fmtpos++;
14748            while (ctx.fmtcnt >= 0 &&
14749                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14750                ctx.fmtpos++;
14751                ctx.fmtcnt--;
14752            }
14753            if (ctx.fmtcnt < 0) {
14754                ctx.fmtpos--;
14755                ctx.writer.overallocate = 0;
14756            }
14757
14758            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14759                                                nonfmtpos, ctx.fmtpos) < 0)
14760                goto onError;
14761        }
14762        else {
14763            ctx.fmtpos++;
14764            if (unicode_format_arg(&ctx) == -1)
14765                goto onError;
14766        }
14767    }
14768
14769    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14770        PyErr_SetString(PyExc_TypeError,
14771                        "not all arguments converted during string formatting");
14772        goto onError;
14773    }
14774
14775    if (ctx.args_owned) {
14776        Py_DECREF(ctx.args);
14777    }
14778    Py_DECREF(ctx.fmtstr);
14779    return _PyUnicodeWriter_Finish(&ctx.writer);
14780
14781  onError:
14782    Py_DECREF(ctx.fmtstr);
14783    _PyUnicodeWriter_Dealloc(&ctx.writer);
14784    if (ctx.args_owned) {
14785        Py_DECREF(ctx.args);
14786    }
14787    return NULL;
14788}
14789
14790static PyObject *
14791unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14792
14793static PyObject *
14794unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14795{
14796    PyObject *x = NULL;
14797    static char *kwlist[] = {"object", "encoding", "errors", 0};
14798    char *encoding = NULL;
14799    char *errors = NULL;
14800
14801    if (type != &PyUnicode_Type)
14802        return unicode_subtype_new(type, args, kwds);
14803    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14804                                     kwlist, &x, &encoding, &errors))
14805        return NULL;
14806    if (x == NULL)
14807        _Py_RETURN_UNICODE_EMPTY();
14808    if (encoding == NULL && errors == NULL)
14809        return PyObject_Str(x);
14810    else
14811        return PyUnicode_FromEncodedObject(x, encoding, errors);
14812}
14813
14814static PyObject *
14815unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14816{
14817    PyObject *unicode, *self;
14818    Py_ssize_t length, char_size;
14819    int share_wstr, share_utf8;
14820    unsigned int kind;
14821    void *data;
14822
14823    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14824
14825    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14826    if (unicode == NULL)
14827        return NULL;
14828    assert(_PyUnicode_CHECK(unicode));
14829    if (PyUnicode_READY(unicode) == -1) {
14830        Py_DECREF(unicode);
14831        return NULL;
14832    }
14833
14834    self = type->tp_alloc(type, 0);
14835    if (self == NULL) {
14836        Py_DECREF(unicode);
14837        return NULL;
14838    }
14839    kind = PyUnicode_KIND(unicode);
14840    length = PyUnicode_GET_LENGTH(unicode);
14841
14842    _PyUnicode_LENGTH(self) = length;
14843#ifdef Py_DEBUG
14844    _PyUnicode_HASH(self) = -1;
14845#else
14846    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14847#endif
14848    _PyUnicode_STATE(self).interned = 0;
14849    _PyUnicode_STATE(self).kind = kind;
14850    _PyUnicode_STATE(self).compact = 0;
14851    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14852    _PyUnicode_STATE(self).ready = 1;
14853    _PyUnicode_WSTR(self) = NULL;
14854    _PyUnicode_UTF8_LENGTH(self) = 0;
14855    _PyUnicode_UTF8(self) = NULL;
14856    _PyUnicode_WSTR_LENGTH(self) = 0;
14857    _PyUnicode_DATA_ANY(self) = NULL;
14858
14859    share_utf8 = 0;
14860    share_wstr = 0;
14861    if (kind == PyUnicode_1BYTE_KIND) {
14862        char_size = 1;
14863        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14864            share_utf8 = 1;
14865    }
14866    else if (kind == PyUnicode_2BYTE_KIND) {
14867        char_size = 2;
14868        if (sizeof(wchar_t) == 2)
14869            share_wstr = 1;
14870    }
14871    else {
14872        assert(kind == PyUnicode_4BYTE_KIND);
14873        char_size = 4;
14874        if (sizeof(wchar_t) == 4)
14875            share_wstr = 1;
14876    }
14877
14878    /* Ensure we won't overflow the length. */
14879    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14880        PyErr_NoMemory();
14881        goto onError;
14882    }
14883    data = PyObject_MALLOC((length + 1) * char_size);
14884    if (data == NULL) {
14885        PyErr_NoMemory();
14886        goto onError;
14887    }
14888
14889    _PyUnicode_DATA_ANY(self) = data;
14890    if (share_utf8) {
14891        _PyUnicode_UTF8_LENGTH(self) = length;
14892        _PyUnicode_UTF8(self) = data;
14893    }
14894    if (share_wstr) {
14895        _PyUnicode_WSTR_LENGTH(self) = length;
14896        _PyUnicode_WSTR(self) = (wchar_t *)data;
14897    }
14898
14899    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14900              kind * (length + 1));
14901    assert(_PyUnicode_CheckConsistency(self, 1));
14902#ifdef Py_DEBUG
14903    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14904#endif
14905    Py_DECREF(unicode);
14906    return self;
14907
14908onError:
14909    Py_DECREF(unicode);
14910    Py_DECREF(self);
14911    return NULL;
14912}
14913
/* Documentation string of the str type; PyUnicode_Type uses it as its
   tp_doc slot. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");

/* Forward declaration: PyUnicode_Type refers to unicode_iter as its
   tp_iter slot. */
static PyObject *unicode_iter(PyObject *seq);
14927
/* Type object of str.  Entries are positional, one per PyTypeObject slot
   in declaration order; slots given as 0 are left unset by this static
   initialiser. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14971
14972/* Initialize the Unicode implementation */
14973
14974int _PyUnicode_Init(void)
14975{
14976    /* XXX - move this array to unicodectype.c ? */
14977    Py_UCS2 linebreak[] = {
14978        0x000A, /* LINE FEED */
14979        0x000D, /* CARRIAGE RETURN */
14980        0x001C, /* FILE SEPARATOR */
14981        0x001D, /* GROUP SEPARATOR */
14982        0x001E, /* RECORD SEPARATOR */
14983        0x0085, /* NEXT LINE */
14984        0x2028, /* LINE SEPARATOR */
14985        0x2029, /* PARAGRAPH SEPARATOR */
14986    };
14987
14988    /* Init the implementation */
14989    _Py_INCREF_UNICODE_EMPTY();
14990    if (!unicode_empty)
14991        Py_FatalError("Can't create empty string");
14992    Py_DECREF(unicode_empty);
14993
14994    if (PyType_Ready(&PyUnicode_Type) < 0)
14995        Py_FatalError("Can't initialize 'unicode'");
14996
14997    /* initialize the linebreak bloom filter */
14998    bloom_linebreak = make_bloom_mask(
14999        PyUnicode_2BYTE_KIND, linebreak,
15000        Py_ARRAY_LENGTH(linebreak));
15001
15002    if (PyType_Ready(&EncodingMapType) < 0)
15003         Py_FatalError("Can't initialize encoding map type");
15004
15005    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15006        Py_FatalError("Can't initialize field name iterator type");
15007
15008    if (PyType_Ready(&PyFormatterIter_Type) < 0)
15009        Py_FatalError("Can't initialize formatter iter type");
15010
15011#ifdef HAVE_MBCS
15012    winver.dwOSVersionInfoSize = sizeof(winver);
15013    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
15014        PyErr_SetFromWindowsErr(0);
15015        return -1;
15016    }
15017#endif
15018    return 0;
15019}
15020
15021/* Finalize the Unicode implementation */
15022
/* Does nothing and always returns 0.  _PyUnicode_Fini() still calls it
   and discards the result. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15028
15029void
15030_PyUnicode_Fini(void)
15031{
15032    int i;
15033
15034    Py_CLEAR(unicode_empty);
15035
15036    for (i = 0; i < 256; i++)
15037        Py_CLEAR(unicode_latin1[i]);
15038    _PyUnicode_ClearStaticStrings();
15039    (void)PyUnicode_ClearFreeList();
15040}
15041
15042void
15043PyUnicode_InternInPlace(PyObject **p)
15044{
15045    PyObject *s = *p;
15046    PyObject *t;
15047#ifdef Py_DEBUG
15048    assert(s != NULL);
15049    assert(_PyUnicode_CHECK(s));
15050#else
15051    if (s == NULL || !PyUnicode_Check(s))
15052        return;
15053#endif
15054    /* If it's a subclass, we don't really know what putting
15055       it in the interned dict might do. */
15056    if (!PyUnicode_CheckExact(s))
15057        return;
15058    if (PyUnicode_CHECK_INTERNED(s))
15059        return;
15060    if (interned == NULL) {
15061        interned = PyDict_New();
15062        if (interned == NULL) {
15063            PyErr_Clear(); /* Don't leave an exception */
15064            return;
15065        }
15066    }
15067    /* It might be that the GetItem call fails even
15068       though the key is present in the dictionary,
15069       namely when this happens during a stack overflow. */
15070    Py_ALLOW_RECURSION
15071    t = PyDict_GetItem(interned, s);
15072    Py_END_ALLOW_RECURSION
15073
15074    if (t) {
15075        Py_INCREF(t);
15076        Py_DECREF(*p);
15077        *p = t;
15078        return;
15079    }
15080
15081    PyThreadState_GET()->recursion_critical = 1;
15082    if (PyDict_SetItem(interned, s, s) < 0) {
15083        PyErr_Clear();
15084        PyThreadState_GET()->recursion_critical = 0;
15085        return;
15086    }
15087    PyThreadState_GET()->recursion_critical = 0;
15088    /* The two references in interned are not counted by refcnt.
15089       The deallocator will take care of this */
15090    Py_REFCNT(s) -= 2;
15091    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15092}
15093
15094void
15095PyUnicode_InternImmortal(PyObject **p)
15096{
15097    PyUnicode_InternInPlace(p);
15098    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15099        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15100        Py_INCREF(*p);
15101    }
15102}
15103
15104PyObject *
15105PyUnicode_InternFromString(const char *cp)
15106{
15107    PyObject *s = PyUnicode_FromString(cp);
15108    if (s == NULL)
15109        return NULL;
15110    PyUnicode_InternInPlace(&s);
15111    return s;
15112}
15113
15114void
15115_Py_ReleaseInternedUnicodeStrings(void)
15116{
15117    PyObject *keys;
15118    PyObject *s;
15119    Py_ssize_t i, n;
15120    Py_ssize_t immortal_size = 0, mortal_size = 0;
15121
15122    if (interned == NULL || !PyDict_Check(interned))
15123        return;
15124    keys = PyDict_Keys(interned);
15125    if (keys == NULL || !PyList_Check(keys)) {
15126        PyErr_Clear();
15127        return;
15128    }
15129
15130    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15131       detector, interned unicode strings are not forcibly deallocated;
15132       rather, we give them their stolen references back, and then clear
15133       and DECREF the interned dict. */
15134
15135    n = PyList_GET_SIZE(keys);
15136    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15137            n);
15138    for (i = 0; i < n; i++) {
15139        s = PyList_GET_ITEM(keys, i);
15140        if (PyUnicode_READY(s) == -1) {
15141            assert(0 && "could not ready string");
15142            fprintf(stderr, "could not ready string\n");
15143        }
15144        switch (PyUnicode_CHECK_INTERNED(s)) {
15145        case SSTATE_NOT_INTERNED:
15146            /* XXX Shouldn't happen */
15147            break;
15148        case SSTATE_INTERNED_IMMORTAL:
15149            Py_REFCNT(s) += 1;
15150            immortal_size += PyUnicode_GET_LENGTH(s);
15151            break;
15152        case SSTATE_INTERNED_MORTAL:
15153            Py_REFCNT(s) += 2;
15154            mortal_size += PyUnicode_GET_LENGTH(s);
15155            break;
15156        default:
15157            Py_FatalError("Inconsistent interned string state.");
15158        }
15159        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15160    }
15161    fprintf(stderr, "total size of all interned strings: "
15162            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15163            "mortal/immortal\n", mortal_size, immortal_size);
15164    Py_DECREF(keys);
15165    PyDict_Clear(interned);
15166    Py_CLEAR(interned);
15167}
15168
15169
15170/********************* Unicode Iterator **************************/
15171
15172typedef struct {
15173    PyObject_HEAD
15174    Py_ssize_t it_index;
15175    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15176} unicodeiterobject;
15177
15178static void
15179unicodeiter_dealloc(unicodeiterobject *it)
15180{
15181    _PyObject_GC_UNTRACK(it);
15182    Py_XDECREF(it->it_seq);
15183    PyObject_GC_Del(it);
15184}
15185
15186static int
15187unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15188{
15189    Py_VISIT(it->it_seq);
15190    return 0;
15191}
15192
15193static PyObject *
15194unicodeiter_next(unicodeiterobject *it)
15195{
15196    PyObject *seq, *item;
15197
15198    assert(it != NULL);
15199    seq = it->it_seq;
15200    if (seq == NULL)
15201        return NULL;
15202    assert(_PyUnicode_CHECK(seq));
15203
15204    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15205        int kind = PyUnicode_KIND(seq);
15206        void *data = PyUnicode_DATA(seq);
15207        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15208        item = PyUnicode_FromOrdinal(chr);
15209        if (item != NULL)
15210            ++it->it_index;
15211        return item;
15212    }
15213
15214    Py_DECREF(seq);
15215    it->it_seq = NULL;
15216    return NULL;
15217}
15218
15219static PyObject *
15220unicodeiter_len(unicodeiterobject *it)
15221{
15222    Py_ssize_t len = 0;
15223    if (it->it_seq)
15224        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15225    return PyLong_FromSsize_t(len);
15226}
15227
15228PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15229
15230static PyObject *
15231unicodeiter_reduce(unicodeiterobject *it)
15232{
15233    if (it->it_seq != NULL) {
15234        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15235                             it->it_seq, it->it_index);
15236    } else {
15237        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15238        if (u == NULL)
15239            return NULL;
15240        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15241    }
15242}
15243
15244PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15245
15246static PyObject *
15247unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15248{
15249    Py_ssize_t index = PyLong_AsSsize_t(state);
15250    if (index == -1 && PyErr_Occurred())
15251        return NULL;
15252    if (it->it_seq != NULL) {
15253        if (index < 0)
15254            index = 0;
15255        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15256            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15257        it->it_index = index;
15258    }
15259    Py_RETURN_NONE;
15260}
15261
15262PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15263
15264static PyMethodDef unicodeiter_methods[] = {
15265    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15266     length_hint_doc},
15267    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15268     reduce_doc},
15269    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15270     setstate_doc},
15271    {NULL,      NULL}       /* sentinel */
15272};
15273
15274PyTypeObject PyUnicodeIter_Type = {
15275    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15276    "str_iterator",         /* tp_name */
15277    sizeof(unicodeiterobject),      /* tp_basicsize */
15278    0,                  /* tp_itemsize */
15279    /* methods */
15280    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15281    0,                  /* tp_print */
15282    0,                  /* tp_getattr */
15283    0,                  /* tp_setattr */
15284    0,                  /* tp_reserved */
15285    0,                  /* tp_repr */
15286    0,                  /* tp_as_number */
15287    0,                  /* tp_as_sequence */
15288    0,                  /* tp_as_mapping */
15289    0,                  /* tp_hash */
15290    0,                  /* tp_call */
15291    0,                  /* tp_str */
15292    PyObject_GenericGetAttr,        /* tp_getattro */
15293    0,                  /* tp_setattro */
15294    0,                  /* tp_as_buffer */
15295    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15296    0,                  /* tp_doc */
15297    (traverseproc)unicodeiter_traverse, /* tp_traverse */
15298    0,                  /* tp_clear */
15299    0,                  /* tp_richcompare */
15300    0,                  /* tp_weaklistoffset */
15301    PyObject_SelfIter,          /* tp_iter */
15302    (iternextfunc)unicodeiter_next,     /* tp_iternext */
15303    unicodeiter_methods,            /* tp_methods */
15304    0,
15305};
15306
15307static PyObject *
15308unicode_iter(PyObject *seq)
15309{
15310    unicodeiterobject *it;
15311
15312    if (!PyUnicode_Check(seq)) {
15313        PyErr_BadInternalCall();
15314        return NULL;
15315    }
15316    if (PyUnicode_READY(seq) == -1)
15317        return NULL;
15318    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15319    if (it == NULL)
15320        return NULL;
15321    it->it_index = 0;
15322    Py_INCREF(seq);
15323    it->it_seq = seq;
15324    _PyObject_GC_TRACK(it);
15325    return (PyObject *)it;
15326}
15327
15328
15329size_t
15330Py_UNICODE_strlen(const Py_UNICODE *u)
15331{
15332    int res = 0;
15333    while(*u++)
15334        res++;
15335    return res;
15336}
15337
15338Py_UNICODE*
15339Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15340{
15341    Py_UNICODE *u = s1;
15342    while ((*u++ = *s2++));
15343    return s1;
15344}
15345
15346Py_UNICODE*
15347Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15348{
15349    Py_UNICODE *u = s1;
15350    while ((*u++ = *s2++))
15351        if (n-- == 0)
15352            break;
15353    return s1;
15354}
15355
15356Py_UNICODE*
15357Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15358{
15359    Py_UNICODE *u1 = s1;
15360    u1 += Py_UNICODE_strlen(u1);
15361    Py_UNICODE_strcpy(u1, s2);
15362    return s1;
15363}
15364
15365int
15366Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15367{
15368    while (*s1 && *s2 && *s1 == *s2)
15369        s1++, s2++;
15370    if (*s1 && *s2)
15371        return (*s1 < *s2) ? -1 : +1;
15372    if (*s1)
15373        return 1;
15374    if (*s2)
15375        return -1;
15376    return 0;
15377}
15378
15379int
15380Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15381{
15382    Py_UNICODE u1, u2;
15383    for (; n != 0; n--) {
15384        u1 = *s1;
15385        u2 = *s2;
15386        if (u1 != u2)
15387            return (u1 < u2) ? -1 : +1;
15388        if (u1 == '\0')
15389            return 0;
15390        s1++;
15391        s2++;
15392    }
15393    return 0;
15394}
15395
15396Py_UNICODE*
15397Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15398{
15399    const Py_UNICODE *p;
15400    for (p = s; *p; p++)
15401        if (*p == c)
15402            return (Py_UNICODE*)p;
15403    return NULL;
15404}
15405
15406Py_UNICODE*
15407Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15408{
15409    const Py_UNICODE *p;
15410    p = s + Py_UNICODE_strlen(s);
15411    while (p != s) {
15412        p--;
15413        if (*p == c)
15414            return (Py_UNICODE*)p;
15415    }
15416    return NULL;
15417}
15418
15419Py_UNICODE*
15420PyUnicode_AsUnicodeCopy(PyObject *unicode)
15421{
15422    Py_UNICODE *u, *copy;
15423    Py_ssize_t len, size;
15424
15425    if (!PyUnicode_Check(unicode)) {
15426        PyErr_BadArgument();
15427        return NULL;
15428    }
15429    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15430    if (u == NULL)
15431        return NULL;
15432    /* Ensure we won't overflow the size. */
15433    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
15434        PyErr_NoMemory();
15435        return NULL;
15436    }
15437    size = len + 1; /* copy the null character */
15438    size *= sizeof(Py_UNICODE);
15439    copy = PyMem_Malloc(size);
15440    if (copy == NULL) {
15441        PyErr_NoMemory();
15442        return NULL;
15443    }
15444    memcpy(copy, u, size);
15445    return copy;
15446}
15447
15448/* A _string module, to export formatter_parser and formatter_field_name_split
15449   to the string.Formatter class implemented in Python. */
15450
15451static PyMethodDef _string_methods[] = {
15452    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15453     METH_O, PyDoc_STR("split the argument as a field name")},
15454    {"formatter_parser", (PyCFunction) formatter_parser,
15455     METH_O, PyDoc_STR("parse the argument as a format string")},
15456    {NULL, NULL}
15457};
15458
15459static struct PyModuleDef _string_module = {
15460    PyModuleDef_HEAD_INIT,
15461    "_string",
15462    PyDoc_STR("string helper module"),
15463    0,
15464    _string_methods,
15465    NULL,
15466    NULL,
15467    NULL,
15468    NULL
15469};
15470
15471PyMODINIT_FUNC
15472PyInit__string(void)
15473{
15474    return PyModule_Create(&_string_module);
15475}
15476
15477
15478#ifdef __cplusplus
15479}
15480#endif
15481