unicodeobject.c revision 315aa404030f425a8bf7fdb5a5275c118555bc37
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/*[clinic input]
51class str "PyUnicodeObject *" "&PyUnicode_Type"
52[clinic start generated code]*/
53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
54
55/* --- Globals ------------------------------------------------------------
56
57NOTE: In the interpreter's initialization phase, some globals are currently
58      initialized dynamically as needed. In the process Unicode objects may
59      be created before the Unicode type is ready.
60
61*/
62
63
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Highest valid Unicode code point (as of Unicode 6.0): U+10FFFF
   (1,114,111 decimal). */
#define MAX_UNICODE 0x10ffff
70
/* Sanity check used inside the assert()s of the accessor macros in this file.
   Debug builds run _PyUnicode_CheckConsistency() with check_content == 0;
   other builds only perform the PyUnicode_Check() type check. */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
76
/* Raw lvalue access to the utf8 field; op is cast to PyCompactUnicodeObject*
   without any check. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* UTF-8 pointer of a ready string: for a compact ASCII string this is the
   memory located right after the PyASCIIObject header, otherwise the value
   of the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Raw lvalue access to the utf8_length field (no checks). */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* UTF-8 length of a ready string: the character length for a compact ASCII
   string, otherwise the value of the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Raw field accessors.  Each one expands to an lvalue, so it can be used for
   both reading and assignment.  Only _PyUnicode_KIND and
   _PyUnicode_GET_LENGTH assert on the object first; the others perform no
   check at all. */

/* wstr pointer (PyASCIIObject field). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* wstr_length field (PyCompactUnicodeObject field). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* length field (PyASCIIObject field), unchecked. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* The whole state bit-field struct (interned, kind, compact, ready, ascii). */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Cached hash field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length field, after asserting _PyUnicode_CHECK(op). */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* data.any pointer of a PyUnicodeObject, unchecked. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
111
/* File-local replacement for PyUnicode_READY(): evaluates to 0 when the
   string is already ready, otherwise to the result of _PyUnicode_Ready(op).
   The preceding assert uses _PyUnicode_CHECK, which is the consistency check
   in debug builds. */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
118
/* True if the utf8 pointer of op aliases the character data returned by
   PyUnicode_DATA(op) instead of pointing at a separate buffer.  Must not be
   used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op aliases the character data returned by
   PyUnicode_DATA(op).  The macro refers only to its own argument, so it works
   whatever the caller's variable is called. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* True if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, its utf8
   pointer is non-NULL, and that pointer differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* True if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is non-NULL and either the
   string is not ready yet, or wstr differs from PyUnicode_DATA(op). */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Copy a run of characters while converting their storage type.

   from_type and to_type must be valid type names.  begin and end delimit the
   source characters (pointers to "from_type"); to points to the destination
   buffer of "to_type" elements, which must have room for (end - begin)
   entries.  Every source character is cast to to_type individually.

   The bulk of the input is handled four characters per iteration; the
   remaining (at most three) characters are copied one by one. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        for (; _iter < (_unrolled_end); _iter += 4, _to += 4) { \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  It stays NULL
   until it is first needed (see _Py_INCREF_UNICODE_EMPTY). */
static PyObject *unicode_empty = NULL;
177
/* Take a new reference to the shared empty string, creating it on first use.
   When the singleton has to be created, PyUnicode_New(0, 0) provides the
   reference held by the unicode_empty global and the extra Py_INCREF is the
   reference handed to the caller.  If PyUnicode_New() fails, unicode_empty
   is left NULL, so callers must check it afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  Returns NULL when the singleton could not be created (see
   _Py_INCREF_UNICODE_EMPTY). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
196
/* Forward declaration: write a single character through a _PyUnicodeWriter.
   The definition is not in this part of the file. */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings (head pointer; NULL while the list is empty). */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point (0..255); an entry is
   NULL until a string for that character has been cached (see
   unicode_result_ready()). */
static PyObject *unicode_latin1[256] = {NULL};
207
/* Fast detection of the most frequent whitespace characters.

   Lookup table covering the ASCII range (128 entries): entry c is 1 when
   code point c is one of the whitespace characters named below, else 0. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x07 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08 - 0x0F:
       0x09 CHARACTER TABULATION, 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x17 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18 - 0x1F:
       0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x27: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    /* 0x28 - 0x2F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x37 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x38 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x47 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x48 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x57 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x58 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x67 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x68 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x77 */
    0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x78 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0
};
238
/* Forward declarations of file-local helpers that are used before their
   definitions. */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors taking a buffer of UCS1/UCS2/UCS4 characters and its length
   (in characters). */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error handling helpers; their definitions are outside this part of
   the file. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
264
/* Fast detection of line break characters in the ASCII range.

   Lookup table with 128 entries: entry c is 1 when code point c is one of
   the line break characters named below, else 0.  The table is only ever
   read, so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294   This function is kept for backward compatibility with the old API. */
295Py_UNICODE
296PyUnicode_GetMax(void)
297{
298#ifdef Py_UNICODE_WIDE
299    return 0x10FFFF;
300#else
301    /* This is actually an illegal character, so it should
302       not be passed to unichr. */
303    return 0xFFFF;
304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-only invariant checker for a Unicode object.

   Verifies, through assert(), that the state flags, the data/utf8/wstr
   pointers and the cached lengths of op agree with each other for its
   representation (compact ASCII, compact, legacy not-ready, legacy ready).
   When check_content is non-zero and the string is not a not-ready (wchar)
   string, it also scans the characters to verify that the narrowest possible
   kind is used and that the data is null-terminated.

   Always returns 1 so it can itself be wrapped in assert(); any violation
   aborts via a failed assertion instead of being reported to the caller. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject fields exist. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: the characters follow the compact header. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Legacy string that is not ready: only wstr is set. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Legacy string that has been made ready. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    /* ASCII data doubles as the UTF-8 representation. */
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr may only alias the data when the kind has the same width as
           wchar_t; in that case it must alias it. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        data = PyUnicode_DATA(ascii);
        /* Find the largest code point stored in the string. */
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character after the last one must be the null terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427    Py_ssize_t len;
428
429    len = _PyUnicode_WSTR_LENGTH(unicode);
430    if (len == 0) {
431        Py_DECREF(unicode);
432        _Py_RETURN_UNICODE_EMPTY();
433    }
434
435    if (len == 1) {
436        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
437        if ((Py_UCS4)ch < 256) {
438            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439            Py_DECREF(unicode);
440            return latin1_char;
441        }
442    }
443
444    if (_PyUnicode_Ready(unicode) < 0) {
445        Py_DECREF(unicode);
446        return NULL;
447    }
448#else
449    assert(Py_REFCNT(unicode) == 1);
450
451    /* don't make the result ready in debug mode to ensure that the caller
452       makes the string ready before using it */
453    assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455    return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461    Py_ssize_t length;
462
463    length = PyUnicode_GET_LENGTH(unicode);
464    if (length == 0) {
465        if (unicode != unicode_empty) {
466            Py_DECREF(unicode);
467            _Py_RETURN_UNICODE_EMPTY();
468        }
469        return unicode_empty;
470    }
471
472    if (length == 1) {
473        void *data = PyUnicode_DATA(unicode);
474        int kind = PyUnicode_KIND(unicode);
475        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
476        if (ch < 256) {
477            PyObject *latin1_char = unicode_latin1[ch];
478            if (latin1_char != NULL) {
479                if (unicode != latin1_char) {
480                    Py_INCREF(latin1_char);
481                    Py_DECREF(unicode);
482                }
483                return latin1_char;
484            }
485            else {
486                assert(_PyUnicode_CheckConsistency(unicode, 1));
487                Py_INCREF(unicode);
488                unicode_latin1[ch] = unicode;
489                return unicode;
490            }
491        }
492    }
493
494    assert(_PyUnicode_CheckConsistency(unicode, 1));
495    return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501    assert(_PyUnicode_CHECK(unicode));
502    if (PyUnicode_IS_READY(unicode))
503        return unicode_result_ready(unicode);
504    else
505        return unicode_result_wchar(unicode);
506}
507
508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511    if (PyUnicode_CheckExact(unicode)) {
512        if (PyUnicode_READY(unicode) == -1)
513            return NULL;
514        Py_INCREF(unicode);
515        return unicode;
516    }
517    else
518        /* Subtype -- return genuine unicode string with the same value. */
519        return _PyUnicode_Copy(unicode);
520}
521
#ifdef HAVE_MBCS
/* Windows version information, only declared when MBCS support is compiled
   in.  NOTE(review): presumably filled in during module initialization; the
   code that sets it is not visible in this part of the file. */
static OSVERSIONINFOEX winver;
#endif
525
526/* --- Bloom Filters ----------------------------------------------------- */
527
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7, depending on the width of long) of
   each unicode character as the bit index. */
531
532/* the linebreak mask is set up by Unicode_Init below */
533
/* Number of bits used by the bloom mask: the largest of 128, 64 and 32 that
   does not exceed LONG_BIT.  Platforms where LONG_BIT is below 32 are
   rejected at compile time. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif
543
/* Integer type holding a bloom filter bitmask. */
#define BLOOM_MASK unsigned long

/* Filter for line break characters.  It starts with all bits set, so every
   character passes the filter until a real mask is installed. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;

/* Non-zero if the bit selected by the low bits of ch is set in mask, i.e. ch
   *may* be in the set the mask was built from; zero means it definitely is
   not.  Both arguments are parenthesized so that any expression can be
   passed as the mask. */
#define BLOOM(mask, ch)     (((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* True if ch is a line break character.  Characters below 128 are looked up
   in ascii_linebreak[]; others are tested against the bloom filter first and
   then confirmed with Py_UNICODE_ISLINEBREAK().  Note that ch is evaluated
   more than once, so it must not have side effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
553
554Py_LOCAL_INLINE(BLOOM_MASK)
555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
556{
557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
558    do {                                               \
559        TYPE *data = (TYPE *)PTR;                      \
560        TYPE *end = data + LEN;                        \
561        Py_UCS4 ch;                                    \
562        for (; data != end; data++) {                  \
563            ch = *data;                                \
564            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565        }                                              \
566        break;                                         \
567    } while (0)
568
569    /* calculate simple bloom-style bitmask for a given unicode string */
570
571    BLOOM_MASK mask;
572
573    mask = 0;
574    switch (kind) {
575    case PyUnicode_1BYTE_KIND:
576        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577        break;
578    case PyUnicode_2BYTE_KIND:
579        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580        break;
581    case PyUnicode_4BYTE_KIND:
582        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583        break;
584    default:
585        assert(0);
586    }
587    return mask;
588
589#undef BLOOM_UPDATE
590}
591
592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
610#include "stringlib/replace.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
621#include "stringlib/replace.h"
622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
632#include "stringlib/replace.h"
633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
641#include "stringlib/undef.h"
642
643/* --- Unicode Object ----------------------------------------------------- */
644
/* Forward declaration.  fixfct is a callback that receives a string object
   and returns a Py_UCS4; the definition of fixup() is not in this part of
   the file. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
647
648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649                                     Py_ssize_t size, Py_UCS4 ch,
650                                     int direction)
651{
652    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654    switch (kind) {
655    case PyUnicode_1BYTE_KIND:
656        {
657            Py_UCS1 ch1 = (Py_UCS1) ch;
658            if (ch1 == ch)
659                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660            else
661                return -1;
662        }
663    case PyUnicode_2BYTE_KIND:
664        {
665            Py_UCS2 ch2 = (Py_UCS2) ch;
666            if (ch2 == ch)
667                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668            else
669                return -1;
670        }
671    case PyUnicode_4BYTE_KIND:
672        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673    default:
674        assert(0);
675        return -1;
676    }
677}
678
#ifdef Py_DEBUG
/* Debug helper: overwrite the characters added by a resize with 0xFF bytes
   so that code relying on uninitialized content is detected earlier.

   Only the range [old_length, current length) is filled; nothing happens
   when the string did not grow.  _PyUnicode_CheckConsistency(str, 1) detects
   the resulting invalid characters, at least for ASCII and UCS-4 strings:
   U+00FF is invalid in ASCII and U+FFFFFFFF is an invalid character in
   Unicode 6.0. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        /* The kind value is used as the character width in bytes. */
        Py_UCS1 *first_new = base + old_length * char_width;
        memset(first_new, 0xff, (new_length - old_length) * char_width);
    }
}
#endif
697
698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701    Py_ssize_t char_size;
702    Py_ssize_t struct_size;
703    Py_ssize_t new_size;
704    int share_wstr;
705    PyObject *new_unicode;
706#ifdef Py_DEBUG
707    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
710    assert(unicode_modifiable(unicode));
711    assert(PyUnicode_IS_READY(unicode));
712    assert(PyUnicode_IS_COMPACT(unicode));
713
714    char_size = PyUnicode_KIND(unicode);
715    if (PyUnicode_IS_ASCII(unicode))
716        struct_size = sizeof(PyASCIIObject);
717    else
718        struct_size = sizeof(PyCompactUnicodeObject);
719    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
720
721    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722        PyErr_NoMemory();
723        return NULL;
724    }
725    new_size = (struct_size + (length + 1) * char_size);
726
727    _Py_DEC_REFTOTAL;
728    _Py_ForgetReference(unicode);
729
730    new_unicode = (PyObject *)PyObject_REALLOC(unicode, new_size);
731    if (new_unicode == NULL) {
732        _Py_NewReference(unicode);
733        PyErr_NoMemory();
734        return NULL;
735    }
736    unicode = new_unicode;
737    _Py_NewReference(unicode);
738
739    _PyUnicode_LENGTH(unicode) = length;
740    if (share_wstr) {
741        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
742        if (!PyUnicode_IS_ASCII(unicode))
743            _PyUnicode_WSTR_LENGTH(unicode) = length;
744    }
745    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746        PyObject_DEL(_PyUnicode_WSTR(unicode));
747        _PyUnicode_WSTR(unicode) = NULL;
748    }
749#ifdef Py_DEBUG
750    unicode_fill_invalid(unicode, old_length);
751#endif
752    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753                    length, 0);
754    assert(_PyUnicode_CheckConsistency(unicode, 0));
755    return unicode;
756}
757
/* Resize a non-compact (legacy) string to `length` characters without moving
   the object itself: only the separately allocated buffers are reallocated.

   The string must not be compact and must have a reference count of 1
   (asserted).  For a ready string the canonical data buffer is resized; a
   wstr buffer that does not alias the data is then resized as well.  For a
   string that is not ready, only the wstr buffer is resized.

   Returns 0 on success, or -1 with a MemoryError set when a size would
   overflow or a reallocation fails. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        /* The kind value is used as the character width in bytes. */
        char_size = PyUnicode_KIND(unicode);
        /* Remember which cached pointers alias the data so that they can be
           re-pointed after the realloc. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Guard (length + 1) * char_size against overflow. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* Release a private UTF-8 buffer; the pointer and length are reset. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Write the null terminator after the last character. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a private wstr buffer also needs resizing. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        /* The object still points at the old wstr buffer. */
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
836
837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840    Py_ssize_t copy_length;
841    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
842        PyObject *copy;
843
844        if (PyUnicode_READY(unicode) == -1)
845            return NULL;
846
847        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848        if (copy == NULL)
849            return NULL;
850
851        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
852        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
853        return copy;
854    }
855    else {
856        PyObject *w;
857
858        w = (PyObject*)_PyUnicode_New(length);
859        if (w == NULL)
860            return NULL;
861        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862        copy_length = Py_MIN(copy_length, length);
863        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864                  copy_length * sizeof(wchar_t));
865        return w;
866    }
867}
868
869/* We allocate one more byte to make sure the string is
870   Ux0000 terminated; some code (e.g. new_identifier)
871   relies on that.
872
873   XXX This allocator could further be enhanced by assuring that the
874   free list never reduces its size below 1.
875
876*/
877
878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
880{
881    PyUnicodeObject *unicode;
882    size_t new_size;
883
884    /* Optimization for empty strings */
885    if (length == 0 && unicode_empty != NULL) {
886        Py_INCREF(unicode_empty);
887        return (PyUnicodeObject*)unicode_empty;
888    }
889
890    /* Ensure we won't overflow the size. */
891    if (length > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
892        return (PyUnicodeObject *)PyErr_NoMemory();
893    }
894    if (length < 0) {
895        PyErr_SetString(PyExc_SystemError,
896                        "Negative size passed to _PyUnicode_New");
897        return NULL;
898    }
899
900    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901    if (unicode == NULL)
902        return NULL;
903    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
904
905    _PyUnicode_WSTR_LENGTH(unicode) = length;
906    _PyUnicode_HASH(unicode) = -1;
907    _PyUnicode_STATE(unicode).interned = 0;
908    _PyUnicode_STATE(unicode).kind = 0;
909    _PyUnicode_STATE(unicode).compact = 0;
910    _PyUnicode_STATE(unicode).ready = 0;
911    _PyUnicode_STATE(unicode).ascii = 0;
912    _PyUnicode_DATA_ANY(unicode) = NULL;
913    _PyUnicode_LENGTH(unicode) = 0;
914    _PyUnicode_UTF8(unicode) = NULL;
915    _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918    if (!_PyUnicode_WSTR(unicode)) {
919        Py_DECREF(unicode);
920        PyErr_NoMemory();
921        return NULL;
922    }
923
924    /* Initialize the first element to guard against cases where
925     * the caller fails before initializing str -- unicode_resize()
926     * reads str[0], and the Keep-Alive optimization can keep memory
927     * allocated for str alive across a call to unicode_dealloc(unicode).
928     * We don't want unicode_resize to read uninitialized memory in
929     * that case.
930     */
931    _PyUnicode_WSTR(unicode)[0] = 0;
932    _PyUnicode_WSTR(unicode)[length] = 0;
933
934    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
935    return unicode;
936}
937
938static const char*
939unicode_kind_name(PyObject *unicode)
940{
941    /* don't check consistency: unicode_kind_name() is called from
942       _PyUnicode_Dump() */
943    if (!PyUnicode_IS_COMPACT(unicode))
944    {
945        if (!PyUnicode_IS_READY(unicode))
946            return "wstr";
947        switch (PyUnicode_KIND(unicode))
948        {
949        case PyUnicode_1BYTE_KIND:
950            if (PyUnicode_IS_ASCII(unicode))
951                return "legacy ascii";
952            else
953                return "legacy latin1";
954        case PyUnicode_2BYTE_KIND:
955            return "legacy UCS2";
956        case PyUnicode_4BYTE_KIND:
957            return "legacy UCS4";
958        default:
959            return "<legacy invalid kind>";
960        }
961    }
962    assert(PyUnicode_IS_READY(unicode));
963    switch (PyUnicode_KIND(unicode)) {
964    case PyUnicode_1BYTE_KIND:
965        if (PyUnicode_IS_ASCII(unicode))
966            return "ascii";
967        else
968            return "latin1";
969    case PyUnicode_2BYTE_KIND:
970        return "UCS2";
971    case PyUnicode_4BYTE_KIND:
972        return "UCS4";
973    default:
974        return "<invalid compact kind>";
975    }
976}
977
978#ifdef Py_DEBUG
979/* Functions wrapping macros for use in debugger */
char *_PyUnicode_utf8(void *unicode)
{
    /* Function form of the PyUnicode_UTF8() macro. */
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
void *_PyUnicode_compact_data(void *unicode)
{
    /* Function form of the _PyUnicode_COMPACT_DATA() macro. */
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988    printf("obj %p\n", unicode);
989    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994    return PyUnicode_DATA(unicode);
995}
996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000    PyASCIIObject *ascii = (PyASCIIObject *)op;
1001    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003    void *data;
1004
1005    if (ascii->state.compact)
1006    {
1007        if (ascii->state.ascii)
1008            data = (ascii + 1);
1009        else
1010            data = (compact + 1);
1011    }
1012    else
1013        data = unicode->data.any;
1014    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015           unicode_kind_name(op), ascii->length);
1016
1017    if (ascii->wstr == data)
1018        printf("shared ");
1019    printf("wstr=%p", ascii->wstr);
1020
1021    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1022        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1023        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024            printf("shared ");
1025        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026               compact->utf8, compact->utf8_length);
1027    }
1028    printf(", data=%p\n", data);
1029}
1030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035    PyObject *obj;
1036    PyCompactUnicodeObject *unicode;
1037    void *data;
1038    enum PyUnicode_Kind kind;
1039    int is_sharing, is_ascii;
1040    Py_ssize_t char_size;
1041    Py_ssize_t struct_size;
1042
1043    /* Optimization for empty strings */
1044    if (size == 0 && unicode_empty != NULL) {
1045        Py_INCREF(unicode_empty);
1046        return unicode_empty;
1047    }
1048
1049    is_ascii = 0;
1050    is_sharing = 0;
1051    struct_size = sizeof(PyCompactUnicodeObject);
1052    if (maxchar < 128) {
1053        kind = PyUnicode_1BYTE_KIND;
1054        char_size = 1;
1055        is_ascii = 1;
1056        struct_size = sizeof(PyASCIIObject);
1057    }
1058    else if (maxchar < 256) {
1059        kind = PyUnicode_1BYTE_KIND;
1060        char_size = 1;
1061    }
1062    else if (maxchar < 65536) {
1063        kind = PyUnicode_2BYTE_KIND;
1064        char_size = 2;
1065        if (sizeof(wchar_t) == 2)
1066            is_sharing = 1;
1067    }
1068    else {
1069        if (maxchar > MAX_UNICODE) {
1070            PyErr_SetString(PyExc_SystemError,
1071                            "invalid maximum character passed to PyUnicode_New");
1072            return NULL;
1073        }
1074        kind = PyUnicode_4BYTE_KIND;
1075        char_size = 4;
1076        if (sizeof(wchar_t) == 4)
1077            is_sharing = 1;
1078    }
1079
1080    /* Ensure we won't overflow the size. */
1081    if (size < 0) {
1082        PyErr_SetString(PyExc_SystemError,
1083                        "Negative size passed to PyUnicode_New");
1084        return NULL;
1085    }
1086    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087        return PyErr_NoMemory();
1088
1089    /* Duplicated allocation code from _PyObject_New() instead of a call to
1090     * PyObject_New() so we are able to allocate space for the object and
1091     * it's data buffer.
1092     */
1093    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094    if (obj == NULL)
1095        return PyErr_NoMemory();
1096    obj = PyObject_INIT(obj, &PyUnicode_Type);
1097    if (obj == NULL)
1098        return NULL;
1099
1100    unicode = (PyCompactUnicodeObject *)obj;
1101    if (is_ascii)
1102        data = ((PyASCIIObject*)obj) + 1;
1103    else
1104        data = unicode + 1;
1105    _PyUnicode_LENGTH(unicode) = size;
1106    _PyUnicode_HASH(unicode) = -1;
1107    _PyUnicode_STATE(unicode).interned = 0;
1108    _PyUnicode_STATE(unicode).kind = kind;
1109    _PyUnicode_STATE(unicode).compact = 1;
1110    _PyUnicode_STATE(unicode).ready = 1;
1111    _PyUnicode_STATE(unicode).ascii = is_ascii;
1112    if (is_ascii) {
1113        ((char*)data)[size] = 0;
1114        _PyUnicode_WSTR(unicode) = NULL;
1115    }
1116    else if (kind == PyUnicode_1BYTE_KIND) {
1117        ((char*)data)[size] = 0;
1118        _PyUnicode_WSTR(unicode) = NULL;
1119        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1120        unicode->utf8 = NULL;
1121        unicode->utf8_length = 0;
1122    }
1123    else {
1124        unicode->utf8 = NULL;
1125        unicode->utf8_length = 0;
1126        if (kind == PyUnicode_2BYTE_KIND)
1127            ((Py_UCS2*)data)[size] = 0;
1128        else /* kind == PyUnicode_4BYTE_KIND */
1129            ((Py_UCS4*)data)[size] = 0;
1130        if (is_sharing) {
1131            _PyUnicode_WSTR_LENGTH(unicode) = size;
1132            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133        }
1134        else {
1135            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136            _PyUnicode_WSTR(unicode) = NULL;
1137        }
1138    }
1139#ifdef Py_DEBUG
1140    unicode_fill_invalid((PyObject*)unicode, 0);
1141#endif
1142    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1143    return obj;
1144}
1145
1146#if SIZEOF_WCHAR_T == 2
1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1148   will decode surrogate pairs, the other conversions are implemented as macros
1149   for efficiency.
1150
1151   This function assumes that unicode can hold one more code point than wstr
1152   characters for a terminating null character. */
1153static void
1154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1155                              PyObject *unicode)
1156{
1157    const wchar_t *iter;
1158    Py_UCS4 *ucs4_out;
1159
1160    assert(unicode != NULL);
1161    assert(_PyUnicode_CHECK(unicode));
1162    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1163    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1164
1165    for (iter = begin; iter < end; ) {
1166        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1167                           _PyUnicode_GET_LENGTH(unicode)));
1168        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1169            && (iter+1) < end
1170            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1171        {
1172            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1173            iter += 2;
1174        }
1175        else {
1176            *ucs4_out++ = *iter;
1177            iter++;
1178        }
1179    }
1180    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1181                        _PyUnicode_GET_LENGTH(unicode)));
1182
1183}
1184#endif
1185
1186static int
1187unicode_check_modifiable(PyObject *unicode)
1188{
1189    if (!unicode_modifiable(unicode)) {
1190        PyErr_SetString(PyExc_SystemError,
1191                        "Cannot modify a string currently used");
1192        return -1;
1193    }
1194    return 0;
1195}
1196
1197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199                 PyObject *from, Py_ssize_t from_start,
1200                 Py_ssize_t how_many, int check_maxchar)
1201{
1202    unsigned int from_kind, to_kind;
1203    void *from_data, *to_data;
1204
1205    assert(0 <= how_many);
1206    assert(0 <= from_start);
1207    assert(0 <= to_start);
1208    assert(PyUnicode_Check(from));
1209    assert(PyUnicode_IS_READY(from));
1210    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1211
1212    assert(PyUnicode_Check(to));
1213    assert(PyUnicode_IS_READY(to));
1214    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
1216    if (how_many == 0)
1217        return 0;
1218
1219    from_kind = PyUnicode_KIND(from);
1220    from_data = PyUnicode_DATA(from);
1221    to_kind = PyUnicode_KIND(to);
1222    to_data = PyUnicode_DATA(to);
1223
1224#ifdef Py_DEBUG
1225    if (!check_maxchar
1226        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227    {
1228        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229        Py_UCS4 ch;
1230        Py_ssize_t i;
1231        for (i=0; i < how_many; i++) {
1232            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233            assert(ch <= to_maxchar);
1234        }
1235    }
1236#endif
1237
1238    if (from_kind == to_kind) {
1239        if (check_maxchar
1240            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241        {
1242            /* Writing Latin-1 characters into an ASCII string requires to
1243               check that all written characters are pure ASCII */
1244            Py_UCS4 max_char;
1245            max_char = ucs1lib_find_max_char(from_data,
1246                                             (Py_UCS1*)from_data + how_many);
1247            if (max_char >= 128)
1248                return -1;
1249        }
1250        Py_MEMCPY((char*)to_data + to_kind * to_start,
1251                  (char*)from_data + from_kind * from_start,
1252                  to_kind * how_many);
1253    }
1254    else if (from_kind == PyUnicode_1BYTE_KIND
1255             && to_kind == PyUnicode_2BYTE_KIND)
1256    {
1257        _PyUnicode_CONVERT_BYTES(
1258            Py_UCS1, Py_UCS2,
1259            PyUnicode_1BYTE_DATA(from) + from_start,
1260            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261            PyUnicode_2BYTE_DATA(to) + to_start
1262            );
1263    }
1264    else if (from_kind == PyUnicode_1BYTE_KIND
1265             && to_kind == PyUnicode_4BYTE_KIND)
1266    {
1267        _PyUnicode_CONVERT_BYTES(
1268            Py_UCS1, Py_UCS4,
1269            PyUnicode_1BYTE_DATA(from) + from_start,
1270            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271            PyUnicode_4BYTE_DATA(to) + to_start
1272            );
1273    }
1274    else if (from_kind == PyUnicode_2BYTE_KIND
1275             && to_kind == PyUnicode_4BYTE_KIND)
1276    {
1277        _PyUnicode_CONVERT_BYTES(
1278            Py_UCS2, Py_UCS4,
1279            PyUnicode_2BYTE_DATA(from) + from_start,
1280            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281            PyUnicode_4BYTE_DATA(to) + to_start
1282            );
1283    }
1284    else {
1285        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
1287        if (!check_maxchar) {
1288            if (from_kind == PyUnicode_2BYTE_KIND
1289                && to_kind == PyUnicode_1BYTE_KIND)
1290            {
1291                _PyUnicode_CONVERT_BYTES(
1292                    Py_UCS2, Py_UCS1,
1293                    PyUnicode_2BYTE_DATA(from) + from_start,
1294                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295                    PyUnicode_1BYTE_DATA(to) + to_start
1296                    );
1297            }
1298            else if (from_kind == PyUnicode_4BYTE_KIND
1299                     && to_kind == PyUnicode_1BYTE_KIND)
1300            {
1301                _PyUnicode_CONVERT_BYTES(
1302                    Py_UCS4, Py_UCS1,
1303                    PyUnicode_4BYTE_DATA(from) + from_start,
1304                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305                    PyUnicode_1BYTE_DATA(to) + to_start
1306                    );
1307            }
1308            else if (from_kind == PyUnicode_4BYTE_KIND
1309                     && to_kind == PyUnicode_2BYTE_KIND)
1310            {
1311                _PyUnicode_CONVERT_BYTES(
1312                    Py_UCS4, Py_UCS2,
1313                    PyUnicode_4BYTE_DATA(from) + from_start,
1314                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315                    PyUnicode_2BYTE_DATA(to) + to_start
1316                    );
1317            }
1318            else {
1319                assert(0);
1320                return -1;
1321            }
1322        }
1323        else {
1324            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1325            Py_UCS4 ch;
1326            Py_ssize_t i;
1327
1328            for (i=0; i < how_many; i++) {
1329                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1330                if (ch > to_maxchar)
1331                    return -1;
1332                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333            }
1334        }
1335    }
1336    return 0;
1337}
1338
1339void
1340_PyUnicode_FastCopyCharacters(
1341    PyObject *to, Py_ssize_t to_start,
1342    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1343{
1344    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349                         PyObject *from, Py_ssize_t from_start,
1350                         Py_ssize_t how_many)
1351{
1352    int err;
1353
1354    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355        PyErr_BadInternalCall();
1356        return -1;
1357    }
1358
1359    if (PyUnicode_READY(from) == -1)
1360        return -1;
1361    if (PyUnicode_READY(to) == -1)
1362        return -1;
1363
1364    if (from_start < 0) {
1365        PyErr_SetString(PyExc_IndexError, "string index out of range");
1366        return -1;
1367    }
1368    if (to_start < 0) {
1369        PyErr_SetString(PyExc_IndexError, "string index out of range");
1370        return -1;
1371    }
1372    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374        PyErr_Format(PyExc_SystemError,
1375                     "Cannot write %zi characters at %zi "
1376                     "in a string of %zi characters",
1377                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1378        return -1;
1379    }
1380
1381    if (how_many == 0)
1382        return 0;
1383
1384    if (unicode_check_modifiable(to))
1385        return -1;
1386
1387    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388    if (err) {
1389        PyErr_Format(PyExc_SystemError,
1390                     "Cannot copy %s characters "
1391                     "into a string of %s characters",
1392                     unicode_kind_name(from),
1393                     unicode_kind_name(to));
1394        return -1;
1395    }
1396    return how_many;
1397}
1398
1399/* Find the maximum code point and count the number of surrogate pairs so a
1400   correct string length can be computed before converting a string to UCS4.
1401   This function counts single surrogates as a character and not as a pair.
1402
1403   Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1407{
1408    const wchar_t *iter;
1409    Py_UCS4 ch;
1410
1411    assert(num_surrogates != NULL && maxchar != NULL);
1412    *num_surrogates = 0;
1413    *maxchar = 0;
1414
1415    for (iter = begin; iter < end; ) {
1416#if SIZEOF_WCHAR_T == 2
1417        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418            && (iter+1) < end
1419            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420        {
1421            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422            ++(*num_surrogates);
1423            iter += 2;
1424        }
1425        else
1426#endif
1427        {
1428            ch = *iter;
1429            iter++;
1430        }
1431        if (ch > *maxchar) {
1432            *maxchar = ch;
1433            if (*maxchar > MAX_UNICODE) {
1434                PyErr_Format(PyExc_ValueError,
1435                             "character U+%x is not in range [U+0000; U+10ffff]",
1436                             ch);
1437                return -1;
1438            }
1439        }
1440    }
1441    return 0;
1442}
1443
1444int
1445_PyUnicode_Ready(PyObject *unicode)
1446{
1447    wchar_t *end;
1448    Py_UCS4 maxchar = 0;
1449    Py_ssize_t num_surrogates;
1450#if SIZEOF_WCHAR_T == 2
1451    Py_ssize_t length_wo_surrogates;
1452#endif
1453
1454    /* _PyUnicode_Ready() is only intended for old-style API usage where
1455       strings were created using _PyObject_New() and where no canonical
1456       representation (the str field) has been set yet aka strings
1457       which are not yet ready. */
1458    assert(_PyUnicode_CHECK(unicode));
1459    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1460    assert(_PyUnicode_WSTR(unicode) != NULL);
1461    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1462    assert(_PyUnicode_UTF8(unicode) == NULL);
1463    /* Actually, it should neither be interned nor be anything else: */
1464    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1465
1466    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1467    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1468                                &maxchar, &num_surrogates) == -1)
1469        return -1;
1470
1471    if (maxchar < 256) {
1472        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1473        if (!_PyUnicode_DATA_ANY(unicode)) {
1474            PyErr_NoMemory();
1475            return -1;
1476        }
1477        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1478                                _PyUnicode_WSTR(unicode), end,
1479                                PyUnicode_1BYTE_DATA(unicode));
1480        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1481        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1482        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1483        if (maxchar < 128) {
1484            _PyUnicode_STATE(unicode).ascii = 1;
1485            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1486            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1487        }
1488        else {
1489            _PyUnicode_STATE(unicode).ascii = 0;
1490            _PyUnicode_UTF8(unicode) = NULL;
1491            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1492        }
1493        PyObject_FREE(_PyUnicode_WSTR(unicode));
1494        _PyUnicode_WSTR(unicode) = NULL;
1495        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1496    }
1497    /* In this case we might have to convert down from 4-byte native
1498       wchar_t to 2-byte unicode. */
1499    else if (maxchar < 65536) {
1500        assert(num_surrogates == 0 &&
1501               "FindMaxCharAndNumSurrogatePairs() messed up");
1502
1503#if SIZEOF_WCHAR_T == 2
1504        /* We can share representations and are done. */
1505        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1506        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1507        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1508        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1509        _PyUnicode_UTF8(unicode) = NULL;
1510        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1511#else
1512        /* sizeof(wchar_t) == 4 */
1513        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1514            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1515        if (!_PyUnicode_DATA_ANY(unicode)) {
1516            PyErr_NoMemory();
1517            return -1;
1518        }
1519        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1520                                _PyUnicode_WSTR(unicode), end,
1521                                PyUnicode_2BYTE_DATA(unicode));
1522        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1523        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1524        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1525        _PyUnicode_UTF8(unicode) = NULL;
1526        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1527        PyObject_FREE(_PyUnicode_WSTR(unicode));
1528        _PyUnicode_WSTR(unicode) = NULL;
1529        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1530#endif
1531    }
1532    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1533    else {
1534#if SIZEOF_WCHAR_T == 2
1535        /* in case the native representation is 2-bytes, we need to allocate a
1536           new normalized 4-byte version. */
1537        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1538        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1539        if (!_PyUnicode_DATA_ANY(unicode)) {
1540            PyErr_NoMemory();
1541            return -1;
1542        }
1543        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1544        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1545        _PyUnicode_UTF8(unicode) = NULL;
1546        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1547        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1548        _PyUnicode_STATE(unicode).ready = 1;
1549        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1550        PyObject_FREE(_PyUnicode_WSTR(unicode));
1551        _PyUnicode_WSTR(unicode) = NULL;
1552        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1553#else
1554        assert(num_surrogates == 0);
1555
1556        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1557        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1558        _PyUnicode_UTF8(unicode) = NULL;
1559        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1560        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1561#endif
1562        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1563    }
1564    _PyUnicode_STATE(unicode).ready = 1;
1565    assert(_PyUnicode_CheckConsistency(unicode, 1));
1566    return 0;
1567}
1568
1569static void
1570unicode_dealloc(PyObject *unicode)
1571{
1572    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1573    case SSTATE_NOT_INTERNED:
1574        break;
1575
1576    case SSTATE_INTERNED_MORTAL:
1577        /* revive dead object temporarily for DelItem */
1578        Py_REFCNT(unicode) = 3;
1579        if (PyDict_DelItem(interned, unicode) != 0)
1580            Py_FatalError(
1581                "deletion of interned string failed");
1582        break;
1583
1584    case SSTATE_INTERNED_IMMORTAL:
1585        Py_FatalError("Immortal interned string died.");
1586
1587    default:
1588        Py_FatalError("Inconsistent interned string state.");
1589    }
1590
1591    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1592        PyObject_DEL(_PyUnicode_WSTR(unicode));
1593    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1594        PyObject_DEL(_PyUnicode_UTF8(unicode));
1595    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1596        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1597
1598    Py_TYPE(unicode)->tp_free(unicode);
1599}
1600
1601#ifdef Py_DEBUG
1602static int
1603unicode_is_singleton(PyObject *unicode)
1604{
1605    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1606    if (unicode == unicode_empty)
1607        return 1;
1608    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1609    {
1610        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1611        if (ch < 256 && unicode_latin1[ch] == unicode)
1612            return 1;
1613    }
1614    return 0;
1615}
1616#endif
1617
1618static int
1619unicode_modifiable(PyObject *unicode)
1620{
1621    assert(_PyUnicode_CHECK(unicode));
1622    if (Py_REFCNT(unicode) != 1)
1623        return 0;
1624    if (_PyUnicode_HASH(unicode) != -1)
1625        return 0;
1626    if (PyUnicode_CHECK_INTERNED(unicode))
1627        return 0;
1628    if (!PyUnicode_CheckExact(unicode))
1629        return 0;
1630#ifdef Py_DEBUG
1631    /* singleton refcount is greater than 1 */
1632    assert(!unicode_is_singleton(unicode));
1633#endif
1634    return 1;
1635}
1636
1637static int
1638unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1639{
1640    PyObject *unicode;
1641    Py_ssize_t old_length;
1642
1643    assert(p_unicode != NULL);
1644    unicode = *p_unicode;
1645
1646    assert(unicode != NULL);
1647    assert(PyUnicode_Check(unicode));
1648    assert(0 <= length);
1649
1650    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1651        old_length = PyUnicode_WSTR_LENGTH(unicode);
1652    else
1653        old_length = PyUnicode_GET_LENGTH(unicode);
1654    if (old_length == length)
1655        return 0;
1656
1657    if (length == 0) {
1658        _Py_INCREF_UNICODE_EMPTY();
1659        if (!unicode_empty)
1660            return -1;
1661        Py_DECREF(*p_unicode);
1662        *p_unicode = unicode_empty;
1663        return 0;
1664    }
1665
1666    if (!unicode_modifiable(unicode)) {
1667        PyObject *copy = resize_copy(unicode, length);
1668        if (copy == NULL)
1669            return -1;
1670        Py_DECREF(*p_unicode);
1671        *p_unicode = copy;
1672        return 0;
1673    }
1674
1675    if (PyUnicode_IS_COMPACT(unicode)) {
1676        PyObject *new_unicode = resize_compact(unicode, length);
1677        if (new_unicode == NULL)
1678            return -1;
1679        *p_unicode = new_unicode;
1680        return 0;
1681    }
1682    return resize_inplace(unicode, length);
1683}
1684
1685int
1686PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1687{
1688    PyObject *unicode;
1689    if (p_unicode == NULL) {
1690        PyErr_BadInternalCall();
1691        return -1;
1692    }
1693    unicode = *p_unicode;
1694    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1695    {
1696        PyErr_BadInternalCall();
1697        return -1;
1698    }
1699    return unicode_resize(p_unicode, length);
1700}
1701
/* Copy an ASCII or latin1 char* string into a Python Unicode string.

   WARNING: The function doesn't copy the terminating null character and
   doesn't check the maximum character (may write a latin1 character in an
   ASCII string). */
1707static void
1708unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1709                   const char *str, Py_ssize_t len)
1710{
1711    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1712    void *data = PyUnicode_DATA(unicode);
1713    const char *end = str + len;
1714
1715    switch (kind) {
1716    case PyUnicode_1BYTE_KIND: {
1717        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1718#ifdef Py_DEBUG
1719        if (PyUnicode_IS_ASCII(unicode)) {
1720            Py_UCS4 maxchar = ucs1lib_find_max_char(
1721                (const Py_UCS1*)str,
1722                (const Py_UCS1*)str + len);
1723            assert(maxchar < 128);
1724        }
1725#endif
1726        memcpy((char *) data + index, str, len);
1727        break;
1728    }
1729    case PyUnicode_2BYTE_KIND: {
1730        Py_UCS2 *start = (Py_UCS2 *)data + index;
1731        Py_UCS2 *ucs2 = start;
1732        assert(index <= PyUnicode_GET_LENGTH(unicode));
1733
1734        for (; str < end; ++ucs2, ++str)
1735            *ucs2 = (Py_UCS2)*str;
1736
1737        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1738        break;
1739    }
1740    default: {
1741        Py_UCS4 *start = (Py_UCS4 *)data + index;
1742        Py_UCS4 *ucs4 = start;
1743        assert(kind == PyUnicode_4BYTE_KIND);
1744        assert(index <= PyUnicode_GET_LENGTH(unicode));
1745
1746        for (; str < end; ++ucs4, ++str)
1747            *ucs4 = (Py_UCS4)*str;
1748
1749        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1750    }
1751    }
1752}
1753
1754static PyObject*
1755get_latin1_char(unsigned char ch)
1756{
1757    PyObject *unicode = unicode_latin1[ch];
1758    if (!unicode) {
1759        unicode = PyUnicode_New(1, ch);
1760        if (!unicode)
1761            return NULL;
1762        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1763        assert(_PyUnicode_CheckConsistency(unicode, 1));
1764        unicode_latin1[ch] = unicode;
1765    }
1766    Py_INCREF(unicode);
1767    return unicode;
1768}
1769
1770static PyObject*
1771unicode_char(Py_UCS4 ch)
1772{
1773    PyObject *unicode;
1774
1775    assert(ch <= MAX_UNICODE);
1776
1777    if (ch < 256)
1778        return get_latin1_char(ch);
1779
1780    unicode = PyUnicode_New(1, ch);
1781    if (unicode == NULL)
1782        return NULL;
1783    switch (PyUnicode_KIND(unicode)) {
1784    case PyUnicode_1BYTE_KIND:
1785        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1786        break;
1787    case PyUnicode_2BYTE_KIND:
1788        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1789        break;
1790    default:
1791        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1792        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1793    }
1794    assert(_PyUnicode_CheckConsistency(unicode, 1));
1795    return unicode;
1796}
1797
1798PyObject *
1799PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1800{
1801    PyObject *unicode;
1802    Py_UCS4 maxchar = 0;
1803    Py_ssize_t num_surrogates;
1804
1805    if (u == NULL)
1806        return (PyObject*)_PyUnicode_New(size);
1807
1808    /* If the Unicode data is known at construction time, we can apply
1809       some optimizations which share commonly used objects. */
1810
1811    /* Optimization for empty strings */
1812    if (size == 0)
1813        _Py_RETURN_UNICODE_EMPTY();
1814
1815    /* Single character Unicode objects in the Latin-1 range are
1816       shared when using this constructor */
1817    if (size == 1 && (Py_UCS4)*u < 256)
1818        return get_latin1_char((unsigned char)*u);
1819
1820    /* If not empty and not single character, copy the Unicode data
1821       into the new object */
1822    if (find_maxchar_surrogates(u, u + size,
1823                                &maxchar, &num_surrogates) == -1)
1824        return NULL;
1825
1826    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1827    if (!unicode)
1828        return NULL;
1829
1830    switch (PyUnicode_KIND(unicode)) {
1831    case PyUnicode_1BYTE_KIND:
1832        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1833                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1834        break;
1835    case PyUnicode_2BYTE_KIND:
1836#if Py_UNICODE_SIZE == 2
1837        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1838#else
1839        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1840                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1841#endif
1842        break;
1843    case PyUnicode_4BYTE_KIND:
1844#if SIZEOF_WCHAR_T == 2
1845        /* This is the only case which has to process surrogates, thus
1846           a simple copy loop is not enough and we need a function. */
1847        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1848#else
1849        assert(num_surrogates == 0);
1850        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1851#endif
1852        break;
1853    default:
1854        assert(0 && "Impossible state");
1855    }
1856
1857    return unicode_result(unicode);
1858}
1859
1860PyObject *
1861PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1862{
1863    if (size < 0) {
1864        PyErr_SetString(PyExc_SystemError,
1865                        "Negative size passed to PyUnicode_FromStringAndSize");
1866        return NULL;
1867    }
1868    if (u != NULL)
1869        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1870    else
1871        return (PyObject *)_PyUnicode_New(size);
1872}
1873
1874PyObject *
1875PyUnicode_FromString(const char *u)
1876{
1877    size_t size = strlen(u);
1878    if (size > PY_SSIZE_T_MAX) {
1879        PyErr_SetString(PyExc_OverflowError, "input too long");
1880        return NULL;
1881    }
1882    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1883}
1884
1885PyObject *
1886_PyUnicode_FromId(_Py_Identifier *id)
1887{
1888    if (!id->object) {
1889        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1890                                                  strlen(id->string),
1891                                                  NULL, NULL);
1892        if (!id->object)
1893            return NULL;
1894        PyUnicode_InternInPlace(&id->object);
1895        assert(!id->next);
1896        id->next = static_strings;
1897        static_strings = id;
1898    }
1899    return id->object;
1900}
1901
1902void
1903_PyUnicode_ClearStaticStrings()
1904{
1905    _Py_Identifier *tmp, *s = static_strings;
1906    while (s) {
1907        Py_CLEAR(s->object);
1908        tmp = s->next;
1909        s->next = NULL;
1910        s = tmp;
1911    }
1912    static_strings = NULL;
1913}
1914
1915/* Internal function, doesn't check maximum character */
1916
1917PyObject*
1918_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1919{
1920    const unsigned char *s = (const unsigned char *)buffer;
1921    PyObject *unicode;
1922    if (size == 1) {
1923#ifdef Py_DEBUG
1924        assert((unsigned char)s[0] < 128);
1925#endif
1926        return get_latin1_char(s[0]);
1927    }
1928    unicode = PyUnicode_New(size, 127);
1929    if (!unicode)
1930        return NULL;
1931    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1932    assert(_PyUnicode_CheckConsistency(unicode, 1));
1933    return unicode;
1934}
1935
1936static Py_UCS4
1937kind_maxchar_limit(unsigned int kind)
1938{
1939    switch (kind) {
1940    case PyUnicode_1BYTE_KIND:
1941        return 0x80;
1942    case PyUnicode_2BYTE_KIND:
1943        return 0x100;
1944    case PyUnicode_4BYTE_KIND:
1945        return 0x10000;
1946    default:
1947        assert(0 && "invalid kind");
1948        return MAX_UNICODE;
1949    }
1950}
1951
1952Py_LOCAL_INLINE(Py_UCS4)
1953align_maxchar(Py_UCS4 maxchar)
1954{
1955    if (maxchar <= 127)
1956        return 127;
1957    else if (maxchar <= 255)
1958        return 255;
1959    else if (maxchar <= 65535)
1960        return 65535;
1961    else
1962        return MAX_UNICODE;
1963}
1964
1965static PyObject*
1966_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1967{
1968    PyObject *res;
1969    unsigned char max_char;
1970
1971    if (size == 0)
1972        _Py_RETURN_UNICODE_EMPTY();
1973    assert(size > 0);
1974    if (size == 1)
1975        return get_latin1_char(u[0]);
1976
1977    max_char = ucs1lib_find_max_char(u, u + size);
1978    res = PyUnicode_New(size, max_char);
1979    if (!res)
1980        return NULL;
1981    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1982    assert(_PyUnicode_CheckConsistency(res, 1));
1983    return res;
1984}
1985
1986static PyObject*
1987_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1988{
1989    PyObject *res;
1990    Py_UCS2 max_char;
1991
1992    if (size == 0)
1993        _Py_RETURN_UNICODE_EMPTY();
1994    assert(size > 0);
1995    if (size == 1)
1996        return unicode_char(u[0]);
1997
1998    max_char = ucs2lib_find_max_char(u, u + size);
1999    res = PyUnicode_New(size, max_char);
2000    if (!res)
2001        return NULL;
2002    if (max_char >= 256)
2003        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2004    else {
2005        _PyUnicode_CONVERT_BYTES(
2006            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2007    }
2008    assert(_PyUnicode_CheckConsistency(res, 1));
2009    return res;
2010}
2011
2012static PyObject*
2013_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2014{
2015    PyObject *res;
2016    Py_UCS4 max_char;
2017
2018    if (size == 0)
2019        _Py_RETURN_UNICODE_EMPTY();
2020    assert(size > 0);
2021    if (size == 1)
2022        return unicode_char(u[0]);
2023
2024    max_char = ucs4lib_find_max_char(u, u + size);
2025    res = PyUnicode_New(size, max_char);
2026    if (!res)
2027        return NULL;
2028    if (max_char < 256)
2029        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2030                                 PyUnicode_1BYTE_DATA(res));
2031    else if (max_char < 0x10000)
2032        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2033                                 PyUnicode_2BYTE_DATA(res));
2034    else
2035        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2036    assert(_PyUnicode_CheckConsistency(res, 1));
2037    return res;
2038}
2039
2040PyObject*
2041PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2042{
2043    if (size < 0) {
2044        PyErr_SetString(PyExc_ValueError, "size must be positive");
2045        return NULL;
2046    }
2047    switch (kind) {
2048    case PyUnicode_1BYTE_KIND:
2049        return _PyUnicode_FromUCS1(buffer, size);
2050    case PyUnicode_2BYTE_KIND:
2051        return _PyUnicode_FromUCS2(buffer, size);
2052    case PyUnicode_4BYTE_KIND:
2053        return _PyUnicode_FromUCS4(buffer, size);
2054    default:
2055        PyErr_SetString(PyExc_SystemError, "invalid kind");
2056        return NULL;
2057    }
2058}
2059
2060Py_UCS4
2061_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2062{
2063    enum PyUnicode_Kind kind;
2064    void *startptr, *endptr;
2065
2066    assert(PyUnicode_IS_READY(unicode));
2067    assert(0 <= start);
2068    assert(end <= PyUnicode_GET_LENGTH(unicode));
2069    assert(start <= end);
2070
2071    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2072        return PyUnicode_MAX_CHAR_VALUE(unicode);
2073
2074    if (start == end)
2075        return 127;
2076
2077    if (PyUnicode_IS_ASCII(unicode))
2078        return 127;
2079
2080    kind = PyUnicode_KIND(unicode);
2081    startptr = PyUnicode_DATA(unicode);
2082    endptr = (char *)startptr + end * kind;
2083    startptr = (char *)startptr + start * kind;
2084    switch(kind) {
2085    case PyUnicode_1BYTE_KIND:
2086        return ucs1lib_find_max_char(startptr, endptr);
2087    case PyUnicode_2BYTE_KIND:
2088        return ucs2lib_find_max_char(startptr, endptr);
2089    case PyUnicode_4BYTE_KIND:
2090        return ucs4lib_find_max_char(startptr, endptr);
2091    default:
2092        assert(0);
2093        return 0;
2094    }
2095}
2096
2097/* Ensure that a string uses the most efficient storage, if it is not the
2098   case: create a new string with of the right kind. Write NULL into *p_unicode
2099   on error. */
2100static void
2101unicode_adjust_maxchar(PyObject **p_unicode)
2102{
2103    PyObject *unicode, *copy;
2104    Py_UCS4 max_char;
2105    Py_ssize_t len;
2106    unsigned int kind;
2107
2108    assert(p_unicode != NULL);
2109    unicode = *p_unicode;
2110    assert(PyUnicode_IS_READY(unicode));
2111    if (PyUnicode_IS_ASCII(unicode))
2112        return;
2113
2114    len = PyUnicode_GET_LENGTH(unicode);
2115    kind = PyUnicode_KIND(unicode);
2116    if (kind == PyUnicode_1BYTE_KIND) {
2117        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2118        max_char = ucs1lib_find_max_char(u, u + len);
2119        if (max_char >= 128)
2120            return;
2121    }
2122    else if (kind == PyUnicode_2BYTE_KIND) {
2123        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2124        max_char = ucs2lib_find_max_char(u, u + len);
2125        if (max_char >= 256)
2126            return;
2127    }
2128    else {
2129        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2130        assert(kind == PyUnicode_4BYTE_KIND);
2131        max_char = ucs4lib_find_max_char(u, u + len);
2132        if (max_char >= 0x10000)
2133            return;
2134    }
2135    copy = PyUnicode_New(len, max_char);
2136    if (copy != NULL)
2137        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2138    Py_DECREF(unicode);
2139    *p_unicode = copy;
2140}
2141
2142PyObject*
2143_PyUnicode_Copy(PyObject *unicode)
2144{
2145    Py_ssize_t length;
2146    PyObject *copy;
2147
2148    if (!PyUnicode_Check(unicode)) {
2149        PyErr_BadInternalCall();
2150        return NULL;
2151    }
2152    if (PyUnicode_READY(unicode) == -1)
2153        return NULL;
2154
2155    length = PyUnicode_GET_LENGTH(unicode);
2156    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2157    if (!copy)
2158        return NULL;
2159    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2160
2161    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2162              length * PyUnicode_KIND(unicode));
2163    assert(_PyUnicode_CheckConsistency(copy, 1));
2164    return copy;
2165}
2166
2167
2168/* Widen Unicode objects to larger buffers. Don't write terminating null
2169   character. Return NULL on error. */
2170
2171void*
2172_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2173{
2174    Py_ssize_t len;
2175    void *result;
2176    unsigned int skind;
2177
2178    if (PyUnicode_READY(s) == -1)
2179        return NULL;
2180
2181    len = PyUnicode_GET_LENGTH(s);
2182    skind = PyUnicode_KIND(s);
2183    if (skind >= kind) {
2184        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2185        return NULL;
2186    }
2187    switch (kind) {
2188    case PyUnicode_2BYTE_KIND:
2189        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2190        if (!result)
2191            return PyErr_NoMemory();
2192        assert(skind == PyUnicode_1BYTE_KIND);
2193        _PyUnicode_CONVERT_BYTES(
2194            Py_UCS1, Py_UCS2,
2195            PyUnicode_1BYTE_DATA(s),
2196            PyUnicode_1BYTE_DATA(s) + len,
2197            result);
2198        return result;
2199    case PyUnicode_4BYTE_KIND:
2200        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2201        if (!result)
2202            return PyErr_NoMemory();
2203        if (skind == PyUnicode_2BYTE_KIND) {
2204            _PyUnicode_CONVERT_BYTES(
2205                Py_UCS2, Py_UCS4,
2206                PyUnicode_2BYTE_DATA(s),
2207                PyUnicode_2BYTE_DATA(s) + len,
2208                result);
2209        }
2210        else {
2211            assert(skind == PyUnicode_1BYTE_KIND);
2212            _PyUnicode_CONVERT_BYTES(
2213                Py_UCS1, Py_UCS4,
2214                PyUnicode_1BYTE_DATA(s),
2215                PyUnicode_1BYTE_DATA(s) + len,
2216                result);
2217        }
2218        return result;
2219    default:
2220        break;
2221    }
2222    PyErr_SetString(PyExc_SystemError, "invalid kind");
2223    return NULL;
2224}
2225
2226static Py_UCS4*
2227as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2228        int copy_null)
2229{
2230    int kind;
2231    void *data;
2232    Py_ssize_t len, targetlen;
2233    if (PyUnicode_READY(string) == -1)
2234        return NULL;
2235    kind = PyUnicode_KIND(string);
2236    data = PyUnicode_DATA(string);
2237    len = PyUnicode_GET_LENGTH(string);
2238    targetlen = len;
2239    if (copy_null)
2240        targetlen++;
2241    if (!target) {
2242        if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UCS4) < targetlen) {
2243            PyErr_NoMemory();
2244            return NULL;
2245        }
2246        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2247        if (!target) {
2248            PyErr_NoMemory();
2249            return NULL;
2250        }
2251    }
2252    else {
2253        if (targetsize < targetlen) {
2254            PyErr_Format(PyExc_SystemError,
2255                         "string is longer than the buffer");
2256            if (copy_null && 0 < targetsize)
2257                target[0] = 0;
2258            return NULL;
2259        }
2260    }
2261    if (kind == PyUnicode_1BYTE_KIND) {
2262        Py_UCS1 *start = (Py_UCS1 *) data;
2263        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2264    }
2265    else if (kind == PyUnicode_2BYTE_KIND) {
2266        Py_UCS2 *start = (Py_UCS2 *) data;
2267        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268    }
2269    else {
2270        assert(kind == PyUnicode_4BYTE_KIND);
2271        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2272    }
2273    if (copy_null)
2274        target[len] = 0;
2275    return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280                 int copy_null)
2281{
2282    if (target == NULL || targetsize < 0) {
2283        PyErr_BadInternalCall();
2284        return NULL;
2285    }
2286    return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292    return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
2296
2297PyObject *
2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2299{
2300    if (w == NULL) {
2301        if (size == 0)
2302            _Py_RETURN_UNICODE_EMPTY();
2303        PyErr_BadInternalCall();
2304        return NULL;
2305    }
2306
2307    if (size == -1) {
2308        size = wcslen(w);
2309    }
2310
2311    return PyUnicode_FromUnicode(w, size);
2312}
2313
2314#endif /* HAVE_WCHAR_H */
2315
2316static void
2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2318        char c)
2319{
2320    *fmt++ = '%';
2321    if (longflag)
2322        *fmt++ = 'l';
2323    else if (longlongflag) {
2324        /* longlongflag should only ever be nonzero on machines with
2325           HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327        char *f = PY_FORMAT_LONG_LONG;
2328        while (*f)
2329            *fmt++ = *f++;
2330#else
2331        /* we shouldn't ever get here */
2332        assert(0);
2333        *fmt++ = 'l';
2334#endif
2335    }
2336    else if (size_tflag) {
2337        char *f = PY_FORMAT_SIZE_T;
2338        while (*f)
2339            *fmt++ = *f++;
2340    }
2341    *fmt++ = c;
2342    *fmt = '\0';
2343}
2344
2345/* maximum number of characters required for output of %lld or %p.
2346   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
2347   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2348#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2349
2350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352                             Py_ssize_t width, Py_ssize_t precision)
2353{
2354    Py_ssize_t length, fill, arglen;
2355    Py_UCS4 maxchar;
2356
2357    if (PyUnicode_READY(str) == -1)
2358        return -1;
2359
2360    length = PyUnicode_GET_LENGTH(str);
2361    if ((precision == -1 || precision >= length)
2362        && width <= length)
2363        return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365    if (precision != -1)
2366        length = Py_MIN(precision, length);
2367
2368    arglen = Py_MAX(length, width);
2369    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371    else
2372        maxchar = writer->maxchar;
2373
2374    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375        return -1;
2376
2377    if (width > length) {
2378        fill = width - length;
2379        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380            return -1;
2381        writer->pos += fill;
2382    }
2383
2384    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385                                  str, 0, length);
2386    writer->pos += length;
2387    return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392                              Py_ssize_t width, Py_ssize_t precision)
2393{
2394    /* UTF-8 */
2395    Py_ssize_t length;
2396    PyObject *unicode;
2397    int res;
2398
2399    length = strlen(str);
2400    if (precision != -1)
2401        length = Py_MIN(length, precision);
2402    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403    if (unicode == NULL)
2404        return -1;
2405
2406    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407    Py_DECREF(unicode);
2408    return res;
2409}
2410
/* Process one '%' directive of a PyUnicode_FromFormatV() format string.

   f points at the '%' that starts the directive.  The optional '0' flag,
   width, precision and length modifier (l, ll, z) are parsed, the
   matching argument(s) are fetched from *vargs, and the formatted text
   is appended to writer.

   Return a pointer to the first character after the directive, or NULL
   with an exception set on error.  For an unknown conversion the rest of
   the format string (starting at the '%') is copied verbatim and a
   pointer to its terminating null is returned. */
static const char*
unicode_fromformat_arg(_PyUnicodeWriter *writer,
                       const char *f, va_list *vargs)
{
    const char *p;
    Py_ssize_t len;
    int zeropad;
    Py_ssize_t width;
    Py_ssize_t precision;
    int longflag;
    int longlongflag;
    int size_tflag;
    Py_ssize_t fill;

    /* p remembers the position of the '%'; f moves past it. */
    p = f;
    f++;
    zeropad = 0;
    if (*f == '0') {
        zeropad = 1;
        f++;
    }

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    width = -1;
    if (Py_ISDIGIT((unsigned)*f)) {
        width = *f - '0';
        f++;
        while (Py_ISDIGIT((unsigned)*f)) {
            /* Check before multiplying so that width cannot overflow. */
            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return NULL;
            }
            width = (width * 10) + (*f - '0');
            f++;
        }
    }
    precision = -1;
    if (*f == '.') {
        f++;
        if (Py_ISDIGIT((unsigned)*f)) {
            precision = (*f - '0');
            f++;
            while (Py_ISDIGIT((unsigned)*f)) {
                /* Check before multiplying so that precision cannot
                   overflow. */
                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return NULL;
                }
                precision = (precision * 10) + (*f - '0');
                f++;
            }
        }
        if (*f == '%') {
            /* "%.3%s" => f points to "3" */
            f--;
        }
    }
    if (*f == '\0') {
        /* bogus format "%.123" => go backward, f points to "3" */
        f--;
    }

    /* Handle %ld, %lu, %li, %lld, %llu and %lli. */
    longflag = 0;
    longlongflag = 0;
    size_tflag = 0;
    if (*f == 'l') {
        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
            longflag = 1;
            ++f;
        }
#ifdef HAVE_LONG_LONG
        else if (f[1] == 'l' &&
                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
            longlongflag = 1;
            f += 2;
        }
#endif
    }
    /* handle the size_t flag. */
    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
        size_tflag = 1;
        ++f;
    }

    /* The conversion character is the last character of the format
       string: turn off the writer's overallocation. */
    if (f[1] == '\0')
        writer->overallocate = 0;

    switch (*f) {
    case 'c':
    {
        /* A single character given as its code point. */
        int ordinal = va_arg(*vargs, int);
        if (ordinal < 0 || ordinal > MAX_UNICODE) {
            PyErr_SetString(PyExc_OverflowError,
                            "character argument not in range(0x110000)");
            return NULL;
        }
        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
            return NULL;
        break;
    }

    case 'i':
    case 'd':
    case 'u':
    case 'x':
    {
        /* used by sprintf */
        char fmt[10]; /* should be enough for "%0lld\0" */
        char buffer[MAX_LONG_LONG_CHARS];
        Py_ssize_t arglen;

        /* The number is first formatted with sprintf() without width or
           precision; padding is applied afterwards. */
        if (*f == 'u') {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, size_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, unsigned int));
        }
        else if (*f == 'x') {
            /* %x always reads a plain int; no length modifier is used. */
            makefmt(fmt, 0, 0, 0, 'x');
            len = sprintf(buffer, fmt, va_arg(*vargs, int));
        }
        else {
            makefmt(fmt, longflag, longlongflag, size_tflag, *f);

            if (longflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, long));
#ifdef HAVE_LONG_LONG
            else if (longlongflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, PY_LONG_LONG));
#endif
            else if (size_tflag)
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, Py_ssize_t));
            else
                len = sprintf(buffer, fmt,
                        va_arg(*vargs, int));
        }
        assert(len >= 0);

        /* From here on precision is at least the number of characters
           produced by sprintf(). */
        if (precision < len)
            precision = len;

        arglen = Py_MAX(precision, width);
        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
            return NULL;

        /* Pad up to the field width with '0' or ' '. */
        if (width > precision) {
            Py_UCS4 fillchar;
            fill = width - precision;
            fillchar = zeropad?'0':' ';
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
                return NULL;
            writer->pos += fill;
        }
        /* Pad up to the precision with '0'. */
        if (precision > len) {
            fill = precision - len;
            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
                return NULL;
            writer->pos += fill;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
            return NULL;
        break;
    }

    case 'p':
    {
        char number[MAX_LONG_LONG_CHARS];

        len = sprintf(number, "%p", va_arg(*vargs, void*));
        assert(len >= 0);

        /* %p is ill-defined:  ensure leading 0x. */
        if (number[1] == 'X')
            number[1] = 'x';
        else if (number[1] != 'x') {
            /* Shift the text (including its null) right by two to make
               room for the "0x" prefix. */
            memmove(number + 2, number,
                    strlen(number) + 1);
            number[0] = '0';
            number[1] = 'x';
            len += 2;
        }

        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
            return NULL;
        break;
    }

    case 's':
    {
        /* UTF-8 */
        const char *s = va_arg(*vargs, const char*);
        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
            return NULL;
        break;
    }

    case 'U':
    {
        /* A Unicode object, written as is. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        assert(obj && _PyUnicode_CHECK(obj));

        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
            return NULL;
        break;
    }

    case 'V':
    {
        /* Two arguments: a Unicode object (may be NULL) and a C string
           that is used when the object is NULL. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        const char *str = va_arg(*vargs, const char *);
        if (obj) {
            assert(_PyUnicode_CHECK(obj));
            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
                return NULL;
        }
        else {
            assert(str != NULL);
            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
                return NULL;
        }
        break;
    }

    case 'S':
    {
        /* The result of PyObject_Str() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *str;
        assert(obj);
        str = PyObject_Str(obj);
        if (!str)
            return NULL;
        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
            Py_DECREF(str);
            return NULL;
        }
        Py_DECREF(str);
        break;
    }

    case 'R':
    {
        /* The result of PyObject_Repr() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *repr;
        assert(obj);
        repr = PyObject_Repr(obj);
        if (!repr)
            return NULL;
        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
            Py_DECREF(repr);
            return NULL;
        }
        Py_DECREF(repr);
        break;
    }

    case 'A':
    {
        /* The result of PyObject_ASCII() on the argument. */
        PyObject *obj = va_arg(*vargs, PyObject *);
        PyObject *ascii;
        assert(obj);
        ascii = PyObject_ASCII(obj);
        if (!ascii)
            return NULL;
        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
            Py_DECREF(ascii);
            return NULL;
        }
        Py_DECREF(ascii);
        break;
    }

    case '%':
        /* A literal percent sign; no argument is consumed. */
        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
            return NULL;
        break;

    default:
        /* if we stumble upon an unknown formatting code, copy the rest
           of the format string to the output string. (we cannot just
           skip the code, since there's no way to know what's in the
           argument list) */
        len = strlen(p);
        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
            return NULL;
        f = p+len;
        return f;
    }

    /* Step over the conversion character. */
    f++;
    return f;
}
2720
2721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
2724    va_list vargs2;
2725    const char *f;
2726    _PyUnicodeWriter writer;
2727
2728    _PyUnicodeWriter_Init(&writer);
2729    writer.min_length = strlen(format) + 100;
2730    writer.overallocate = 1;
2731
2732    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733       Copy it to be able to pass a reference to a subfunction. */
2734    Py_VA_COPY(vargs2, vargs);
2735
2736    for (f = format; *f; ) {
2737        if (*f == '%') {
2738            f = unicode_fromformat_arg(&writer, f, &vargs2);
2739            if (f == NULL)
2740                goto fail;
2741        }
2742        else {
2743            const char *p;
2744            Py_ssize_t len;
2745
2746            p = f;
2747            do
2748            {
2749                if ((unsigned char)*p > 127) {
2750                    PyErr_Format(PyExc_ValueError,
2751                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752                        "string, got a non-ASCII byte: 0x%02x",
2753                        (unsigned char)*p);
2754                    return NULL;
2755                }
2756                p++;
2757            }
2758            while (*p != '\0' && *p != '%');
2759            len = p - f;
2760
2761            if (*p == '\0')
2762                writer.overallocate = 0;
2763
2764            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2765                goto fail;
2766
2767            f = p;
2768        }
2769    }
2770    return _PyUnicodeWriter_Finish(&writer);
2771
2772  fail:
2773    _PyUnicodeWriter_Dealloc(&writer);
2774    return NULL;
2775}
2776
2777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
2780    PyObject* ret;
2781    va_list vargs;
2782
2783#ifdef HAVE_STDARG_PROTOTYPES
2784    va_start(vargs, format);
2785#else
2786    va_start(vargs);
2787#endif
2788    ret = PyUnicode_FromFormatV(format, vargs);
2789    va_end(vargs);
2790    return ret;
2791}
2792
2793#ifdef HAVE_WCHAR_H
2794
2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796   convert a Unicode object to a wide character string.
2797
2798   - If w is NULL: return the number of wide characters (including the null
2799     character) required to convert the unicode object. Ignore size argument.
2800
2801   - Otherwise: return the number of wide characters (excluding the null
2802     character) written into w. Write at most size wide characters (including
2803     the null character). */
2804static Py_ssize_t
2805unicode_aswidechar(PyObject *unicode,
2806                   wchar_t *w,
2807                   Py_ssize_t size)
2808{
2809    Py_ssize_t res;
2810    const wchar_t *wstr;
2811
2812    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2813    if (wstr == NULL)
2814        return -1;
2815
2816    if (w != NULL) {
2817        if (size > res)
2818            size = res + 1;
2819        else
2820            res = size;
2821        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2822        return res;
2823    }
2824    else
2825        return res + 1;
2826}
2827
2828Py_ssize_t
2829PyUnicode_AsWideChar(PyObject *unicode,
2830                     wchar_t *w,
2831                     Py_ssize_t size)
2832{
2833    if (unicode == NULL) {
2834        PyErr_BadInternalCall();
2835        return -1;
2836    }
2837    return unicode_aswidechar(unicode, w, size);
2838}
2839
2840wchar_t*
2841PyUnicode_AsWideCharString(PyObject *unicode,
2842                           Py_ssize_t *size)
2843{
2844    wchar_t* buffer;
2845    Py_ssize_t buflen;
2846
2847    if (unicode == NULL) {
2848        PyErr_BadInternalCall();
2849        return NULL;
2850    }
2851
2852    buflen = unicode_aswidechar(unicode, NULL, 0);
2853    if (buflen == -1)
2854        return NULL;
2855    if (PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(wchar_t) < buflen) {
2856        PyErr_NoMemory();
2857        return NULL;
2858    }
2859
2860    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2861    if (buffer == NULL) {
2862        PyErr_NoMemory();
2863        return NULL;
2864    }
2865    buflen = unicode_aswidechar(unicode, buffer, buflen);
2866    if (buflen == -1) {
2867        PyMem_FREE(buffer);
2868        return NULL;
2869    }
2870    if (size != NULL)
2871        *size = buflen;
2872    return buffer;
2873}
2874
2875#endif /* HAVE_WCHAR_H */
2876
2877PyObject *
2878PyUnicode_FromOrdinal(int ordinal)
2879{
2880    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2881        PyErr_SetString(PyExc_ValueError,
2882                        "chr() arg not in range(0x110000)");
2883        return NULL;
2884    }
2885
2886    return unicode_char((Py_UCS4)ordinal);
2887}
2888
2889PyObject *
2890PyUnicode_FromObject(PyObject *obj)
2891{
2892    /* XXX Perhaps we should make this API an alias of
2893       PyObject_Str() instead ?! */
2894    if (PyUnicode_CheckExact(obj)) {
2895        if (PyUnicode_READY(obj) == -1)
2896            return NULL;
2897        Py_INCREF(obj);
2898        return obj;
2899    }
2900    if (PyUnicode_Check(obj)) {
2901        /* For a Unicode subtype that's not a Unicode object,
2902           return a true Unicode object with the same data. */
2903        return _PyUnicode_Copy(obj);
2904    }
2905    PyErr_Format(PyExc_TypeError,
2906                 "Can't convert '%.100s' object to str implicitly",
2907                 Py_TYPE(obj)->tp_name);
2908    return NULL;
2909}
2910
2911PyObject *
2912PyUnicode_FromEncodedObject(PyObject *obj,
2913                            const char *encoding,
2914                            const char *errors)
2915{
2916    Py_buffer buffer;
2917    PyObject *v;
2918
2919    if (obj == NULL) {
2920        PyErr_BadInternalCall();
2921        return NULL;
2922    }
2923
2924    /* Decoding bytes objects is the most common case and should be fast */
2925    if (PyBytes_Check(obj)) {
2926        if (PyBytes_GET_SIZE(obj) == 0)
2927            _Py_RETURN_UNICODE_EMPTY();
2928        v = PyUnicode_Decode(
2929                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2930                encoding, errors);
2931        return v;
2932    }
2933
2934    if (PyUnicode_Check(obj)) {
2935        PyErr_SetString(PyExc_TypeError,
2936                        "decoding str is not supported");
2937        return NULL;
2938    }
2939
2940    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2941    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2942        PyErr_Format(PyExc_TypeError,
2943                     "coercing to str: need bytes, bytearray "
2944                     "or buffer-like object, %.80s found",
2945                     Py_TYPE(obj)->tp_name);
2946        return NULL;
2947    }
2948
2949    if (buffer.len == 0) {
2950        PyBuffer_Release(&buffer);
2951        _Py_RETURN_UNICODE_EMPTY();
2952    }
2953
2954    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2955    PyBuffer_Release(&buffer);
2956    return v;
2957}
2958
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   lower receives the null-terminated result and must provide room for
   lower_len bytes.  Return 1 on success, 0 on error (the normalized name
   plus its terminating null byte does not fit in lower_len bytes). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* A zero-length buffer cannot even hold the terminating null byte.
       Bail out before computing &lower[lower_len - 1]: the index would
       wrap around and the bounds check in the loop would never trigger. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot; it is reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
2998
2999PyObject *
3000PyUnicode_Decode(const char *s,
3001                 Py_ssize_t size,
3002                 const char *encoding,
3003                 const char *errors)
3004{
3005    PyObject *buffer = NULL, *unicode;
3006    Py_buffer info;
3007    char lower[11];  /* Enough for any encoding shortcut */
3008
3009    /* Shortcuts for common default encodings */
3010    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3011        if ((strcmp(lower, "utf-8") == 0) ||
3012            (strcmp(lower, "utf8") == 0))
3013            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3014        else if ((strcmp(lower, "latin-1") == 0) ||
3015                 (strcmp(lower, "latin1") == 0) ||
3016                 (strcmp(lower, "iso-8859-1") == 0) ||
3017                 (strcmp(lower, "iso8859-1") == 0))
3018            return PyUnicode_DecodeLatin1(s, size, errors);
3019#ifdef HAVE_MBCS
3020        else if (strcmp(lower, "mbcs") == 0)
3021            return PyUnicode_DecodeMBCS(s, size, errors);
3022#endif
3023        else if (strcmp(lower, "ascii") == 0)
3024            return PyUnicode_DecodeASCII(s, size, errors);
3025        else if (strcmp(lower, "utf-16") == 0)
3026            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3027        else if (strcmp(lower, "utf-32") == 0)
3028            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3029    }
3030
3031    /* Decode via the codec registry */
3032    buffer = NULL;
3033    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3034        goto onError;
3035    buffer = PyMemoryView_FromBuffer(&info);
3036    if (buffer == NULL)
3037        goto onError;
3038    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3039    if (unicode == NULL)
3040        goto onError;
3041    if (!PyUnicode_Check(unicode)) {
3042        PyErr_Format(PyExc_TypeError,
3043                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3044                     "use codecs.decode() to decode to arbitrary types",
3045                     encoding,
3046                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3047        Py_DECREF(unicode);
3048        goto onError;
3049    }
3050    Py_DECREF(buffer);
3051    return unicode_result(unicode);
3052
3053  onError:
3054    Py_XDECREF(buffer);
3055    return NULL;
3056}
3057
3058PyObject *
3059PyUnicode_AsDecodedObject(PyObject *unicode,
3060                          const char *encoding,
3061                          const char *errors)
3062{
3063    PyObject *v;
3064
3065    if (!PyUnicode_Check(unicode)) {
3066        PyErr_BadArgument();
3067        goto onError;
3068    }
3069
3070    if (encoding == NULL)
3071        encoding = PyUnicode_GetDefaultEncoding();
3072
3073    /* Decode via the codec registry */
3074    v = PyCodec_Decode(unicode, encoding, errors);
3075    if (v == NULL)
3076        goto onError;
3077    return unicode_result(v);
3078
3079  onError:
3080    return NULL;
3081}
3082
3083PyObject *
3084PyUnicode_AsDecodedUnicode(PyObject *unicode,
3085                           const char *encoding,
3086                           const char *errors)
3087{
3088    PyObject *v;
3089
3090    if (!PyUnicode_Check(unicode)) {
3091        PyErr_BadArgument();
3092        goto onError;
3093    }
3094
3095    if (encoding == NULL)
3096        encoding = PyUnicode_GetDefaultEncoding();
3097
3098    /* Decode via the codec registry */
3099    v = PyCodec_Decode(unicode, encoding, errors);
3100    if (v == NULL)
3101        goto onError;
3102    if (!PyUnicode_Check(v)) {
3103        PyErr_Format(PyExc_TypeError,
3104                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3105                     "use codecs.decode() to decode to arbitrary types",
3106                     encoding,
3107                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3108        Py_DECREF(v);
3109        goto onError;
3110    }
3111    return unicode_result(v);
3112
3113  onError:
3114    return NULL;
3115}
3116
3117PyObject *
3118PyUnicode_Encode(const Py_UNICODE *s,
3119                 Py_ssize_t size,
3120                 const char *encoding,
3121                 const char *errors)
3122{
3123    PyObject *v, *unicode;
3124
3125    unicode = PyUnicode_FromUnicode(s, size);
3126    if (unicode == NULL)
3127        return NULL;
3128    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3129    Py_DECREF(unicode);
3130    return v;
3131}
3132
3133PyObject *
3134PyUnicode_AsEncodedObject(PyObject *unicode,
3135                          const char *encoding,
3136                          const char *errors)
3137{
3138    PyObject *v;
3139
3140    if (!PyUnicode_Check(unicode)) {
3141        PyErr_BadArgument();
3142        goto onError;
3143    }
3144
3145    if (encoding == NULL)
3146        encoding = PyUnicode_GetDefaultEncoding();
3147
3148    /* Encode via the codec registry */
3149    v = PyCodec_Encode(unicode, encoding, errors);
3150    if (v == NULL)
3151        goto onError;
3152    return v;
3153
3154  onError:
3155    return NULL;
3156}
3157
/* Locate the first wide character of wstr that wcstombs() refuses to
   convert, by converting the string one character at a time (one
   surrogate pair at a time when wchar_t is 16 bits wide).

   Return the index of the offending character in wstr, or 0 when no
   conversion fails. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];            /* surrogate pair + terminating null */
#else
    wchar_t unit[2];            /* one character + terminating null */
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor = wstr;

    /* The last slot always terminates the probe string. */
#if SIZEOF_WCHAR_T == 2
    unit[2] = 0;
#else
    unit[1] = 0;
#endif

    while (*cursor != L'\0') {
        /* Remember where this probe starts before advancing. */
        const wchar_t *candidate = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep both halves of a surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = *cursor;
            unit[1] = 0;
            cursor++;
        }
#else
        unit[0] = *cursor;
        cursor++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return candidate - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3204
3205static int
3206locale_error_handler(const char *errors, int *surrogateescape)
3207{
3208    if (errors == NULL) {
3209        *surrogateescape = 0;
3210        return 0;
3211    }
3212
3213    if (strcmp(errors, "strict") == 0) {
3214        *surrogateescape = 0;
3215        return 0;
3216    }
3217    if (strcmp(errors, "surrogateescape") == 0) {
3218        *surrogateescape = 1;
3219        return 0;
3220    }
3221    PyErr_Format(PyExc_ValueError,
3222                 "only 'strict' and 'surrogateescape' error handlers "
3223                 "are supported, not '%s'",
3224                 errors);
3225    return -1;
3226}
3227
3228PyObject *
3229PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3230{
3231    Py_ssize_t wlen, wlen2;
3232    wchar_t *wstr;
3233    PyObject *bytes = NULL;
3234    char *errmsg;
3235    PyObject *reason = NULL;
3236    PyObject *exc;
3237    size_t error_pos;
3238    int surrogateescape;
3239
3240    if (locale_error_handler(errors, &surrogateescape) < 0)
3241        return NULL;
3242
3243    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3244    if (wstr == NULL)
3245        return NULL;
3246
3247    wlen2 = wcslen(wstr);
3248    if (wlen2 != wlen) {
3249        PyMem_Free(wstr);
3250        PyErr_SetString(PyExc_ValueError, "embedded null character");
3251        return NULL;
3252    }
3253
3254    if (surrogateescape) {
3255        /* "surrogateescape" error handler */
3256        char *str;
3257
3258        str = Py_EncodeLocale(wstr, &error_pos);
3259        if (str == NULL) {
3260            if (error_pos == (size_t)-1) {
3261                PyErr_NoMemory();
3262                PyMem_Free(wstr);
3263                return NULL;
3264            }
3265            else {
3266                goto encode_error;
3267            }
3268        }
3269        PyMem_Free(wstr);
3270
3271        bytes = PyBytes_FromString(str);
3272        PyMem_Free(str);
3273    }
3274    else {
3275        /* strict mode */
3276        size_t len, len2;
3277
3278        len = wcstombs(NULL, wstr, 0);
3279        if (len == (size_t)-1) {
3280            error_pos = (size_t)-1;
3281            goto encode_error;
3282        }
3283
3284        bytes = PyBytes_FromStringAndSize(NULL, len);
3285        if (bytes == NULL) {
3286            PyMem_Free(wstr);
3287            return NULL;
3288        }
3289
3290        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3291        if (len2 == (size_t)-1 || len2 > len) {
3292            error_pos = (size_t)-1;
3293            goto encode_error;
3294        }
3295        PyMem_Free(wstr);
3296    }
3297    return bytes;
3298
3299encode_error:
3300    errmsg = strerror(errno);
3301    assert(errmsg != NULL);
3302
3303    if (error_pos == (size_t)-1)
3304        error_pos = wcstombs_errorpos(wstr);
3305
3306    PyMem_Free(wstr);
3307    Py_XDECREF(bytes);
3308
3309    if (errmsg != NULL) {
3310        size_t errlen;
3311        wstr = Py_DecodeLocale(errmsg, &errlen);
3312        if (wstr != NULL) {
3313            reason = PyUnicode_FromWideChar(wstr, errlen);
3314            PyMem_RawFree(wstr);
3315        } else
3316            errmsg = NULL;
3317    }
3318    if (errmsg == NULL)
3319        reason = PyUnicode_FromString(
3320            "wcstombs() encountered an unencodable "
3321            "wide character");
3322    if (reason == NULL)
3323        return NULL;
3324
3325    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3326                                "locale", unicode,
3327                                (Py_ssize_t)error_pos,
3328                                (Py_ssize_t)(error_pos+1),
3329                                reason);
3330    Py_DECREF(reason);
3331    if (exc != NULL) {
3332        PyCodec_StrictErrors(exc);
3333        Py_XDECREF(exc);
3334    }
3335    return NULL;
3336}
3337
3338PyObject *
3339PyUnicode_EncodeFSDefault(PyObject *unicode)
3340{
3341#ifdef HAVE_MBCS
3342    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3343#elif defined(__APPLE__)
3344    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3345#else
3346    PyInterpreterState *interp = PyThreadState_GET()->interp;
3347    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3348       cannot use it to encode and decode filenames before it is loaded. Load
3349       the Python codec requires to encode at least its own filename. Use the C
3350       version of the locale codec until the codec registry is initialized and
3351       the Python codec is loaded.
3352
3353       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3354       cannot only rely on it: check also interp->fscodec_initialized for
3355       subinterpreters. */
3356    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3357        return PyUnicode_AsEncodedString(unicode,
3358                                         Py_FileSystemDefaultEncoding,
3359                                         "surrogateescape");
3360    }
3361    else {
3362        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3363    }
3364#endif
3365}
3366
3367PyObject *
3368PyUnicode_AsEncodedString(PyObject *unicode,
3369                          const char *encoding,
3370                          const char *errors)
3371{
3372    PyObject *v;
3373    char lower[11];  /* Enough for any encoding shortcut */
3374
3375    if (!PyUnicode_Check(unicode)) {
3376        PyErr_BadArgument();
3377        return NULL;
3378    }
3379
3380    /* Shortcuts for common default encodings */
3381    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3382        if ((strcmp(lower, "utf-8") == 0) ||
3383            (strcmp(lower, "utf8") == 0))
3384        {
3385            if (errors == NULL || strcmp(errors, "strict") == 0)
3386                return _PyUnicode_AsUTF8String(unicode, NULL);
3387            else
3388                return _PyUnicode_AsUTF8String(unicode, errors);
3389        }
3390        else if ((strcmp(lower, "latin-1") == 0) ||
3391                 (strcmp(lower, "latin1") == 0) ||
3392                 (strcmp(lower, "iso-8859-1") == 0) ||
3393                 (strcmp(lower, "iso8859-1") == 0))
3394            return _PyUnicode_AsLatin1String(unicode, errors);
3395#ifdef HAVE_MBCS
3396        else if (strcmp(lower, "mbcs") == 0)
3397            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3398#endif
3399        else if (strcmp(lower, "ascii") == 0)
3400            return _PyUnicode_AsASCIIString(unicode, errors);
3401    }
3402
3403    /* Encode via the codec registry */
3404    v = _PyCodec_EncodeText(unicode, encoding, errors);
3405    if (v == NULL)
3406        return NULL;
3407
3408    /* The normal path */
3409    if (PyBytes_Check(v))
3410        return v;
3411
3412    /* If the codec returns a buffer, raise a warning and convert to bytes */
3413    if (PyByteArray_Check(v)) {
3414        int error;
3415        PyObject *b;
3416
3417        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3418            "encoder %s returned bytearray instead of bytes; "
3419            "use codecs.encode() to encode to arbitrary types",
3420            encoding);
3421        if (error) {
3422            Py_DECREF(v);
3423            return NULL;
3424        }
3425
3426        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3427        Py_DECREF(v);
3428        return b;
3429    }
3430
3431    PyErr_Format(PyExc_TypeError,
3432                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3433                 "use codecs.encode() to encode to arbitrary types",
3434                 encoding,
3435                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3436    Py_DECREF(v);
3437    return NULL;
3438}
3439
3440PyObject *
3441PyUnicode_AsEncodedUnicode(PyObject *unicode,
3442                           const char *encoding,
3443                           const char *errors)
3444{
3445    PyObject *v;
3446
3447    if (!PyUnicode_Check(unicode)) {
3448        PyErr_BadArgument();
3449        goto onError;
3450    }
3451
3452    if (encoding == NULL)
3453        encoding = PyUnicode_GetDefaultEncoding();
3454
3455    /* Encode via the codec registry */
3456    v = PyCodec_Encode(unicode, encoding, errors);
3457    if (v == NULL)
3458        goto onError;
3459    if (!PyUnicode_Check(v)) {
3460        PyErr_Format(PyExc_TypeError,
3461                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3462                     "use codecs.encode() to encode to arbitrary types",
3463                     encoding,
3464                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3465        Py_DECREF(v);
3466        goto onError;
3467    }
3468    return v;
3469
3470  onError:
3471    return NULL;
3472}
3473
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot convert (invalid or incomplete character).

   Return its byte offset from str.  Return 0 when nothing is found, and
   always when HAVE_MBRTOWC is not defined. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *p = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t nbytes = mbrtowc(&wc, p, remaining, &state);

        if (nbytes == 0)
            /* Reached end of string */
            break;
        if (nbytes == (size_t)-1 || nbytes == (size_t)-2)
            /* Conversion error or incomplete character */
            return p - str;

        p += nbytes;
        remaining -= nbytes;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3504
3505PyObject*
3506PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3507                              const char *errors)
3508{
3509    wchar_t smallbuf[256];
3510    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3511    wchar_t *wstr;
3512    size_t wlen, wlen2;
3513    PyObject *unicode;
3514    int surrogateescape;
3515    size_t error_pos;
3516    char *errmsg;
3517    PyObject *reason, *exc;
3518
3519    if (locale_error_handler(errors, &surrogateescape) < 0)
3520        return NULL;
3521
3522    if (str[len] != '\0' || (size_t)len != strlen(str))  {
3523        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3524        return NULL;
3525    }
3526
3527    if (surrogateescape) {
3528        /* "surrogateescape" error handler */
3529        wstr = Py_DecodeLocale(str, &wlen);
3530        if (wstr == NULL) {
3531            if (wlen == (size_t)-1)
3532                PyErr_NoMemory();
3533            else
3534                PyErr_SetFromErrno(PyExc_OSError);
3535            return NULL;
3536        }
3537
3538        unicode = PyUnicode_FromWideChar(wstr, wlen);
3539        PyMem_RawFree(wstr);
3540    }
3541    else {
3542        /* strict mode */
3543#ifndef HAVE_BROKEN_MBSTOWCS
3544        wlen = mbstowcs(NULL, str, 0);
3545#else
3546        wlen = len;
3547#endif
3548        if (wlen == (size_t)-1)
3549            goto decode_error;
3550        if (wlen+1 <= smallbuf_len) {
3551            wstr = smallbuf;
3552        }
3553        else {
3554            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3555                return PyErr_NoMemory();
3556
3557            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3558            if (!wstr)
3559                return PyErr_NoMemory();
3560        }
3561
3562        wlen2 = mbstowcs(wstr, str, wlen+1);
3563        if (wlen2 == (size_t)-1) {
3564            if (wstr != smallbuf)
3565                PyMem_Free(wstr);
3566            goto decode_error;
3567        }
3568#ifdef HAVE_BROKEN_MBSTOWCS
3569        assert(wlen2 == wlen);
3570#endif
3571        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3572        if (wstr != smallbuf)
3573            PyMem_Free(wstr);
3574    }
3575    return unicode;
3576
3577decode_error:
3578    errmsg = strerror(errno);
3579    assert(errmsg != NULL);
3580
3581    error_pos = mbstowcs_errorpos(str, len);
3582    if (errmsg != NULL) {
3583        size_t errlen;
3584        wstr = Py_DecodeLocale(errmsg, &errlen);
3585        if (wstr != NULL) {
3586            reason = PyUnicode_FromWideChar(wstr, errlen);
3587            PyMem_RawFree(wstr);
3588        } else
3589            errmsg = NULL;
3590    }
3591    if (errmsg == NULL)
3592        reason = PyUnicode_FromString(
3593            "mbstowcs() encountered an invalid multibyte sequence");
3594    if (reason == NULL)
3595        return NULL;
3596
3597    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3598                                "locale", str, len,
3599                                (Py_ssize_t)error_pos,
3600                                (Py_ssize_t)(error_pos+1),
3601                                reason);
3602    Py_DECREF(reason);
3603    if (exc != NULL) {
3604        PyCodec_StrictErrors(exc);
3605        Py_XDECREF(exc);
3606    }
3607    return NULL;
3608}
3609
3610PyObject*
3611PyUnicode_DecodeLocale(const char *str, const char *errors)
3612{
3613    Py_ssize_t size = (Py_ssize_t)strlen(str);
3614    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3615}
3616
3617
3618PyObject*
3619PyUnicode_DecodeFSDefault(const char *s) {
3620    Py_ssize_t size = (Py_ssize_t)strlen(s);
3621    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3622}
3623
3624PyObject*
3625PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3626{
3627#ifdef HAVE_MBCS
3628    return PyUnicode_DecodeMBCS(s, size, NULL);
3629#elif defined(__APPLE__)
3630    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3631#else
3632    PyInterpreterState *interp = PyThreadState_GET()->interp;
3633    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3634       cannot use it to encode and decode filenames before it is loaded. Load
3635       the Python codec requires to encode at least its own filename. Use the C
3636       version of the locale codec until the codec registry is initialized and
3637       the Python codec is loaded.
3638
3639       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3640       cannot only rely on it: check also interp->fscodec_initialized for
3641       subinterpreters. */
3642    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3643        return PyUnicode_Decode(s, size,
3644                                Py_FileSystemDefaultEncoding,
3645                                "surrogateescape");
3646    }
3647    else {
3648        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3649    }
3650#endif
3651}
3652
3653
3654int
3655_PyUnicode_HasNULChars(PyObject* str)
3656{
3657    Py_ssize_t pos;
3658
3659    if (PyUnicode_READY(str) == -1)
3660        return -1;
3661    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3662                   PyUnicode_GET_LENGTH(str), '\0', 1);
3663    if (pos == -1)
3664        return 0;
3665    else
3666        return 1;
3667}
3668
3669int
3670PyUnicode_FSConverter(PyObject* arg, void* addr)
3671{
3672    PyObject *output = NULL;
3673    Py_ssize_t size;
3674    void *data;
3675    if (arg == NULL) {
3676        Py_DECREF(*(PyObject**)addr);
3677        return 1;
3678    }
3679    if (PyBytes_Check(arg)) {
3680        output = arg;
3681        Py_INCREF(output);
3682    }
3683    else {
3684        arg = PyUnicode_FromObject(arg);
3685        if (!arg)
3686            return 0;
3687        output = PyUnicode_EncodeFSDefault(arg);
3688        Py_DECREF(arg);
3689        if (!output)
3690            return 0;
3691        if (!PyBytes_Check(output)) {
3692            Py_DECREF(output);
3693            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3694            return 0;
3695        }
3696    }
3697    size = PyBytes_GET_SIZE(output);
3698    data = PyBytes_AS_STRING(output);
3699    if ((size_t)size != strlen(data)) {
3700        PyErr_SetString(PyExc_ValueError, "embedded null byte");
3701        Py_DECREF(output);
3702        return 0;
3703    }
3704    *(PyObject**)addr = output;
3705    return Py_CLEANUP_SUPPORTED;
3706}
3707
3708
3709int
3710PyUnicode_FSDecoder(PyObject* arg, void* addr)
3711{
3712    PyObject *output = NULL;
3713    if (arg == NULL) {
3714        Py_DECREF(*(PyObject**)addr);
3715        return 1;
3716    }
3717    if (PyUnicode_Check(arg)) {
3718        if (PyUnicode_READY(arg) == -1)
3719            return 0;
3720        output = arg;
3721        Py_INCREF(output);
3722    }
3723    else {
3724        arg = PyBytes_FromObject(arg);
3725        if (!arg)
3726            return 0;
3727        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3728                                                  PyBytes_GET_SIZE(arg));
3729        Py_DECREF(arg);
3730        if (!output)
3731            return 0;
3732        if (!PyUnicode_Check(output)) {
3733            Py_DECREF(output);
3734            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3735            return 0;
3736        }
3737    }
3738    if (PyUnicode_READY(output) == -1) {
3739        Py_DECREF(output);
3740        return 0;
3741    }
3742    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3743                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3744        PyErr_SetString(PyExc_ValueError, "embedded null character");
3745        Py_DECREF(output);
3746        return 0;
3747    }
3748    *(PyObject**)addr = output;
3749    return Py_CLEANUP_SUPPORTED;
3750}
3751
3752
3753char*
3754PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3755{
3756    PyObject *bytes;
3757
3758    if (!PyUnicode_Check(unicode)) {
3759        PyErr_BadArgument();
3760        return NULL;
3761    }
3762    if (PyUnicode_READY(unicode) == -1)
3763        return NULL;
3764
3765    if (PyUnicode_UTF8(unicode) == NULL) {
3766        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3767        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3768        if (bytes == NULL)
3769            return NULL;
3770        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3771        if (_PyUnicode_UTF8(unicode) == NULL) {
3772            PyErr_NoMemory();
3773            Py_DECREF(bytes);
3774            return NULL;
3775        }
3776        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3777        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3778                  PyBytes_AS_STRING(bytes),
3779                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3780        Py_DECREF(bytes);
3781    }
3782
3783    if (psize)
3784        *psize = PyUnicode_UTF8_LENGTH(unicode);
3785    return PyUnicode_UTF8(unicode);
3786}
3787
3788char*
3789PyUnicode_AsUTF8(PyObject *unicode)
3790{
3791    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3792}
3793
3794Py_UNICODE *
3795PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3796{
3797    const unsigned char *one_byte;
3798#if SIZEOF_WCHAR_T == 4
3799    const Py_UCS2 *two_bytes;
3800#else
3801    const Py_UCS4 *four_bytes;
3802    const Py_UCS4 *ucs4_end;
3803    Py_ssize_t num_surrogates;
3804#endif
3805    wchar_t *w;
3806    wchar_t *wchar_end;
3807
3808    if (!PyUnicode_Check(unicode)) {
3809        PyErr_BadArgument();
3810        return NULL;
3811    }
3812    if (_PyUnicode_WSTR(unicode) == NULL) {
3813        /* Non-ASCII compact unicode object */
3814        assert(_PyUnicode_KIND(unicode) != 0);
3815        assert(PyUnicode_IS_READY(unicode));
3816
3817        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3818#if SIZEOF_WCHAR_T == 2
3819            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3820            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3821            num_surrogates = 0;
3822
3823            for (; four_bytes < ucs4_end; ++four_bytes) {
3824                if (*four_bytes > 0xFFFF)
3825                    ++num_surrogates;
3826            }
3827
3828            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3829                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3830            if (!_PyUnicode_WSTR(unicode)) {
3831                PyErr_NoMemory();
3832                return NULL;
3833            }
3834            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3835
3836            w = _PyUnicode_WSTR(unicode);
3837            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3838            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3839            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3840                if (*four_bytes > 0xFFFF) {
3841                    assert(*four_bytes <= MAX_UNICODE);
3842                    /* encode surrogate pair in this case */
3843                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3844                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3845                }
3846                else
3847                    *w = *four_bytes;
3848
3849                if (w > wchar_end) {
3850                    assert(0 && "Miscalculated string end");
3851                }
3852            }
3853            *w = 0;
3854#else
3855            /* sizeof(wchar_t) == 4 */
3856            Py_FatalError("Impossible unicode object state, wstr and str "
3857                          "should share memory already.");
3858            return NULL;
3859#endif
3860        }
3861        else {
3862            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3863                                                  (_PyUnicode_LENGTH(unicode) + 1));
3864            if (!_PyUnicode_WSTR(unicode)) {
3865                PyErr_NoMemory();
3866                return NULL;
3867            }
3868            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3869                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3870            w = _PyUnicode_WSTR(unicode);
3871            wchar_end = w + _PyUnicode_LENGTH(unicode);
3872
3873            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3874                one_byte = PyUnicode_1BYTE_DATA(unicode);
3875                for (; w < wchar_end; ++one_byte, ++w)
3876                    *w = *one_byte;
3877                /* null-terminate the wstr */
3878                *w = 0;
3879            }
3880            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3881#if SIZEOF_WCHAR_T == 4
3882                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3883                for (; w < wchar_end; ++two_bytes, ++w)
3884                    *w = *two_bytes;
3885                /* null-terminate the wstr */
3886                *w = 0;
3887#else
3888                /* sizeof(wchar_t) == 2 */
3889                PyObject_FREE(_PyUnicode_WSTR(unicode));
3890                _PyUnicode_WSTR(unicode) = NULL;
3891                Py_FatalError("Impossible unicode object state, wstr "
3892                              "and str should share memory already.");
3893                return NULL;
3894#endif
3895            }
3896            else {
3897                assert(0 && "This should never happen.");
3898            }
3899        }
3900    }
3901    if (size != NULL)
3902        *size = PyUnicode_WSTR_LENGTH(unicode);
3903    return _PyUnicode_WSTR(unicode);
3904}
3905
3906Py_UNICODE *
3907PyUnicode_AsUnicode(PyObject *unicode)
3908{
3909    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3910}
3911
3912
3913Py_ssize_t
3914PyUnicode_GetSize(PyObject *unicode)
3915{
3916    if (!PyUnicode_Check(unicode)) {
3917        PyErr_BadArgument();
3918        goto onError;
3919    }
3920    return PyUnicode_GET_SIZE(unicode);
3921
3922  onError:
3923    return -1;
3924}
3925
3926Py_ssize_t
3927PyUnicode_GetLength(PyObject *unicode)
3928{
3929    if (!PyUnicode_Check(unicode)) {
3930        PyErr_BadArgument();
3931        return -1;
3932    }
3933    if (PyUnicode_READY(unicode) == -1)
3934        return -1;
3935    return PyUnicode_GET_LENGTH(unicode);
3936}
3937
3938Py_UCS4
3939PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3940{
3941    void *data;
3942    int kind;
3943
3944    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3945        PyErr_BadArgument();
3946        return (Py_UCS4)-1;
3947    }
3948    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3949        PyErr_SetString(PyExc_IndexError, "string index out of range");
3950        return (Py_UCS4)-1;
3951    }
3952    data = PyUnicode_DATA(unicode);
3953    kind = PyUnicode_KIND(unicode);
3954    return PyUnicode_READ(kind, data, index);
3955}
3956
3957int
3958PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3959{
3960    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3961        PyErr_BadArgument();
3962        return -1;
3963    }
3964    assert(PyUnicode_IS_READY(unicode));
3965    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3966        PyErr_SetString(PyExc_IndexError, "string index out of range");
3967        return -1;
3968    }
3969    if (unicode_check_modifiable(unicode))
3970        return -1;
3971    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3972        PyErr_SetString(PyExc_ValueError, "character out of range");
3973        return -1;
3974    }
3975    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3976                    index, ch);
3977    return 0;
3978}
3979
/* Return the name of the default encoding as a NUL-terminated C string.
   The value is fixed: "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3985
3986/* create or adjust a UnicodeDecodeError */
3987static void
3988make_decode_exception(PyObject **exceptionObject,
3989                      const char *encoding,
3990                      const char *input, Py_ssize_t length,
3991                      Py_ssize_t startpos, Py_ssize_t endpos,
3992                      const char *reason)
3993{
3994    if (*exceptionObject == NULL) {
3995        *exceptionObject = PyUnicodeDecodeError_Create(
3996            encoding, input, length, startpos, endpos, reason);
3997    }
3998    else {
3999        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
4000            goto onError;
4001        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4002            goto onError;
4003        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4004            goto onError;
4005    }
4006    return;
4007
4008onError:
4009    Py_CLEAR(*exceptionObject);
4010}
4011
#ifdef HAVE_MBCS
/* Error handling callback helper (wchar_t output variant):
   build arguments, call the callback and check the arguments,
   if no exception occurred, copy the replacement to the output
   and adjust various state variables.
   return 0 on success, -1 on error

   errors / errorHandler: name of the error handler and a cache for the
       looked-up handler object (filled in on first use).
   encoding / reason: passed on to the UnicodeDecodeError.
   input / inend: in/out; start and end of the input buffer.  They are
       re-pointed at the bytes object stored in the exception, which the
       callback may have replaced.
   startinpos / endinpos: offsets of the offending bytes; *endinpos is
       set to the position returned by the handler.
   exceptionObject: in/out; created on first use, adjusted afterwards.
   inptr: out; set to *input + the new position.
   output / outpos: wchar-kind unicode object being built and the
       current write offset; the object is resized if the replacement
       plus the remaining input does not fit.
*/

static int
unicode_decode_call_errorhandler_wchar(
    const char *errors, PyObject **errorHandler,
    const char *encoding, const char *reason,
    const char **input, const char **inend, Py_ssize_t *startinpos,
    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
    PyObject **output, Py_ssize_t *outpos)
{
    /* "O!n" is the PyArg_ParseTuple format; the text after the ';'
       (offset 4) doubles as the TypeError message below. */
    static const char *argparse = "O!n;decoding error handler must return (str, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize;
    Py_ssize_t insize;
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    PyObject *inputobj = NULL;
    wchar_t *repwstr;
    Py_ssize_t repwlen;

    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
    outsize = _PyUnicode_WSTR_LENGTH(*output);

    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    make_decode_exception(exceptionObject,
        encoding,
        *input, *inend - *input,
        *startinpos, *endinpos,
        reason);
    if (*exceptionObject == NULL)
        goto onError;

    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    /* repunicode is parsed out of restuple with "O!", so it stays valid
       only as long as restuple does. */
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;

    /* Copy back the bytes variables, which might have been modified by the
       callback */
    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
    if (!inputobj)
        goto onError;
    if (!PyBytes_Check(inputobj)) {
        /* Bail out instead of reading a non-bytes object through the
           PyBytes_* macros with an exception pending. */
        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
        Py_DECREF(inputobj);
        goto onError;
    }
    *input = PyBytes_AS_STRING(inputobj);
    insize = PyBytes_GET_SIZE(inputobj);
    *inend = *input + insize;
    /* we can DECREF safely, as the exception has another reference,
       so the object won't go away. */
    Py_DECREF(inputobj);

    /* A negative position is relative to the end of the input. */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
    if (repwstr == NULL)
        goto onError;
    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
        goto overflow;
    requiredsize += repwlen;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow at least geometrically when doubling cannot overflow. */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (unicode_resize(output, requiredsize) < 0)
            goto onError;
    }
    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
    *outpos += repwlen;
    *endinpos = newpos;
    *inptr = *input + newpos;

    /* we made it! */
    Py_XDECREF(restuple);
    return 0;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");

  onError:
    Py_XDECREF(restuple);
    return -1;
}
#endif   /* HAVE_MBCS */
4127
4128static int
4129unicode_decode_call_errorhandler_writer(
4130    const char *errors, PyObject **errorHandler,
4131    const char *encoding, const char *reason,
4132    const char **input, const char **inend, Py_ssize_t *startinpos,
4133    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4134    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4135{
4136    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4137
4138    PyObject *restuple = NULL;
4139    PyObject *repunicode = NULL;
4140    Py_ssize_t insize;
4141    Py_ssize_t newpos;
4142    Py_ssize_t replen;
4143    PyObject *inputobj = NULL;
4144
4145    if (*errorHandler == NULL) {
4146        *errorHandler = PyCodec_LookupError(errors);
4147        if (*errorHandler == NULL)
4148            goto onError;
4149    }
4150
4151    make_decode_exception(exceptionObject,
4152        encoding,
4153        *input, *inend - *input,
4154        *startinpos, *endinpos,
4155        reason);
4156    if (*exceptionObject == NULL)
4157        goto onError;
4158
4159    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4160    if (restuple == NULL)
4161        goto onError;
4162    if (!PyTuple_Check(restuple)) {
4163        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4164        goto onError;
4165    }
4166    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4167        goto onError;
4168
4169    /* Copy back the bytes variables, which might have been modified by the
4170       callback */
4171    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4172    if (!inputobj)
4173        goto onError;
4174    if (!PyBytes_Check(inputobj)) {
4175        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4176    }
4177    *input = PyBytes_AS_STRING(inputobj);
4178    insize = PyBytes_GET_SIZE(inputobj);
4179    *inend = *input + insize;
4180    /* we can DECREF safely, as the exception has another reference,
4181       so the object won't go away. */
4182    Py_DECREF(inputobj);
4183
4184    if (newpos<0)
4185        newpos = insize+newpos;
4186    if (newpos<0 || newpos>insize) {
4187        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4188        goto onError;
4189    }
4190
4191    if (PyUnicode_READY(repunicode) < 0)
4192        goto onError;
4193    replen = PyUnicode_GET_LENGTH(repunicode);
4194    writer->min_length += replen;
4195    if (replen > 1)
4196        writer->overallocate = 1;
4197    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4198        goto onError;
4199
4200    *endinpos = newpos;
4201    *inptr = *input + newpos;
4202
4203    /* we made it! */
4204    Py_XDECREF(restuple);
4205    return 0;
4206
4207  onError:
4208    Py_XDECREF(restuple);
4209    return -1;
4210}
4211
4212/* --- UTF-7 Codec -------------------------------------------------------- */
4213
4214/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4215
/* Base-64 helper macros for the UTF-7 codec.  Each one evaluates its
   argument more than once, so pass expressions without side effects. */

/* True if c is one of the 64 characters of the base-64 alphabet
   (letters, digits, '+' and '/'). */

#define IS_BASE64(c)                    \
    ((c) == '+' || (c) == '/' ||        \
     ('0' <= (c) && (c) <= '9') ||      \
     ('a' <= (c) && (c) <= 'z') ||      \
     ('A' <= (c) && (c) <= 'Z'))

/* Map a base-64 character to its 6-bit value (A-Z: 0-25, a-z: 26-51,
   0-9: 52-61, '+': 62).  Every other argument, '/' included, gives 63,
   so the result is only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                                  \
    (((c) == '+') ? 62 :                                                \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     63)

/* Base-64 character for the low 6 bits of n; higher bits are ignored. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 63])

/* DECODE_DIRECT: true if this byte, met outside a base-64 section of a
 * UTF-7 string, decodes as itself.  Decoding is permissive: every
 * ASCII byte qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4246
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0-127) and is read-only.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};

/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        : character code; only 1..127 can ever be direct (NUL and
 *            anything >= 128 yield false without touching the table).
 * directO  : non-zero if Set O characters are written as themselves.
 * directWS : non-zero if whitespace is written as itself.
 *
 * The flag arguments are parenthesized so that compound expressions
 * (e.g. `a || b`) can be passed safely.  `c` is evaluated several
 * times and must be free of side effects. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
4292
4293PyObject *
4294PyUnicode_DecodeUTF7(const char *s,
4295                     Py_ssize_t size,
4296                     const char *errors)
4297{
4298    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4299}
4300
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many bytes of input we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  : the UTF-7 input bytes.
 * errors   : error handler name, passed on to
 *            unicode_decode_call_errorhandler_writer().
 * consumed : if non-NULL, receives the number of input bytes that were
 *            fully decoded.  When the input ends inside a shift
 *            sequence this is the offset of the '+' that opened it and
 *            the returned string stops before that sequence's output.
 *            If NULL, input that ends inside an inconsistent shift
 *            sequence is reported to the error handler instead.
 *
 * Returns the decoded string, or NULL on error.  Empty input yields
 * the empty string with *consumed set to 0. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                 /* non-zero while inside "+...": base-64 section */
    Py_ssize_t shiftOutStart;        /* writer.pos when the current section began */
    unsigned int base64bits = 0;     /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;  /* pending base-64 bits, newest in the low end */
    Py_UCS4 surrogate = 0;           /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
        /* Re-entry point for the end-of-string handling below, used when
           the error handler left unread input (s < e). */
      restart:
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                /* append the 6 new bits to the pending ones */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    /* take the 16 oldest pending bits */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            /* complete pair: emit the combined code point */
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* the pending high surrogate is unpaired: emit
                               it on its own, then handle outCh below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where this '+' is: it is the error start position
               and the back-off point reported through `consumed` */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* a byte >= 128 outside a base-64 section */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Report input bytes [startinpos, endinpos) to the error handler,
           which writes the replacement and repositions s. */
utf7Error:
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have moved s back into the input:
               resume decoding inside the loop */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* unfinished shift sequence: report everything up to its
               '+' as consumed and drop the output it produced */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* The discarded part contained non-ASCII output, so
                   build the result from the kept prefix only.
                   NOTE(review): presumably done so the result is not
                   finished with a maxchar that only the dropped
                   characters required -- confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4497
4498
4499PyObject *
4500_PyUnicode_EncodeUTF7(PyObject *str,
4501                      int base64SetO,
4502                      int base64WhiteSpace,
4503                      const char *errors)
4504{
4505    int kind;
4506    void *data;
4507    Py_ssize_t len;
4508    PyObject *v;
4509    int inShift = 0;
4510    Py_ssize_t i;
4511    unsigned int base64bits = 0;
4512    unsigned long base64buffer = 0;
4513    char * out;
4514    char * start;
4515
4516    if (PyUnicode_READY(str) == -1)
4517        return NULL;
4518    kind = PyUnicode_KIND(str);
4519    data = PyUnicode_DATA(str);
4520    len = PyUnicode_GET_LENGTH(str);
4521
4522    if (len == 0)
4523        return PyBytes_FromStringAndSize(NULL, 0);
4524
4525    /* It might be possible to tighten this worst case */
4526    if (len > PY_SSIZE_T_MAX / 8)
4527        return PyErr_NoMemory();
4528    v = PyBytes_FromStringAndSize(NULL, len * 8);
4529    if (v == NULL)
4530        return NULL;
4531
4532    start = out = PyBytes_AS_STRING(v);
4533    for (i = 0; i < len; ++i) {
4534        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4535
4536        if (inShift) {
4537            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4538                /* shifting out */
4539                if (base64bits) { /* output remaining bits */
4540                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4541                    base64buffer = 0;
4542                    base64bits = 0;
4543                }
4544                inShift = 0;
4545                /* Characters not in the BASE64 set implicitly unshift the sequence
4546                   so no '-' is required, except if the character is itself a '-' */
4547                if (IS_BASE64(ch) || ch == '-') {
4548                    *out++ = '-';
4549                }
4550                *out++ = (char) ch;
4551            }
4552            else {
4553                goto encode_char;
4554            }
4555        }
4556        else { /* not in a shift sequence */
4557            if (ch == '+') {
4558                *out++ = '+';
4559                        *out++ = '-';
4560            }
4561            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4562                *out++ = (char) ch;
4563            }
4564            else {
4565                *out++ = '+';
4566                inShift = 1;
4567                goto encode_char;
4568            }
4569        }
4570        continue;
4571encode_char:
4572        if (ch >= 0x10000) {
4573            assert(ch <= MAX_UNICODE);
4574
4575            /* code first surrogate */
4576            base64bits += 16;
4577            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4578            while (base64bits >= 6) {
4579                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4580                base64bits -= 6;
4581            }
4582            /* prepare second surrogate */
4583            ch = Py_UNICODE_LOW_SURROGATE(ch);
4584        }
4585        base64bits += 16;
4586        base64buffer = (base64buffer << 16) | ch;
4587        while (base64bits >= 6) {
4588            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4589            base64bits -= 6;
4590        }
4591    }
4592    if (base64bits)
4593        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4594    if (inShift)
4595        *out++ = '-';
4596    if (_PyBytes_Resize(&v, out - start) < 0)
4597        return NULL;
4598    return v;
4599}
4600PyObject *
4601PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4602                     Py_ssize_t size,
4603                     int base64SetO,
4604                     int base64WhiteSpace,
4605                     const char *errors)
4606{
4607    PyObject *result;
4608    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4609    if (tmp == NULL)
4610        return NULL;
4611    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4612                                   base64WhiteSpace, errors);
4613    Py_DECREF(tmp);
4614    return result;
4615}
4616
/* The helper macros above are only meant for the UTF-7 codec; remove
   them so the names are not visible in the rest of the file. */
#undef IS_BASE64
#undef FROM_BASE64
#undef TO_BASE64
#undef DECODE_DIRECT
#undef ENCODE_DIRECT
4622
4623/* --- UTF-8 Codec -------------------------------------------------------- */
4624
4625PyObject *
4626PyUnicode_DecodeUTF8(const char *s,
4627                     Py_ssize_t size,
4628                     const char *errors)
4629{
4630    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4631}
4632
4633#include "stringlib/asciilib.h"
4634#include "stringlib/codecs.h"
4635#include "stringlib/undef.h"
4636
4637#include "stringlib/ucs1lib.h"
4638#include "stringlib/codecs.h"
4639#include "stringlib/undef.h"
4640
4641#include "stringlib/ucs2lib.h"
4642#include "stringlib/codecs.h"
4643#include "stringlib/undef.h"
4644
4645#include "stringlib/ucs4lib.h"
4646#include "stringlib/codecs.h"
4647#include "stringlib/undef.h"
4648
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of
   every byte set, so `value & ASCII_CHAR_MASK` is non-zero exactly when
   at least one byte of `value` is >= 0x80.  Only 4- and 8-byte longs
   are supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4658
4659static Py_ssize_t
4660ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4661{
4662    const char *p = start;
4663    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4664
4665    /*
4666     * Issue #17237: m68k is a bit different from most architectures in
4667     * that objects do not use "natural alignment" - for example, int and
4668     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4669     * won't work; also, tests have shown that skipping the "optimised
4670     * version" will even speed up m68k.
4671     */
4672#if !defined(__m68k__)
4673#if SIZEOF_LONG <= SIZEOF_VOID_P
4674    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4675    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4676        /* Fast path, see in STRINGLIB(utf8_decode) for
4677           an explanation. */
4678        /* Help allocation */
4679        const char *_p = p;
4680        Py_UCS1 * q = dest;
4681        while (_p < aligned_end) {
4682            unsigned long value = *(const unsigned long *) _p;
4683            if (value & ASCII_CHAR_MASK)
4684                break;
4685            *((unsigned long *)q) = value;
4686            _p += SIZEOF_LONG;
4687            q += SIZEOF_LONG;
4688        }
4689        p = _p;
4690        while (p < end) {
4691            if ((unsigned char)*p & 0x80)
4692                break;
4693            *q++ = *p++;
4694        }
4695        return p - start;
4696    }
4697#endif
4698#endif
4699    while (p < end) {
4700        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4701           for an explanation. */
4702        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4703            /* Help allocation */
4704            const char *_p = p;
4705            while (_p < aligned_end) {
4706                unsigned long value = *(unsigned long *) _p;
4707                if (value & ASCII_CHAR_MASK)
4708                    break;
4709                _p += SIZEOF_LONG;
4710            }
4711            p = _p;
4712            if (_p == end)
4713                break;
4714        }
4715        if ((unsigned char)*p & 0x80)
4716            break;
4717        ++p;
4718    }
4719    memcpy(dest, start, p - start);
4720    return p - start;
4721}
4722
4723PyObject *
4724PyUnicode_DecodeUTF8Stateful(const char *s,
4725                             Py_ssize_t size,
4726                             const char *errors,
4727                             Py_ssize_t *consumed)
4728{
4729    _PyUnicodeWriter writer;
4730    const char *starts = s;
4731    const char *end = s + size;
4732
4733    Py_ssize_t startinpos;
4734    Py_ssize_t endinpos;
4735    const char *errmsg = "";
4736    PyObject *errorHandler = NULL;
4737    PyObject *exc = NULL;
4738
4739    if (size == 0) {
4740        if (consumed)
4741            *consumed = 0;
4742        _Py_RETURN_UNICODE_EMPTY();
4743    }
4744
4745    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4746    if (size == 1 && (unsigned char)s[0] < 128) {
4747        if (consumed)
4748            *consumed = 1;
4749        return get_latin1_char((unsigned char)s[0]);
4750    }
4751
4752    _PyUnicodeWriter_Init(&writer);
4753    writer.min_length = size;
4754    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4755        goto onError;
4756
4757    writer.pos = ascii_decode(s, end, writer.data);
4758    s += writer.pos;
4759    while (s < end) {
4760        Py_UCS4 ch;
4761        int kind = writer.kind;
4762        if (kind == PyUnicode_1BYTE_KIND) {
4763            if (PyUnicode_IS_ASCII(writer.buffer))
4764                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4765            else
4766                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4767        } else if (kind == PyUnicode_2BYTE_KIND) {
4768            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4769        } else {
4770            assert(kind == PyUnicode_4BYTE_KIND);
4771            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4772        }
4773
4774        switch (ch) {
4775        case 0:
4776            if (s == end || consumed)
4777                goto End;
4778            errmsg = "unexpected end of data";
4779            startinpos = s - starts;
4780            endinpos = end - starts;
4781            break;
4782        case 1:
4783            errmsg = "invalid start byte";
4784            startinpos = s - starts;
4785            endinpos = startinpos + 1;
4786            break;
4787        case 2:
4788        case 3:
4789        case 4:
4790            errmsg = "invalid continuation byte";
4791            startinpos = s - starts;
4792            endinpos = startinpos + ch - 1;
4793            break;
4794        default:
4795            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4796                goto onError;
4797            continue;
4798        }
4799
4800        if (unicode_decode_call_errorhandler_writer(
4801                errors, &errorHandler,
4802                "utf-8", errmsg,
4803                &starts, &end, &startinpos, &endinpos, &exc, &s,
4804                &writer))
4805            goto onError;
4806    }
4807
4808End:
4809    if (consumed)
4810        *consumed = s - starts;
4811
4812    Py_XDECREF(errorHandler);
4813    Py_XDECREF(exc);
4814    return _PyUnicodeWriter_Finish(&writer);
4815
4816onError:
4817    Py_XDECREF(errorHandler);
4818    Py_XDECREF(exc);
4819    _PyUnicodeWriter_Dealloc(&writer);
4820    return NULL;
4821}
4822
4823#ifdef __APPLE__
4824
4825/* Simplified UTF-8 decoder using surrogateescape error handler,
4826   used to decode the command line arguments on Mac OS X.
4827
4828   Return a pointer to a newly allocated wide character string (use
4829   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4830
4831wchar_t*
4832_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4833{
4834    const char *e;
4835    wchar_t *unicode;
4836    Py_ssize_t outpos;
4837
4838    /* Note: size will always be longer than the resulting Unicode
4839       character count */
4840    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4841        return NULL;
4842    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4843    if (!unicode)
4844        return NULL;
4845
4846    /* Unpack UTF-8 encoded data */
4847    e = s + size;
4848    outpos = 0;
4849    while (s < e) {
4850        Py_UCS4 ch;
4851#if SIZEOF_WCHAR_T == 4
4852        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4853#else
4854        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4855#endif
4856        if (ch > 0xFF) {
4857#if SIZEOF_WCHAR_T == 4
4858            assert(0);
4859#else
4860            assert(Py_UNICODE_IS_SURROGATE(ch));
4861            /*  compute and append the two surrogates: */
4862            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4863            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4864#endif
4865        }
4866        else {
4867            if (!ch && s == e)
4868                break;
4869            /* surrogateescape */
4870            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4871        }
4872    }
4873    unicode[outpos] = L'\0';
4874    return unicode;
4875}
4876
4877#endif /* __APPLE__ */
4878
4879/* Primary internal function which creates utf8 encoded bytes objects.
4880
4881   Allocation strategy:  if the string is short, convert into a stack buffer
4882   and allocate exactly as much space needed at the end.  Else allocate the
4883   maximum possible needed (4 result bytes per Unicode character), and return
4884   the excess memory at the end.
4885*/
4886PyObject *
4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4888{
4889    enum PyUnicode_Kind kind;
4890    void *data;
4891    Py_ssize_t size;
4892
4893    if (!PyUnicode_Check(unicode)) {
4894        PyErr_BadArgument();
4895        return NULL;
4896    }
4897
4898    if (PyUnicode_READY(unicode) == -1)
4899        return NULL;
4900
4901    if (PyUnicode_UTF8(unicode))
4902        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4903                                         PyUnicode_UTF8_LENGTH(unicode));
4904
4905    kind = PyUnicode_KIND(unicode);
4906    data = PyUnicode_DATA(unicode);
4907    size = PyUnicode_GET_LENGTH(unicode);
4908
4909    switch (kind) {
4910    default:
4911        assert(0);
4912    case PyUnicode_1BYTE_KIND:
4913        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4914        assert(!PyUnicode_IS_ASCII(unicode));
4915        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4916    case PyUnicode_2BYTE_KIND:
4917        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4918    case PyUnicode_4BYTE_KIND:
4919        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4920    }
4921}
4922
4923PyObject *
4924PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4925                     Py_ssize_t size,
4926                     const char *errors)
4927{
4928    PyObject *v, *unicode;
4929
4930    unicode = PyUnicode_FromUnicode(s, size);
4931    if (unicode == NULL)
4932        return NULL;
4933    v = _PyUnicode_AsUTF8String(unicode, errors);
4934    Py_DECREF(unicode);
4935    return v;
4936}
4937
4938PyObject *
4939PyUnicode_AsUTF8String(PyObject *unicode)
4940{
4941    return _PyUnicode_AsUTF8String(unicode, NULL);
4942}
4943
4944/* --- UTF-32 Codec ------------------------------------------------------- */
4945
4946PyObject *
4947PyUnicode_DecodeUTF32(const char *s,
4948                      Py_ssize_t size,
4949                      const char *errors,
4950                      int *byteorder)
4951{
4952    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4953}
4954
4955PyObject *
4956PyUnicode_DecodeUTF32Stateful(const char *s,
4957                              Py_ssize_t size,
4958                              const char *errors,
4959                              int *byteorder,
4960                              Py_ssize_t *consumed)
4961{
4962    const char *starts = s;
4963    Py_ssize_t startinpos;
4964    Py_ssize_t endinpos;
4965    _PyUnicodeWriter writer;
4966    const unsigned char *q, *e;
4967    int le, bo = 0;       /* assume native ordering by default */
4968    const char *encoding;
4969    const char *errmsg = "";
4970    PyObject *errorHandler = NULL;
4971    PyObject *exc = NULL;
4972
4973    q = (unsigned char *)s;
4974    e = q + size;
4975
4976    if (byteorder)
4977        bo = *byteorder;
4978
4979    /* Check for BOM marks (U+FEFF) in the input and adjust current
4980       byte order setting accordingly. In native mode, the leading BOM
4981       mark is skipped, in all other modes, it is copied to the output
4982       stream as-is (giving a ZWNBSP character). */
4983    if (bo == 0 && size >= 4) {
4984        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4985        if (bom == 0x0000FEFF) {
4986            bo = -1;
4987            q += 4;
4988        }
4989        else if (bom == 0xFFFE0000) {
4990            bo = 1;
4991            q += 4;
4992        }
4993        if (byteorder)
4994            *byteorder = bo;
4995    }
4996
4997    if (q == e) {
4998        if (consumed)
4999            *consumed = size;
5000        _Py_RETURN_UNICODE_EMPTY();
5001    }
5002
5003#ifdef WORDS_BIGENDIAN
5004    le = bo < 0;
5005#else
5006    le = bo <= 0;
5007#endif
5008    encoding = le ? "utf-32-le" : "utf-32-be";
5009
5010    _PyUnicodeWriter_Init(&writer);
5011    writer.min_length = (e - q + 3) / 4;
5012    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5013        goto onError;
5014
5015    while (1) {
5016        Py_UCS4 ch = 0;
5017        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5018
5019        if (e - q >= 4) {
5020            enum PyUnicode_Kind kind = writer.kind;
5021            void *data = writer.data;
5022            const unsigned char *last = e - 4;
5023            Py_ssize_t pos = writer.pos;
5024            if (le) {
5025                do {
5026                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5027                    if (ch > maxch)
5028                        break;
5029                    if (kind != PyUnicode_1BYTE_KIND &&
5030                        Py_UNICODE_IS_SURROGATE(ch))
5031                        break;
5032                    PyUnicode_WRITE(kind, data, pos++, ch);
5033                    q += 4;
5034                } while (q <= last);
5035            }
5036            else {
5037                do {
5038                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5039                    if (ch > maxch)
5040                        break;
5041                    if (kind != PyUnicode_1BYTE_KIND &&
5042                        Py_UNICODE_IS_SURROGATE(ch))
5043                        break;
5044                    PyUnicode_WRITE(kind, data, pos++, ch);
5045                    q += 4;
5046                } while (q <= last);
5047            }
5048            writer.pos = pos;
5049        }
5050
5051        if (Py_UNICODE_IS_SURROGATE(ch)) {
5052            errmsg = "codepoint in surrogate code point range(0xd800, 0xe000)";
5053            startinpos = ((const char *)q) - starts;
5054            endinpos = startinpos + 4;
5055        }
5056        else if (ch <= maxch) {
5057            if (q == e || consumed)
5058                break;
5059            /* remaining bytes at the end? (size should be divisible by 4) */
5060            errmsg = "truncated data";
5061            startinpos = ((const char *)q) - starts;
5062            endinpos = ((const char *)e) - starts;
5063        }
5064        else {
5065            if (ch < 0x110000) {
5066                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5067                    goto onError;
5068                q += 4;
5069                continue;
5070            }
5071            errmsg = "codepoint not in range(0x110000)";
5072            startinpos = ((const char *)q) - starts;
5073            endinpos = startinpos + 4;
5074        }
5075
5076        /* The remaining input chars are ignored if the callback
5077           chooses to skip the input */
5078        if (unicode_decode_call_errorhandler_writer(
5079                errors, &errorHandler,
5080                encoding, errmsg,
5081                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5082                &writer))
5083            goto onError;
5084    }
5085
5086    if (consumed)
5087        *consumed = (const char *)q-starts;
5088
5089    Py_XDECREF(errorHandler);
5090    Py_XDECREF(exc);
5091    return _PyUnicodeWriter_Finish(&writer);
5092
5093  onError:
5094    _PyUnicodeWriter_Dealloc(&writer);
5095    Py_XDECREF(errorHandler);
5096    Py_XDECREF(exc);
5097    return NULL;
5098}
5099
5100PyObject *
5101_PyUnicode_EncodeUTF32(PyObject *str,
5102                       const char *errors,
5103                       int byteorder)
5104{
5105    int kind;
5106    void *data;
5107    Py_ssize_t len;
5108    PyObject *v;
5109    unsigned char *p;
5110    Py_ssize_t nsize, i;
5111    /* Offsets from p for storing byte pairs in the right order. */
5112#if PY_LITTLE_ENDIAN
5113    int iorder[] = {0, 1, 2, 3};
5114#else
5115    int iorder[] = {3, 2, 1, 0};
5116#endif
5117    const char *encoding;
5118    PyObject *errorHandler = NULL;
5119    PyObject *exc = NULL;
5120    PyObject *rep = NULL;
5121
5122#define STORECHAR(CH)                           \
5123    do {                                        \
5124        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5125        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5126        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5127        p[iorder[0]] = (CH) & 0xff;             \
5128        p += 4;                                 \
5129    } while(0)
5130
5131    if (!PyUnicode_Check(str)) {
5132        PyErr_BadArgument();
5133        return NULL;
5134    }
5135    if (PyUnicode_READY(str) == -1)
5136        return NULL;
5137    kind = PyUnicode_KIND(str);
5138    data = PyUnicode_DATA(str);
5139    len = PyUnicode_GET_LENGTH(str);
5140
5141    nsize = len + (byteorder == 0);
5142    if (nsize > PY_SSIZE_T_MAX / 4)
5143        return PyErr_NoMemory();
5144    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5145    if (v == NULL)
5146        return NULL;
5147
5148    p = (unsigned char *)PyBytes_AS_STRING(v);
5149    if (byteorder == 0)
5150        STORECHAR(0xFEFF);
5151    if (len == 0)
5152        return v;
5153
5154    if (byteorder == -1) {
5155        /* force LE */
5156        iorder[0] = 0;
5157        iorder[1] = 1;
5158        iorder[2] = 2;
5159        iorder[3] = 3;
5160        encoding = "utf-32-le";
5161    }
5162    else if (byteorder == 1) {
5163        /* force BE */
5164        iorder[0] = 3;
5165        iorder[1] = 2;
5166        iorder[2] = 1;
5167        iorder[3] = 0;
5168        encoding = "utf-32-be";
5169    }
5170    else
5171        encoding = "utf-32";
5172
5173    if (kind == PyUnicode_1BYTE_KIND) {
5174        for (i = 0; i < len; i++)
5175            STORECHAR(PyUnicode_READ(kind, data, i));
5176        return v;
5177    }
5178
5179    for (i = 0; i < len;) {
5180        Py_ssize_t repsize, moreunits;
5181        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5182        i++;
5183        assert(ch <= MAX_UNICODE);
5184        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5185            STORECHAR(ch);
5186            continue;
5187        }
5188
5189        rep = unicode_encode_call_errorhandler(
5190                errors, &errorHandler,
5191                encoding, "surrogates not allowed",
5192                str, &exc, i-1, i, &i);
5193
5194        if (!rep)
5195            goto error;
5196
5197        if (PyBytes_Check(rep)) {
5198            repsize = PyBytes_GET_SIZE(rep);
5199            if (repsize & 3) {
5200                raise_encode_exception(&exc, encoding,
5201                                       str, i - 1, i,
5202                                       "surrogates not allowed");
5203                goto error;
5204            }
5205            moreunits = repsize / 4;
5206        }
5207        else {
5208            assert(PyUnicode_Check(rep));
5209            if (PyUnicode_READY(rep) < 0)
5210                goto error;
5211            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5212            if (!PyUnicode_IS_ASCII(rep)) {
5213                raise_encode_exception(&exc, encoding,
5214                                       str, i - 1, i,
5215                                       "surrogates not allowed");
5216                goto error;
5217            }
5218        }
5219
5220        /* four bytes are reserved for each surrogate */
5221        if (moreunits > 1) {
5222            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5223            Py_ssize_t morebytes = 4 * (moreunits - 1);
5224            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5225                /* integer overflow */
5226                PyErr_NoMemory();
5227                goto error;
5228            }
5229            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5230                goto error;
5231            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5232        }
5233
5234        if (PyBytes_Check(rep)) {
5235            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5236            p += repsize;
5237        } else /* rep is unicode */ {
5238            const Py_UCS1 *repdata;
5239            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5240            repdata = PyUnicode_1BYTE_DATA(rep);
5241            while (repsize--) {
5242                Py_UCS4 ch = *repdata++;
5243                STORECHAR(ch);
5244            }
5245        }
5246
5247        Py_CLEAR(rep);
5248    }
5249
5250    /* Cut back to size actually needed. This is necessary for, for example,
5251       encoding of a string containing isolated surrogates and the 'ignore'
5252       handler is used. */
5253    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5254    if (nsize != PyBytes_GET_SIZE(v))
5255      _PyBytes_Resize(&v, nsize);
5256    Py_XDECREF(errorHandler);
5257    Py_XDECREF(exc);
5258    return v;
5259  error:
5260    Py_XDECREF(rep);
5261    Py_XDECREF(errorHandler);
5262    Py_XDECREF(exc);
5263    Py_XDECREF(v);
5264    return NULL;
5265#undef STORECHAR
5266}
5267
5268PyObject *
5269PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5270                      Py_ssize_t size,
5271                      const char *errors,
5272                      int byteorder)
5273{
5274    PyObject *result;
5275    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5276    if (tmp == NULL)
5277        return NULL;
5278    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5279    Py_DECREF(tmp);
5280    return result;
5281}
5282
5283PyObject *
5284PyUnicode_AsUTF32String(PyObject *unicode)
5285{
5286    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5287}
5288
5289/* --- UTF-16 Codec ------------------------------------------------------- */
5290
5291PyObject *
5292PyUnicode_DecodeUTF16(const char *s,
5293                      Py_ssize_t size,
5294                      const char *errors,
5295                      int *byteorder)
5296{
5297    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5298}
5299
5300PyObject *
5301PyUnicode_DecodeUTF16Stateful(const char *s,
5302                              Py_ssize_t size,
5303                              const char *errors,
5304                              int *byteorder,
5305                              Py_ssize_t *consumed)
5306{
5307    const char *starts = s;
5308    Py_ssize_t startinpos;
5309    Py_ssize_t endinpos;
5310    _PyUnicodeWriter writer;
5311    const unsigned char *q, *e;
5312    int bo = 0;       /* assume native ordering by default */
5313    int native_ordering;
5314    const char *errmsg = "";
5315    PyObject *errorHandler = NULL;
5316    PyObject *exc = NULL;
5317    const char *encoding;
5318
5319    q = (unsigned char *)s;
5320    e = q + size;
5321
5322    if (byteorder)
5323        bo = *byteorder;
5324
5325    /* Check for BOM marks (U+FEFF) in the input and adjust current
5326       byte order setting accordingly. In native mode, the leading BOM
5327       mark is skipped, in all other modes, it is copied to the output
5328       stream as-is (giving a ZWNBSP character). */
5329    if (bo == 0 && size >= 2) {
5330        const Py_UCS4 bom = (q[1] << 8) | q[0];
5331        if (bom == 0xFEFF) {
5332            q += 2;
5333            bo = -1;
5334        }
5335        else if (bom == 0xFFFE) {
5336            q += 2;
5337            bo = 1;
5338        }
5339        if (byteorder)
5340            *byteorder = bo;
5341    }
5342
5343    if (q == e) {
5344        if (consumed)
5345            *consumed = size;
5346        _Py_RETURN_UNICODE_EMPTY();
5347    }
5348
5349#if PY_LITTLE_ENDIAN
5350    native_ordering = bo <= 0;
5351    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5352#else
5353    native_ordering = bo >= 0;
5354    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5355#endif
5356
5357    /* Note: size will always be longer than the resulting Unicode
5358       character count */
5359    _PyUnicodeWriter_Init(&writer);
5360    writer.min_length = (e - q + 1) / 2;
5361    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5362        goto onError;
5363
5364    while (1) {
5365        Py_UCS4 ch = 0;
5366        if (e - q >= 2) {
5367            int kind = writer.kind;
5368            if (kind == PyUnicode_1BYTE_KIND) {
5369                if (PyUnicode_IS_ASCII(writer.buffer))
5370                    ch = asciilib_utf16_decode(&q, e,
5371                            (Py_UCS1*)writer.data, &writer.pos,
5372                            native_ordering);
5373                else
5374                    ch = ucs1lib_utf16_decode(&q, e,
5375                            (Py_UCS1*)writer.data, &writer.pos,
5376                            native_ordering);
5377            } else if (kind == PyUnicode_2BYTE_KIND) {
5378                ch = ucs2lib_utf16_decode(&q, e,
5379                        (Py_UCS2*)writer.data, &writer.pos,
5380                        native_ordering);
5381            } else {
5382                assert(kind == PyUnicode_4BYTE_KIND);
5383                ch = ucs4lib_utf16_decode(&q, e,
5384                        (Py_UCS4*)writer.data, &writer.pos,
5385                        native_ordering);
5386            }
5387        }
5388
5389        switch (ch)
5390        {
5391        case 0:
5392            /* remaining byte at the end? (size should be even) */
5393            if (q == e || consumed)
5394                goto End;
5395            errmsg = "truncated data";
5396            startinpos = ((const char *)q) - starts;
5397            endinpos = ((const char *)e) - starts;
5398            break;
5399            /* The remaining input chars are ignored if the callback
5400               chooses to skip the input */
5401        case 1:
5402            q -= 2;
5403            if (consumed)
5404                goto End;
5405            errmsg = "unexpected end of data";
5406            startinpos = ((const char *)q) - starts;
5407            endinpos = ((const char *)e) - starts;
5408            break;
5409        case 2:
5410            errmsg = "illegal encoding";
5411            startinpos = ((const char *)q) - 2 - starts;
5412            endinpos = startinpos + 2;
5413            break;
5414        case 3:
5415            errmsg = "illegal UTF-16 surrogate";
5416            startinpos = ((const char *)q) - 4 - starts;
5417            endinpos = startinpos + 2;
5418            break;
5419        default:
5420            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5421                goto onError;
5422            continue;
5423        }
5424
5425        if (unicode_decode_call_errorhandler_writer(
5426                errors,
5427                &errorHandler,
5428                encoding, errmsg,
5429                &starts,
5430                (const char **)&e,
5431                &startinpos,
5432                &endinpos,
5433                &exc,
5434                (const char **)&q,
5435                &writer))
5436            goto onError;
5437    }
5438
5439End:
5440    if (consumed)
5441        *consumed = (const char *)q-starts;
5442
5443    Py_XDECREF(errorHandler);
5444    Py_XDECREF(exc);
5445    return _PyUnicodeWriter_Finish(&writer);
5446
5447  onError:
5448    _PyUnicodeWriter_Dealloc(&writer);
5449    Py_XDECREF(errorHandler);
5450    Py_XDECREF(exc);
5451    return NULL;
5452}
5453
5454PyObject *
5455_PyUnicode_EncodeUTF16(PyObject *str,
5456                       const char *errors,
5457                       int byteorder)
5458{
5459    enum PyUnicode_Kind kind;
5460    const void *data;
5461    Py_ssize_t len;
5462    PyObject *v;
5463    unsigned short *out;
5464    Py_ssize_t pairs;
5465#if PY_BIG_ENDIAN
5466    int native_ordering = byteorder >= 0;
5467#else
5468    int native_ordering = byteorder <= 0;
5469#endif
5470    const char *encoding;
5471    Py_ssize_t nsize, pos;
5472    PyObject *errorHandler = NULL;
5473    PyObject *exc = NULL;
5474    PyObject *rep = NULL;
5475
5476    if (!PyUnicode_Check(str)) {
5477        PyErr_BadArgument();
5478        return NULL;
5479    }
5480    if (PyUnicode_READY(str) == -1)
5481        return NULL;
5482    kind = PyUnicode_KIND(str);
5483    data = PyUnicode_DATA(str);
5484    len = PyUnicode_GET_LENGTH(str);
5485
5486    pairs = 0;
5487    if (kind == PyUnicode_4BYTE_KIND) {
5488        const Py_UCS4 *in = (const Py_UCS4 *)data;
5489        const Py_UCS4 *end = in + len;
5490        while (in < end)
5491            if (*in++ >= 0x10000)
5492                pairs++;
5493    }
5494    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5495        return PyErr_NoMemory();
5496    nsize = len + pairs + (byteorder == 0);
5497    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5498    if (v == NULL)
5499        return NULL;
5500
5501    /* output buffer is 2-bytes aligned */
5502    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5503    out = (unsigned short *)PyBytes_AS_STRING(v);
5504    if (byteorder == 0)
5505        *out++ = 0xFEFF;
5506    if (len == 0)
5507        goto done;
5508
5509    if (kind == PyUnicode_1BYTE_KIND) {
5510        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5511        goto done;
5512    }
5513
5514    if (byteorder < 0)
5515        encoding = "utf-16-le";
5516    else if (byteorder > 0)
5517        encoding = "utf-16-be";
5518    else
5519        encoding = "utf-16";
5520
5521    pos = 0;
5522    while (pos < len) {
5523        Py_ssize_t repsize, moreunits;
5524
5525        if (kind == PyUnicode_2BYTE_KIND) {
5526            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5527                                        &out, native_ordering);
5528        }
5529        else {
5530            assert(kind == PyUnicode_4BYTE_KIND);
5531            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5532                                        &out, native_ordering);
5533        }
5534        if (pos == len)
5535            break;
5536
5537        rep = unicode_encode_call_errorhandler(
5538                errors, &errorHandler,
5539                encoding, "surrogates not allowed",
5540                str, &exc, pos, pos + 1, &pos);
5541        if (!rep)
5542            goto error;
5543
5544        if (PyBytes_Check(rep)) {
5545            repsize = PyBytes_GET_SIZE(rep);
5546            if (repsize & 1) {
5547                raise_encode_exception(&exc, encoding,
5548                                       str, pos - 1, pos,
5549                                       "surrogates not allowed");
5550                goto error;
5551            }
5552            moreunits = repsize / 2;
5553        }
5554        else {
5555            assert(PyUnicode_Check(rep));
5556            if (PyUnicode_READY(rep) < 0)
5557                goto error;
5558            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5559            if (!PyUnicode_IS_ASCII(rep)) {
5560                raise_encode_exception(&exc, encoding,
5561                                       str, pos - 1, pos,
5562                                       "surrogates not allowed");
5563                goto error;
5564            }
5565        }
5566
5567        /* two bytes are reserved for each surrogate */
5568        if (moreunits > 1) {
5569            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5570            Py_ssize_t morebytes = 2 * (moreunits - 1);
5571            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5572                /* integer overflow */
5573                PyErr_NoMemory();
5574                goto error;
5575            }
5576            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5577                goto error;
5578            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5579        }
5580
5581        if (PyBytes_Check(rep)) {
5582            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5583            out += moreunits;
5584        } else /* rep is unicode */ {
5585            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5586            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5587                                 &out, native_ordering);
5588        }
5589
5590        Py_CLEAR(rep);
5591    }
5592
5593    /* Cut back to size actually needed. This is necessary for, for example,
5594    encoding of a string containing isolated surrogates and the 'ignore' handler
5595    is used. */
5596    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5597    if (nsize != PyBytes_GET_SIZE(v))
5598      _PyBytes_Resize(&v, nsize);
5599    Py_XDECREF(errorHandler);
5600    Py_XDECREF(exc);
5601  done:
5602    return v;
5603  error:
5604    Py_XDECREF(rep);
5605    Py_XDECREF(errorHandler);
5606    Py_XDECREF(exc);
5607    Py_XDECREF(v);
5608    return NULL;
5609#undef STORECHAR
5610}
5611
5612PyObject *
5613PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5614                      Py_ssize_t size,
5615                      const char *errors,
5616                      int byteorder)
5617{
5618    PyObject *result;
5619    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5620    if (tmp == NULL)
5621        return NULL;
5622    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5623    Py_DECREF(tmp);
5624    return result;
5625}
5626
5627PyObject *
5628PyUnicode_AsUTF16String(PyObject *unicode)
5629{
5630    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5631}
5632
5633/* --- Unicode Escape Codec ----------------------------------------------- */
5634
/* Helper function for PyUnicode_DecodeUnicodeEscape: determines whether
   the escaped byte string decodes to a pure ASCII string and, if so,
   how long the decoded string is.

   s    - the escaped input bytes
   size - number of bytes available at s

   Returns the number of characters of the decoded string, or -1 when
   the result is not known to be ASCII:
     - size is negative,
     - a byte above 127 is present,
     - the input ends right after a backslash,
     - an octal, \x, \u, \U or \N escape is present (its value is not
       guaranteed to be an ASCII character).
   Unknown escapes count as two characters (backslash + character) and
   backslash-newline counts as zero characters. */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before using it in pointer arithmetic:
       "p + size" must never point in front of the buffer. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    for (; p < end; ++p) {
        if (*p > 127) {
            /* Non-ASCII */
            return -1;
        }
        else if (*p != '\\') {
            /* Normal character */
            ++length;
        }
        else {
            /* Backslash-escape, check next char */
            ++p;
            /* Escape sequence reaches till end of string or
               non-ASCII follow-up. */
            if (p >= end || *p > 127)
                return -1;
            switch (*p) {
            case '\n':
                /* backslash + newline result in zero characters */
                break;
            case '\\': case '\'': case '\"':
            case 'b': case 'f': case 't':
            case 'n': case 'r': case 'v': case 'a':
                /* simple escapes decode to exactly one ASCII character */
                ++length;
                break;
            case '0': case '1': case '2': case '3':
            case '4': case '5': case '6': case '7':
            case 'x': case 'u': case 'U': case 'N':
                /* these do not guarantee ASCII characters */
                return -1;
            default:
                /* count the backslash + the other character */
                length += 2;
            }
        }
    }
    return length;
}
5689
5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5691
5692PyObject *
5693PyUnicode_DecodeUnicodeEscape(const char *s,
5694                              Py_ssize_t size,
5695                              const char *errors)
5696{
5697    const char *starts = s;
5698    Py_ssize_t startinpos;
5699    Py_ssize_t endinpos;
5700    _PyUnicodeWriter writer;
5701    const char *end;
5702    char* message;
5703    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5704    PyObject *errorHandler = NULL;
5705    PyObject *exc = NULL;
5706    Py_ssize_t len;
5707
5708    len = length_of_escaped_ascii_string(s, size);
5709    if (len == 0)
5710        _Py_RETURN_UNICODE_EMPTY();
5711
5712    /* After length_of_escaped_ascii_string() there are two alternatives,
5713       either the string is pure ASCII with named escapes like \n, etc.
5714       and we determined it's exact size (common case)
5715       or it contains \x, \u, ... escape sequences.  then we create a
5716       legacy wchar string and resize it at the end of this function. */
5717    _PyUnicodeWriter_Init(&writer);
5718    if (len > 0) {
5719        writer.min_length = len;
5720    }
5721    else {
5722        /* Escaped strings will always be longer than the resulting
5723           Unicode string, so we start with size here and then reduce the
5724           length after conversion to the true value.
5725           (but if the error callback returns a long replacement string
5726           we'll have to allocate more space) */
5727        writer.min_length = size;
5728    }
5729
5730    if (size == 0)
5731        return _PyUnicodeWriter_Finish(&writer);
5732    end = s + size;
5733
5734    while (s < end) {
5735        unsigned char c;
5736        Py_UCS4 x;
5737        int digits;
5738
5739        /* Non-escape characters are interpreted as Unicode ordinals */
5740        if (*s != '\\') {
5741            x = (unsigned char)*s;
5742            s++;
5743            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5744                goto onError;
5745            continue;
5746        }
5747
5748        startinpos = s-starts;
5749        /* \ - Escapes */
5750        s++;
5751        c = *s++;
5752        if (s > end)
5753            c = '\0'; /* Invalid after \ */
5754
5755        switch (c) {
5756
5757            /* \x escapes */
5758#define WRITECHAR(ch)                                                      \
5759            do {                                                           \
5760                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5761                    goto onError;                                          \
5762            } while(0)
5763
5764        case '\n': break;
5765        case '\\': WRITECHAR('\\'); break;
5766        case '\'': WRITECHAR('\''); break;
5767        case '\"': WRITECHAR('\"'); break;
5768        case 'b': WRITECHAR('\b'); break;
5769        /* FF */
5770        case 'f': WRITECHAR('\014'); break;
5771        case 't': WRITECHAR('\t'); break;
5772        case 'n': WRITECHAR('\n'); break;
5773        case 'r': WRITECHAR('\r'); break;
5774        /* VT */
5775        case 'v': WRITECHAR('\013'); break;
5776        /* BEL, not classic C */
5777        case 'a': WRITECHAR('\007'); break;
5778
5779            /* \OOO (octal) escapes */
5780        case '0': case '1': case '2': case '3':
5781        case '4': case '5': case '6': case '7':
5782            x = s[-1] - '0';
5783            if (s < end && '0' <= *s && *s <= '7') {
5784                x = (x<<3) + *s++ - '0';
5785                if (s < end && '0' <= *s && *s <= '7')
5786                    x = (x<<3) + *s++ - '0';
5787            }
5788            WRITECHAR(x);
5789            break;
5790
5791            /* hex escapes */
5792            /* \xXX */
5793        case 'x':
5794            digits = 2;
5795            message = "truncated \\xXX escape";
5796            goto hexescape;
5797
5798            /* \uXXXX */
5799        case 'u':
5800            digits = 4;
5801            message = "truncated \\uXXXX escape";
5802            goto hexescape;
5803
5804            /* \UXXXXXXXX */
5805        case 'U':
5806            digits = 8;
5807            message = "truncated \\UXXXXXXXX escape";
5808        hexescape:
5809            chr = 0;
5810            if (end - s < digits) {
5811                /* count only hex digits */
5812                for (; s < end; ++s) {
5813                    c = (unsigned char)*s;
5814                    if (!Py_ISXDIGIT(c))
5815                        goto error;
5816                }
5817                goto error;
5818            }
5819            for (; digits--; ++s) {
5820                c = (unsigned char)*s;
5821                if (!Py_ISXDIGIT(c))
5822                    goto error;
5823                chr = (chr<<4) & ~0xF;
5824                if (c >= '0' && c <= '9')
5825                    chr += c - '0';
5826                else if (c >= 'a' && c <= 'f')
5827                    chr += 10 + c - 'a';
5828                else
5829                    chr += 10 + c - 'A';
5830            }
5831            if (chr == 0xffffffff && PyErr_Occurred())
5832                /* _decoding_error will have already written into the
5833                   target buffer. */
5834                break;
5835        store:
5836            /* when we get here, chr is a 32-bit unicode character */
5837            message = "illegal Unicode character";
5838            if (chr > MAX_UNICODE)
5839                goto error;
5840            WRITECHAR(chr);
5841            break;
5842
5843            /* \N{name} */
5844        case 'N':
5845            message = "malformed \\N character escape";
5846            if (ucnhash_CAPI == NULL) {
5847                /* load the unicode data module */
5848                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5849                                                PyUnicodeData_CAPSULE_NAME, 1);
5850                if (ucnhash_CAPI == NULL)
5851                    goto ucnhashError;
5852            }
5853            if (*s == '{') {
5854                const char *start = s+1;
5855                /* look for the closing brace */
5856                while (*s != '}' && s < end)
5857                    s++;
5858                if (s > start && s < end && *s == '}') {
5859                    /* found a name.  look it up in the unicode database */
5860                    message = "unknown Unicode character name";
5861                    s++;
5862                    if (s - start - 1 <= INT_MAX &&
5863                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5864                                              &chr, 0))
5865                        goto store;
5866                }
5867            }
5868            goto error;
5869
5870        default:
5871            if (s > end) {
5872                message = "\\ at end of string";
5873                s--;
5874                goto error;
5875            }
5876            else {
5877                WRITECHAR('\\');
5878                WRITECHAR((unsigned char)s[-1]);
5879            }
5880            break;
5881        }
5882        continue;
5883
5884      error:
5885        endinpos = s-starts;
5886        if (unicode_decode_call_errorhandler_writer(
5887                errors, &errorHandler,
5888                "unicodeescape", message,
5889                &starts, &end, &startinpos, &endinpos, &exc, &s,
5890                &writer))
5891            goto onError;
5892        continue;
5893    }
5894#undef WRITECHAR
5895
5896    Py_XDECREF(errorHandler);
5897    Py_XDECREF(exc);
5898    return _PyUnicodeWriter_Finish(&writer);
5899
5900  ucnhashError:
5901    PyErr_SetString(
5902        PyExc_UnicodeError,
5903        "\\N escapes not supported (can't load unicodedata module)"
5904        );
5905    _PyUnicodeWriter_Dealloc(&writer);
5906    Py_XDECREF(errorHandler);
5907    Py_XDECREF(exc);
5908    return NULL;
5909
5910  onError:
5911    _PyUnicodeWriter_Dealloc(&writer);
5912    Py_XDECREF(errorHandler);
5913    Py_XDECREF(exc);
5914    return NULL;
5915}
5916
5917/* Return a Unicode-Escape string version of the Unicode object.
5918
5919   If quotes is true, the string is enclosed in u"" or u'' quotes as
5920   appropriate.
5921
5922*/
5923
5924PyObject *
5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5926{
5927    Py_ssize_t i, len;
5928    PyObject *repr;
5929    char *p;
5930    int kind;
5931    void *data;
5932    Py_ssize_t expandsize = 0;
5933
5934    /* Initial allocation is based on the longest-possible character
5935       escape.
5936
5937       For UCS1 strings it's '\xxx', 4 bytes per source character.
5938       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5939       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5940    */
5941
5942    if (!PyUnicode_Check(unicode)) {
5943        PyErr_BadArgument();
5944        return NULL;
5945    }
5946    if (PyUnicode_READY(unicode) == -1)
5947        return NULL;
5948    len = PyUnicode_GET_LENGTH(unicode);
5949    kind = PyUnicode_KIND(unicode);
5950    data = PyUnicode_DATA(unicode);
5951    switch (kind) {
5952    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5953    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5954    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5955    }
5956
5957    if (len == 0)
5958        return PyBytes_FromStringAndSize(NULL, 0);
5959
5960    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5961        return PyErr_NoMemory();
5962
5963    repr = PyBytes_FromStringAndSize(NULL,
5964                                     2
5965                                     + expandsize*len
5966                                     + 1);
5967    if (repr == NULL)
5968        return NULL;
5969
5970    p = PyBytes_AS_STRING(repr);
5971
5972    for (i = 0; i < len; i++) {
5973        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5974
5975        /* Escape backslashes */
5976        if (ch == '\\') {
5977            *p++ = '\\';
5978            *p++ = (char) ch;
5979            continue;
5980        }
5981
5982        /* Map 21-bit characters to '\U00xxxxxx' */
5983        else if (ch >= 0x10000) {
5984            assert(ch <= MAX_UNICODE);
5985            *p++ = '\\';
5986            *p++ = 'U';
5987            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5988            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5989            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5990            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5991            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5992            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5993            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5994            *p++ = Py_hexdigits[ch & 0x0000000F];
5995            continue;
5996        }
5997
5998        /* Map 16-bit characters to '\uxxxx' */
5999        if (ch >= 256) {
6000            *p++ = '\\';
6001            *p++ = 'u';
6002            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6003            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6004            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6005            *p++ = Py_hexdigits[ch & 0x000F];
6006        }
6007
6008        /* Map special whitespace to '\t', \n', '\r' */
6009        else if (ch == '\t') {
6010            *p++ = '\\';
6011            *p++ = 't';
6012        }
6013        else if (ch == '\n') {
6014            *p++ = '\\';
6015            *p++ = 'n';
6016        }
6017        else if (ch == '\r') {
6018            *p++ = '\\';
6019            *p++ = 'r';
6020        }
6021
6022        /* Map non-printable US ASCII to '\xhh' */
6023        else if (ch < ' ' || ch >= 0x7F) {
6024            *p++ = '\\';
6025            *p++ = 'x';
6026            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6027            *p++ = Py_hexdigits[ch & 0x000F];
6028        }
6029
6030        /* Copy everything else as-is */
6031        else
6032            *p++ = (char) ch;
6033    }
6034
6035    assert(p - PyBytes_AS_STRING(repr) > 0);
6036    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037        return NULL;
6038    return repr;
6039}
6040
6041PyObject *
6042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6043                              Py_ssize_t size)
6044{
6045    PyObject *result;
6046    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6047    if (tmp == NULL)
6048        return NULL;
6049    result = PyUnicode_AsUnicodeEscapeString(tmp);
6050    Py_DECREF(tmp);
6051    return result;
6052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
6056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
6058                                 Py_ssize_t size,
6059                                 const char *errors)
6060{
6061    const char *starts = s;
6062    Py_ssize_t startinpos;
6063    Py_ssize_t endinpos;
6064    _PyUnicodeWriter writer;
6065    const char *end;
6066    const char *bs;
6067    PyObject *errorHandler = NULL;
6068    PyObject *exc = NULL;
6069
6070    if (size == 0)
6071        _Py_RETURN_UNICODE_EMPTY();
6072
6073    /* Escaped strings will always be longer than the resulting
6074       Unicode string, so we start with size here and then reduce the
6075       length after conversion to the true value. (But decoding error
6076       handler might have to resize the string) */
6077    _PyUnicodeWriter_Init(&writer);
6078    writer.min_length = size;
6079
6080    end = s + size;
6081    while (s < end) {
6082        unsigned char c;
6083        Py_UCS4 x;
6084        int i;
6085        int count;
6086
6087        /* Non-escape characters are interpreted as Unicode ordinals */
6088        if (*s != '\\') {
6089            x = (unsigned char)*s++;
6090            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6091                goto onError;
6092            continue;
6093        }
6094        startinpos = s-starts;
6095
6096        /* \u-escapes are only interpreted iff the number of leading
6097           backslashes if odd */
6098        bs = s;
6099        for (;s < end;) {
6100            if (*s != '\\')
6101                break;
6102            x = (unsigned char)*s++;
6103            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6104                goto onError;
6105        }
6106        if (((s - bs) & 1) == 0 ||
6107            s >= end ||
6108            (*s != 'u' && *s != 'U')) {
6109            continue;
6110        }
6111        writer.pos--;
6112        count = *s=='u' ? 4 : 8;
6113        s++;
6114
6115        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6116        for (x = 0, i = 0; i < count; ++i, ++s) {
6117            c = (unsigned char)*s;
6118            if (!Py_ISXDIGIT(c)) {
6119                endinpos = s-starts;
6120                if (unicode_decode_call_errorhandler_writer(
6121                        errors, &errorHandler,
6122                        "rawunicodeescape", "truncated \\uXXXX",
6123                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6124                        &writer))
6125                    goto onError;
6126                goto nextByte;
6127            }
6128            x = (x<<4) & ~0xF;
6129            if (c >= '0' && c <= '9')
6130                x += c - '0';
6131            else if (c >= 'a' && c <= 'f')
6132                x += 10 + c - 'a';
6133            else
6134                x += 10 + c - 'A';
6135        }
6136        if (x <= MAX_UNICODE) {
6137            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6138                goto onError;
6139        }
6140        else {
6141            endinpos = s-starts;
6142            if (unicode_decode_call_errorhandler_writer(
6143                    errors, &errorHandler,
6144                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6145                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6146                    &writer))
6147                goto onError;
6148        }
6149      nextByte:
6150        ;
6151    }
6152    Py_XDECREF(errorHandler);
6153    Py_XDECREF(exc);
6154    return _PyUnicodeWriter_Finish(&writer);
6155
6156  onError:
6157    _PyUnicodeWriter_Dealloc(&writer);
6158    Py_XDECREF(errorHandler);
6159    Py_XDECREF(exc);
6160    return NULL;
6161}
6162
6163
6164PyObject *
6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6166{
6167    PyObject *repr;
6168    char *p;
6169    char *q;
6170    Py_ssize_t expandsize, pos;
6171    int kind;
6172    void *data;
6173    Py_ssize_t len;
6174
6175    if (!PyUnicode_Check(unicode)) {
6176        PyErr_BadArgument();
6177        return NULL;
6178    }
6179    if (PyUnicode_READY(unicode) == -1)
6180        return NULL;
6181    kind = PyUnicode_KIND(unicode);
6182    data = PyUnicode_DATA(unicode);
6183    len = PyUnicode_GET_LENGTH(unicode);
6184    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6185       bytes, and 1 byte characters 4. */
6186    expandsize = kind * 2 + 2;
6187
6188    if (len > PY_SSIZE_T_MAX / expandsize)
6189        return PyErr_NoMemory();
6190
6191    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6192    if (repr == NULL)
6193        return NULL;
6194    if (len == 0)
6195        return repr;
6196
6197    p = q = PyBytes_AS_STRING(repr);
6198    for (pos = 0; pos < len; pos++) {
6199        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6200        /* Map 32-bit characters to '\Uxxxxxxxx' */
6201        if (ch >= 0x10000) {
6202            assert(ch <= MAX_UNICODE);
6203            *p++ = '\\';
6204            *p++ = 'U';
6205            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6206            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6207            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6208            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6209            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212            *p++ = Py_hexdigits[ch & 15];
6213        }
6214        /* Map 16-bit characters to '\uxxxx' */
6215        else if (ch >= 256) {
6216            *p++ = '\\';
6217            *p++ = 'u';
6218            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6219            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6220            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6221            *p++ = Py_hexdigits[ch & 15];
6222        }
6223        /* Copy everything else as-is */
6224        else
6225            *p++ = (char) ch;
6226    }
6227
6228    assert(p > q);
6229    if (_PyBytes_Resize(&repr, p - q) < 0)
6230        return NULL;
6231    return repr;
6232}
6233
6234PyObject *
6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6236                                 Py_ssize_t size)
6237{
6238    PyObject *result;
6239    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6240    if (tmp == NULL)
6241        return NULL;
6242    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6243    Py_DECREF(tmp);
6244    return result;
6245}
6246
6247/* --- Unicode Internal Codec ------------------------------------------- */
6248
6249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
6251                                 Py_ssize_t size,
6252                                 const char *errors)
6253{
6254    const char *starts = s;
6255    Py_ssize_t startinpos;
6256    Py_ssize_t endinpos;
6257    _PyUnicodeWriter writer;
6258    const char *end;
6259    const char *reason;
6260    PyObject *errorHandler = NULL;
6261    PyObject *exc = NULL;
6262
6263    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6264                     "unicode_internal codec has been deprecated",
6265                     1))
6266        return NULL;
6267
6268    if (size == 0)
6269        _Py_RETURN_UNICODE_EMPTY();
6270
6271    _PyUnicodeWriter_Init(&writer);
6272    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6273        PyErr_NoMemory();
6274        goto onError;
6275    }
6276    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6277
6278    end = s + size;
6279    while (s < end) {
6280        Py_UNICODE uch;
6281        Py_UCS4 ch;
6282        if (end - s < Py_UNICODE_SIZE) {
6283            endinpos = end-starts;
6284            reason = "truncated input";
6285            goto error;
6286        }
6287        /* We copy the raw representation one byte at a time because the
6288           pointer may be unaligned (see test_codeccallbacks). */
6289        ((char *) &uch)[0] = s[0];
6290        ((char *) &uch)[1] = s[1];
6291#ifdef Py_UNICODE_WIDE
6292        ((char *) &uch)[2] = s[2];
6293        ((char *) &uch)[3] = s[3];
6294#endif
6295        ch = uch;
6296#ifdef Py_UNICODE_WIDE
6297        /* We have to sanity check the raw data, otherwise doom looms for
6298           some malformed UCS-4 data. */
6299        if (ch > 0x10ffff) {
6300            endinpos = s - starts + Py_UNICODE_SIZE;
6301            reason = "illegal code point (> 0x10FFFF)";
6302            goto error;
6303        }
6304#endif
6305        s += Py_UNICODE_SIZE;
6306#ifndef Py_UNICODE_WIDE
6307        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6308        {
6309            Py_UNICODE uch2;
6310            ((char *) &uch2)[0] = s[0];
6311            ((char *) &uch2)[1] = s[1];
6312            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6313            {
6314                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6315                s += Py_UNICODE_SIZE;
6316            }
6317        }
6318#endif
6319
6320        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6321            goto onError;
6322        continue;
6323
6324  error:
6325        startinpos = s - starts;
6326        if (unicode_decode_call_errorhandler_writer(
6327                errors, &errorHandler,
6328                "unicode_internal", reason,
6329                &starts, &end, &startinpos, &endinpos, &exc, &s,
6330                &writer))
6331            goto onError;
6332    }
6333
6334    Py_XDECREF(errorHandler);
6335    Py_XDECREF(exc);
6336    return _PyUnicodeWriter_Finish(&writer);
6337
6338  onError:
6339    _PyUnicodeWriter_Dealloc(&writer);
6340    Py_XDECREF(errorHandler);
6341    Py_XDECREF(exc);
6342    return NULL;
6343}
6344
6345/* --- Latin-1 Codec ------------------------------------------------------ */
6346
6347PyObject *
6348PyUnicode_DecodeLatin1(const char *s,
6349                       Py_ssize_t size,
6350                       const char *errors)
6351{
6352    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6353    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6354}
6355
6356/* create or adjust a UnicodeEncodeError */
6357static void
6358make_encode_exception(PyObject **exceptionObject,
6359                      const char *encoding,
6360                      PyObject *unicode,
6361                      Py_ssize_t startpos, Py_ssize_t endpos,
6362                      const char *reason)
6363{
6364    if (*exceptionObject == NULL) {
6365        *exceptionObject = PyObject_CallFunction(
6366            PyExc_UnicodeEncodeError, "sOnns",
6367            encoding, unicode, startpos, endpos, reason);
6368    }
6369    else {
6370        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6371            goto onError;
6372        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6373            goto onError;
6374        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6375            goto onError;
6376        return;
6377      onError:
6378        Py_CLEAR(*exceptionObject);
6379    }
6380}
6381
6382/* raises a UnicodeEncodeError */
6383static void
6384raise_encode_exception(PyObject **exceptionObject,
6385                       const char *encoding,
6386                       PyObject *unicode,
6387                       Py_ssize_t startpos, Py_ssize_t endpos,
6388                       const char *reason)
6389{
6390    make_encode_exception(exceptionObject,
6391                          encoding, unicode, startpos, endpos, reason);
6392    if (*exceptionObject != NULL)
6393        PyCodec_StrictErrors(*exceptionObject);
6394}
6395
6396/* error handling callback helper:
6397   build arguments, call the callback and check the arguments,
6398   put the result into newpos and return the replacement string, which
6399   has to be freed by the caller */
6400static PyObject *
6401unicode_encode_call_errorhandler(const char *errors,
6402                                 PyObject **errorHandler,
6403                                 const char *encoding, const char *reason,
6404                                 PyObject *unicode, PyObject **exceptionObject,
6405                                 Py_ssize_t startpos, Py_ssize_t endpos,
6406                                 Py_ssize_t *newpos)
6407{
6408    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6409    Py_ssize_t len;
6410    PyObject *restuple;
6411    PyObject *resunicode;
6412
6413    if (*errorHandler == NULL) {
6414        *errorHandler = PyCodec_LookupError(errors);
6415        if (*errorHandler == NULL)
6416            return NULL;
6417    }
6418
6419    if (PyUnicode_READY(unicode) == -1)
6420        return NULL;
6421    len = PyUnicode_GET_LENGTH(unicode);
6422
6423    make_encode_exception(exceptionObject,
6424                          encoding, unicode, startpos, endpos, reason);
6425    if (*exceptionObject == NULL)
6426        return NULL;
6427
6428    restuple = PyObject_CallFunctionObjArgs(
6429        *errorHandler, *exceptionObject, NULL);
6430    if (restuple == NULL)
6431        return NULL;
6432    if (!PyTuple_Check(restuple)) {
6433        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6434        Py_DECREF(restuple);
6435        return NULL;
6436    }
6437    if (!PyArg_ParseTuple(restuple, argparse,
6438                          &resunicode, newpos)) {
6439        Py_DECREF(restuple);
6440        return NULL;
6441    }
6442    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6443        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6444        Py_DECREF(restuple);
6445        return NULL;
6446    }
6447    if (*newpos<0)
6448        *newpos = len + *newpos;
6449    if (*newpos<0 || *newpos>len) {
6450        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6451        Py_DECREF(restuple);
6452        return NULL;
6453    }
6454    Py_INCREF(resunicode);
6455    Py_DECREF(restuple);
6456    return resunicode;
6457}
6458
6459static PyObject *
6460unicode_encode_ucs1(PyObject *unicode,
6461                    const char *errors,
6462                    unsigned int limit)
6463{
6464    /* input state */
6465    Py_ssize_t pos=0, size;
6466    int kind;
6467    void *data;
6468    /* output object */
6469    PyObject *res;
6470    /* pointer into the output */
6471    char *str;
6472    /* current output position */
6473    Py_ssize_t ressize;
6474    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6475    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6476    PyObject *errorHandler = NULL;
6477    PyObject *exc = NULL;
6478    /* the following variable is used for caching string comparisons
6479     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6480    int known_errorHandler = -1;
6481
6482    if (PyUnicode_READY(unicode) == -1)
6483        return NULL;
6484    size = PyUnicode_GET_LENGTH(unicode);
6485    kind = PyUnicode_KIND(unicode);
6486    data = PyUnicode_DATA(unicode);
6487    /* allocate enough for a simple encoding without
6488       replacements, if we need more, we'll resize */
6489    if (size == 0)
6490        return PyBytes_FromStringAndSize(NULL, 0);
6491    res = PyBytes_FromStringAndSize(NULL, size);
6492    if (res == NULL)
6493        return NULL;
6494    str = PyBytes_AS_STRING(res);
6495    ressize = size;
6496
6497    while (pos < size) {
6498        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6499
6500        /* can we encode this? */
6501        if (c<limit) {
6502            /* no overflow check, because we know that the space is enough */
6503            *str++ = (char)c;
6504            ++pos;
6505        }
6506        else {
6507            Py_ssize_t requiredsize;
6508            PyObject *repunicode;
6509            Py_ssize_t repsize, newpos, respos, i;
6510            /* startpos for collecting unencodable chars */
6511            Py_ssize_t collstart = pos;
6512            Py_ssize_t collend = pos;
6513            /* find all unecodable characters */
6514            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6515                ++collend;
6516            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6517            if (known_errorHandler==-1) {
6518                if ((errors==NULL) || (!strcmp(errors, "strict")))
6519                    known_errorHandler = 1;
6520                else if (!strcmp(errors, "replace"))
6521                    known_errorHandler = 2;
6522                else if (!strcmp(errors, "ignore"))
6523                    known_errorHandler = 3;
6524                else if (!strcmp(errors, "xmlcharrefreplace"))
6525                    known_errorHandler = 4;
6526                else
6527                    known_errorHandler = 0;
6528            }
6529            switch (known_errorHandler) {
6530            case 1: /* strict */
6531                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6532                goto onError;
6533            case 2: /* replace */
6534                while (collstart++ < collend)
6535                    *str++ = '?'; /* fall through */
6536            case 3: /* ignore */
6537                pos = collend;
6538                break;
6539            case 4: /* xmlcharrefreplace */
6540                respos = str - PyBytes_AS_STRING(res);
6541                requiredsize = respos;
6542                /* determine replacement size */
6543                for (i = collstart; i < collend; ++i) {
6544                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6545                    Py_ssize_t incr;
6546                    if (ch < 10)
6547                        incr = 2+1+1;
6548                    else if (ch < 100)
6549                        incr = 2+2+1;
6550                    else if (ch < 1000)
6551                        incr = 2+3+1;
6552                    else if (ch < 10000)
6553                        incr = 2+4+1;
6554                    else if (ch < 100000)
6555                        incr = 2+5+1;
6556                    else if (ch < 1000000)
6557                        incr = 2+6+1;
6558                    else {
6559                        assert(ch <= MAX_UNICODE);
6560                        incr = 2+7+1;
6561                    }
6562                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6563                        goto overflow;
6564                    requiredsize += incr;
6565                }
6566                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6567                    goto overflow;
6568                requiredsize += size - collend;
6569                if (requiredsize > ressize) {
6570                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6571                        requiredsize = 2*ressize;
6572                    if (_PyBytes_Resize(&res, requiredsize))
6573                        goto onError;
6574                    str = PyBytes_AS_STRING(res) + respos;
6575                    ressize = requiredsize;
6576                }
6577                /* generate replacement */
6578                for (i = collstart; i < collend; ++i) {
6579                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6580                }
6581                pos = collend;
6582                break;
6583            default:
6584                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6585                                                              encoding, reason, unicode, &exc,
6586                                                              collstart, collend, &newpos);
6587                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6588                                           PyUnicode_READY(repunicode) == -1))
6589                    goto onError;
6590                if (PyBytes_Check(repunicode)) {
6591                    /* Directly copy bytes result to output. */
6592                    repsize = PyBytes_Size(repunicode);
6593                    if (repsize > 1) {
6594                        /* Make room for all additional bytes. */
6595                        respos = str - PyBytes_AS_STRING(res);
6596                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6597                            Py_DECREF(repunicode);
6598                            goto overflow;
6599                        }
6600                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6601                            Py_DECREF(repunicode);
6602                            goto onError;
6603                        }
6604                        str = PyBytes_AS_STRING(res) + respos;
6605                        ressize += repsize-1;
6606                    }
6607                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6608                    str += repsize;
6609                    pos = newpos;
6610                    Py_DECREF(repunicode);
6611                    break;
6612                }
6613                /* need more space? (at least enough for what we
6614                   have+the replacement+the rest of the string, so
6615                   we won't have to check space for encodable characters) */
6616                respos = str - PyBytes_AS_STRING(res);
6617                repsize = PyUnicode_GET_LENGTH(repunicode);
6618                requiredsize = respos;
6619                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6620                    goto overflow;
6621                requiredsize += repsize;
6622                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6623                    goto overflow;
6624                requiredsize += size - collend;
6625                if (requiredsize > ressize) {
6626                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6627                        requiredsize = 2*ressize;
6628                    if (_PyBytes_Resize(&res, requiredsize)) {
6629                        Py_DECREF(repunicode);
6630                        goto onError;
6631                    }
6632                    str = PyBytes_AS_STRING(res) + respos;
6633                    ressize = requiredsize;
6634                }
6635                /* check if there is anything unencodable in the replacement
6636                   and copy it to the output */
6637                for (i = 0; repsize-->0; ++i, ++str) {
6638                    c = PyUnicode_READ_CHAR(repunicode, i);
6639                    if (c >= limit) {
6640                        raise_encode_exception(&exc, encoding, unicode,
6641                                               pos, pos+1, reason);
6642                        Py_DECREF(repunicode);
6643                        goto onError;
6644                    }
6645                    *str = (char)c;
6646                }
6647                pos = newpos;
6648                Py_DECREF(repunicode);
6649            }
6650        }
6651    }
6652    /* Resize if we allocated too much */
6653    size = str - PyBytes_AS_STRING(res);
6654    if (size < ressize) { /* If this falls res will be NULL */
6655        assert(size >= 0);
6656        if (_PyBytes_Resize(&res, size) < 0)
6657            goto onError;
6658    }
6659
6660    Py_XDECREF(errorHandler);
6661    Py_XDECREF(exc);
6662    return res;
6663
6664  overflow:
6665    PyErr_SetString(PyExc_OverflowError,
6666                    "encoded result is too long for a Python string");
6667
6668  onError:
6669    Py_XDECREF(res);
6670    Py_XDECREF(errorHandler);
6671    Py_XDECREF(exc);
6672    return NULL;
6673}
6674
6675/* Deprecated */
6676PyObject *
6677PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6678                       Py_ssize_t size,
6679                       const char *errors)
6680{
6681    PyObject *result;
6682    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6683    if (unicode == NULL)
6684        return NULL;
6685    result = unicode_encode_ucs1(unicode, errors, 256);
6686    Py_DECREF(unicode);
6687    return result;
6688}
6689
6690PyObject *
6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6692{
6693    if (!PyUnicode_Check(unicode)) {
6694        PyErr_BadArgument();
6695        return NULL;
6696    }
6697    if (PyUnicode_READY(unicode) == -1)
6698        return NULL;
6699    /* Fast path: if it is a one-byte string, construct
6700       bytes object directly. */
6701    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6702        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6703                                         PyUnicode_GET_LENGTH(unicode));
6704    /* Non-Latin-1 characters present. Defer to above function to
6705       raise the exception. */
6706    return unicode_encode_ucs1(unicode, errors, 256);
6707}
6708
6709PyObject*
6710PyUnicode_AsLatin1String(PyObject *unicode)
6711{
6712    return _PyUnicode_AsLatin1String(unicode, NULL);
6713}
6714
6715/* --- 7-bit ASCII Codec -------------------------------------------------- */
6716
6717PyObject *
6718PyUnicode_DecodeASCII(const char *s,
6719                      Py_ssize_t size,
6720                      const char *errors)
6721{
6722    const char *starts = s;
6723    _PyUnicodeWriter writer;
6724    int kind;
6725    void *data;
6726    Py_ssize_t startinpos;
6727    Py_ssize_t endinpos;
6728    Py_ssize_t outpos;
6729    const char *e;
6730    PyObject *errorHandler = NULL;
6731    PyObject *exc = NULL;
6732
6733    if (size == 0)
6734        _Py_RETURN_UNICODE_EMPTY();
6735
6736    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6737    if (size == 1 && (unsigned char)s[0] < 128)
6738        return get_latin1_char((unsigned char)s[0]);
6739
6740    _PyUnicodeWriter_Init(&writer);
6741    writer.min_length = size;
6742    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6743        return NULL;
6744
6745    e = s + size;
6746    data = writer.data;
6747    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6748    writer.pos = outpos;
6749    if (writer.pos == size)
6750        return _PyUnicodeWriter_Finish(&writer);
6751
6752    s += writer.pos;
6753    kind = writer.kind;
6754    while (s < e) {
6755        unsigned char c = (unsigned char)*s;
6756        if (c < 128) {
6757            PyUnicode_WRITE(kind, data, writer.pos, c);
6758            writer.pos++;
6759            ++s;
6760        }
6761        else {
6762            startinpos = s-starts;
6763            endinpos = startinpos + 1;
6764            if (unicode_decode_call_errorhandler_writer(
6765                    errors, &errorHandler,
6766                    "ascii", "ordinal not in range(128)",
6767                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6768                    &writer))
6769                goto onError;
6770            kind = writer.kind;
6771            data = writer.data;
6772        }
6773    }
6774    Py_XDECREF(errorHandler);
6775    Py_XDECREF(exc);
6776    return _PyUnicodeWriter_Finish(&writer);
6777
6778  onError:
6779    _PyUnicodeWriter_Dealloc(&writer);
6780    Py_XDECREF(errorHandler);
6781    Py_XDECREF(exc);
6782    return NULL;
6783}
6784
6785/* Deprecated */
6786PyObject *
6787PyUnicode_EncodeASCII(const Py_UNICODE *p,
6788                      Py_ssize_t size,
6789                      const char *errors)
6790{
6791    PyObject *result;
6792    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6793    if (unicode == NULL)
6794        return NULL;
6795    result = unicode_encode_ucs1(unicode, errors, 128);
6796    Py_DECREF(unicode);
6797    return result;
6798}
6799
6800PyObject *
6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6802{
6803    if (!PyUnicode_Check(unicode)) {
6804        PyErr_BadArgument();
6805        return NULL;
6806    }
6807    if (PyUnicode_READY(unicode) == -1)
6808        return NULL;
6809    /* Fast path: if it is an ASCII-only string, construct bytes object
6810       directly. Else defer to above function to raise the exception. */
6811    if (PyUnicode_IS_ASCII(unicode))
6812        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6813                                         PyUnicode_GET_LENGTH(unicode));
6814    return unicode_encode_ucs1(unicode, errors, 128);
6815}
6816
6817PyObject *
6818PyUnicode_AsASCIIString(PyObject *unicode)
6819{
6820    return _PyUnicode_AsASCIIString(unicode, NULL);
6821}
6822
6823#ifdef HAVE_MBCS
6824
6825/* --- MBCS codecs for Windows -------------------------------------------- */
6826
6827#if SIZEOF_INT < SIZEOF_SIZE_T
6828#define NEED_RETRY
6829#endif
6830
6831#ifndef WC_ERR_INVALID_CHARS
6832#  define WC_ERR_INVALID_CHARS 0x0080
6833#endif
6834
6835static char*
6836code_page_name(UINT code_page, PyObject **obj)
6837{
6838    *obj = NULL;
6839    if (code_page == CP_ACP)
6840        return "mbcs";
6841    if (code_page == CP_UTF7)
6842        return "CP_UTF7";
6843    if (code_page == CP_UTF8)
6844        return "CP_UTF8";
6845
6846    *obj = PyBytes_FromFormat("cp%u", code_page);
6847    if (*obj == NULL)
6848        return NULL;
6849    return PyBytes_AS_STRING(*obj);
6850}
6851
6852static DWORD
6853decode_code_page_flags(UINT code_page)
6854{
6855    if (code_page == CP_UTF7) {
6856        /* The CP_UTF7 decoder only supports flags=0 */
6857        return 0;
6858    }
6859    else
6860        return MB_ERR_INVALID_CHARS;
6861}
6862
6863/*
6864 * Decode a byte string from a Windows code page into unicode object in strict
6865 * mode.
6866 *
6867 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6868 * OSError and returns -1 on other error.
6869 */
6870static int
6871decode_code_page_strict(UINT code_page,
6872                        PyObject **v,
6873                        const char *in,
6874                        int insize)
6875{
6876    const DWORD flags = decode_code_page_flags(code_page);
6877    wchar_t *out;
6878    DWORD outsize;
6879
6880    /* First get the size of the result */
6881    assert(insize > 0);
6882    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6883    if (outsize <= 0)
6884        goto error;
6885
6886    if (*v == NULL) {
6887        /* Create unicode object */
6888        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6889        *v = (PyObject*)_PyUnicode_New(outsize);
6890        if (*v == NULL)
6891            return -1;
6892        out = PyUnicode_AS_UNICODE(*v);
6893    }
6894    else {
6895        /* Extend unicode object */
6896        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6897        if (unicode_resize(v, n + outsize) < 0)
6898            return -1;
6899        out = PyUnicode_AS_UNICODE(*v) + n;
6900    }
6901
6902    /* Do the conversion */
6903    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6904    if (outsize <= 0)
6905        goto error;
6906    return insize;
6907
6908error:
6909    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6910        return -2;
6911    PyErr_SetFromWindowsErr(0);
6912    return -1;
6913}
6914
6915/*
6916 * Decode a byte string from a code page into unicode object with an error
6917 * handler.
6918 *
6919 * Returns consumed size if succeed, or raise an OSError or
6920 * UnicodeDecodeError exception and returns -1 on error.
6921 */
6922static int
6923decode_code_page_errors(UINT code_page,
6924                        PyObject **v,
6925                        const char *in, const int size,
6926                        const char *errors, int final)
6927{
6928    const char *startin = in;
6929    const char *endin = in + size;
6930    const DWORD flags = decode_code_page_flags(code_page);
6931    /* Ideally, we should get reason from FormatMessage. This is the Windows
6932       2000 English version of the message. */
6933    const char *reason = "No mapping for the Unicode character exists "
6934                         "in the target code page.";
6935    /* each step cannot decode more than 1 character, but a character can be
6936       represented as a surrogate pair */
6937    wchar_t buffer[2], *startout, *out;
6938    int insize;
6939    Py_ssize_t outsize;
6940    PyObject *errorHandler = NULL;
6941    PyObject *exc = NULL;
6942    PyObject *encoding_obj = NULL;
6943    char *encoding;
6944    DWORD err;
6945    int ret = -1;
6946
6947    assert(size > 0);
6948
6949    encoding = code_page_name(code_page, &encoding_obj);
6950    if (encoding == NULL)
6951        return -1;
6952
6953    if ((errors == NULL || strcmp(errors, "strict") == 0) && final) {
6954        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6955           UnicodeDecodeError. */
6956        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6957        if (exc != NULL) {
6958            PyCodec_StrictErrors(exc);
6959            Py_CLEAR(exc);
6960        }
6961        goto error;
6962    }
6963
6964    if (*v == NULL) {
6965        /* Create unicode object */
6966        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6967            PyErr_NoMemory();
6968            goto error;
6969        }
6970        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6971        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6972        if (*v == NULL)
6973            goto error;
6974        startout = PyUnicode_AS_UNICODE(*v);
6975    }
6976    else {
6977        /* Extend unicode object */
6978        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6979        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6980            PyErr_NoMemory();
6981            goto error;
6982        }
6983        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6984            goto error;
6985        startout = PyUnicode_AS_UNICODE(*v) + n;
6986    }
6987
6988    /* Decode the byte string character per character */
6989    out = startout;
6990    while (in < endin)
6991    {
6992        /* Decode a character */
6993        insize = 1;
6994        do
6995        {
6996            outsize = MultiByteToWideChar(code_page, flags,
6997                                          in, insize,
6998                                          buffer, Py_ARRAY_LENGTH(buffer));
6999            if (outsize > 0)
7000                break;
7001            err = GetLastError();
7002            if (err != ERROR_NO_UNICODE_TRANSLATION
7003                && err != ERROR_INSUFFICIENT_BUFFER)
7004            {
7005                PyErr_SetFromWindowsErr(0);
7006                goto error;
7007            }
7008            insize++;
7009        }
7010        /* 4=maximum length of a UTF-8 sequence */
7011        while (insize <= 4 && (in + insize) <= endin);
7012
7013        if (outsize <= 0) {
7014            Py_ssize_t startinpos, endinpos, outpos;
7015
7016            /* last character in partial decode? */
7017            if (in + insize >= endin && !final)
7018                break;
7019
7020            startinpos = in - startin;
7021            endinpos = startinpos + 1;
7022            outpos = out - PyUnicode_AS_UNICODE(*v);
7023            if (unicode_decode_call_errorhandler_wchar(
7024                    errors, &errorHandler,
7025                    encoding, reason,
7026                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7027                    v, &outpos))
7028            {
7029                goto error;
7030            }
7031            out = PyUnicode_AS_UNICODE(*v) + outpos;
7032        }
7033        else {
7034            in += insize;
7035            memcpy(out, buffer, outsize * sizeof(wchar_t));
7036            out += outsize;
7037        }
7038    }
7039
7040    /* write a NUL character at the end */
7041    *out = 0;
7042
7043    /* Extend unicode object */
7044    outsize = out - startout;
7045    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7046    if (unicode_resize(v, outsize) < 0)
7047        goto error;
7048    /* (in - startin) <= size and size is an int */
7049    ret = Py_SAFE_DOWNCAST(in - startin, Py_ssize_t, int);
7050
7051error:
7052    Py_XDECREF(encoding_obj);
7053    Py_XDECREF(errorHandler);
7054    Py_XDECREF(exc);
7055    return ret;
7056}
7057
7058static PyObject *
7059decode_code_page_stateful(int code_page,
7060                          const char *s, Py_ssize_t size,
7061                          const char *errors, Py_ssize_t *consumed)
7062{
7063    PyObject *v = NULL;
7064    int chunk_size, final, converted, done;
7065
7066    if (code_page < 0) {
7067        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7068        return NULL;
7069    }
7070
7071    if (consumed)
7072        *consumed = 0;
7073
7074    do
7075    {
7076#ifdef NEED_RETRY
7077        if (size > INT_MAX) {
7078            chunk_size = INT_MAX;
7079            final = 0;
7080            done = 0;
7081        }
7082        else
7083#endif
7084        {
7085            chunk_size = (int)size;
7086            final = (consumed == NULL);
7087            done = 1;
7088        }
7089
7090        if (chunk_size == 0 && done) {
7091            if (v != NULL)
7092                break;
7093            _Py_RETURN_UNICODE_EMPTY();
7094        }
7095
7096        converted = decode_code_page_strict(code_page, &v,
7097                                            s, chunk_size);
7098        if (converted == -2)
7099            converted = decode_code_page_errors(code_page, &v,
7100                                                s, chunk_size,
7101                                                errors, final);
7102        assert(converted != 0 || done);
7103
7104        if (converted < 0) {
7105            Py_XDECREF(v);
7106            return NULL;
7107        }
7108
7109        if (consumed)
7110            *consumed += converted;
7111
7112        s += converted;
7113        size -= converted;
7114    } while (!done);
7115
7116    return unicode_result(v);
7117}
7118
7119PyObject *
7120PyUnicode_DecodeCodePageStateful(int code_page,
7121                                 const char *s,
7122                                 Py_ssize_t size,
7123                                 const char *errors,
7124                                 Py_ssize_t *consumed)
7125{
7126    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7127}
7128
7129PyObject *
7130PyUnicode_DecodeMBCSStateful(const char *s,
7131                             Py_ssize_t size,
7132                             const char *errors,
7133                             Py_ssize_t *consumed)
7134{
7135    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7136}
7137
7138PyObject *
7139PyUnicode_DecodeMBCS(const char *s,
7140                     Py_ssize_t size,
7141                     const char *errors)
7142{
7143    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7144}
7145
7146static DWORD
7147encode_code_page_flags(UINT code_page, const char *errors)
7148{
7149    if (code_page == CP_UTF8) {
7150        if (winver.dwMajorVersion >= 6)
7151            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7152               and later */
7153            return WC_ERR_INVALID_CHARS;
7154        else
7155            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7156            return 0;
7157    }
7158    else if (code_page == CP_UTF7) {
7159        /* CP_UTF7 only supports flags=0 */
7160        return 0;
7161    }
7162    else {
7163        if (errors != NULL && strcmp(errors, "replace") == 0)
7164            return 0;
7165        else
7166            return WC_NO_BEST_FIT_CHARS;
7167    }
7168}
7169
7170/*
7171 * Encode a Unicode string to a Windows code page into a byte string in strict
7172 * mode.
7173 *
7174 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7175 * an OSError and returns -1 on other error.
7176 */
7177static int
7178encode_code_page_strict(UINT code_page, PyObject **outbytes,
7179                        PyObject *unicode, Py_ssize_t offset, int len,
7180                        const char* errors)
7181{
7182    BOOL usedDefaultChar = FALSE;
7183    BOOL *pusedDefaultChar = &usedDefaultChar;
7184    int outsize;
7185    PyObject *exc = NULL;
7186    wchar_t *p;
7187    Py_ssize_t size;
7188    const DWORD flags = encode_code_page_flags(code_page, NULL);
7189    char *out;
7190    /* Create a substring so that we can get the UTF-16 representation
7191       of just the slice under consideration. */
7192    PyObject *substring;
7193
7194    assert(len > 0);
7195
7196    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7197        pusedDefaultChar = &usedDefaultChar;
7198    else
7199        pusedDefaultChar = NULL;
7200
7201    substring = PyUnicode_Substring(unicode, offset, offset+len);
7202    if (substring == NULL)
7203        return -1;
7204    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7205    if (p == NULL) {
7206        Py_DECREF(substring);
7207        return -1;
7208    }
7209    assert(size <= INT_MAX);
7210
7211    /* First get the size of the result */
7212    outsize = WideCharToMultiByte(code_page, flags,
7213                                  p, (int)size,
7214                                  NULL, 0,
7215                                  NULL, pusedDefaultChar);
7216    if (outsize <= 0)
7217        goto error;
7218    /* If we used a default char, then we failed! */
7219    if (pusedDefaultChar && *pusedDefaultChar) {
7220        Py_DECREF(substring);
7221        return -2;
7222    }
7223
7224    if (*outbytes == NULL) {
7225        /* Create string object */
7226        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7227        if (*outbytes == NULL) {
7228            Py_DECREF(substring);
7229            return -1;
7230        }
7231        out = PyBytes_AS_STRING(*outbytes);
7232    }
7233    else {
7234        /* Extend string object */
7235        const Py_ssize_t n = PyBytes_Size(*outbytes);
7236        if (outsize > PY_SSIZE_T_MAX - n) {
7237            PyErr_NoMemory();
7238            Py_DECREF(substring);
7239            return -1;
7240        }
7241        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7242            Py_DECREF(substring);
7243            return -1;
7244        }
7245        out = PyBytes_AS_STRING(*outbytes) + n;
7246    }
7247
7248    /* Do the conversion */
7249    outsize = WideCharToMultiByte(code_page, flags,
7250                                  p, (int)size,
7251                                  out, outsize,
7252                                  NULL, pusedDefaultChar);
7253    Py_CLEAR(substring);
7254    if (outsize <= 0)
7255        goto error;
7256    if (pusedDefaultChar && *pusedDefaultChar)
7257        return -2;
7258    return 0;
7259
7260error:
7261    Py_XDECREF(substring);
7262    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7263        return -2;
7264    PyErr_SetFromWindowsErr(0);
7265    return -1;
7266}
7267
7268/*
7269 * Encode a Unicode string to a Windows code page into a byte string using a
7270 * error handler.
7271 *
7272 * Returns consumed characters if succeed, or raise an OSError and returns
7273 * -1 on other error.
7274 */
7275static int
7276encode_code_page_errors(UINT code_page, PyObject **outbytes,
7277                        PyObject *unicode, Py_ssize_t unicode_offset,
7278                        Py_ssize_t insize, const char* errors)
7279{
7280    const DWORD flags = encode_code_page_flags(code_page, errors);
7281    Py_ssize_t pos = unicode_offset;
7282    Py_ssize_t endin = unicode_offset + insize;
7283    /* Ideally, we should get reason from FormatMessage. This is the Windows
7284       2000 English version of the message. */
7285    const char *reason = "invalid character";
7286    /* 4=maximum length of a UTF-8 sequence */
7287    char buffer[4];
7288    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7289    Py_ssize_t outsize;
7290    char *out;
7291    PyObject *errorHandler = NULL;
7292    PyObject *exc = NULL;
7293    PyObject *encoding_obj = NULL;
7294    char *encoding;
7295    Py_ssize_t newpos, newoutsize;
7296    PyObject *rep;
7297    int ret = -1;
7298
7299    assert(insize > 0);
7300
7301    encoding = code_page_name(code_page, &encoding_obj);
7302    if (encoding == NULL)
7303        return -1;
7304
7305    if (errors == NULL || strcmp(errors, "strict") == 0) {
7306        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7307           then we raise a UnicodeEncodeError. */
7308        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7309        if (exc != NULL) {
7310            PyCodec_StrictErrors(exc);
7311            Py_DECREF(exc);
7312        }
7313        Py_XDECREF(encoding_obj);
7314        return -1;
7315    }
7316
7317    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7318        pusedDefaultChar = &usedDefaultChar;
7319    else
7320        pusedDefaultChar = NULL;
7321
7322    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7323        PyErr_NoMemory();
7324        goto error;
7325    }
7326    outsize = insize * Py_ARRAY_LENGTH(buffer);
7327
7328    if (*outbytes == NULL) {
7329        /* Create string object */
7330        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7331        if (*outbytes == NULL)
7332            goto error;
7333        out = PyBytes_AS_STRING(*outbytes);
7334    }
7335    else {
7336        /* Extend string object */
7337        Py_ssize_t n = PyBytes_Size(*outbytes);
7338        if (n > PY_SSIZE_T_MAX - outsize) {
7339            PyErr_NoMemory();
7340            goto error;
7341        }
7342        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7343            goto error;
7344        out = PyBytes_AS_STRING(*outbytes) + n;
7345    }
7346
7347    /* Encode the string character per character */
7348    while (pos < endin)
7349    {
7350        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7351        wchar_t chars[2];
7352        int charsize;
7353        if (ch < 0x10000) {
7354            chars[0] = (wchar_t)ch;
7355            charsize = 1;
7356        }
7357        else {
7358            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7359            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7360            charsize = 2;
7361        }
7362
7363        outsize = WideCharToMultiByte(code_page, flags,
7364                                      chars, charsize,
7365                                      buffer, Py_ARRAY_LENGTH(buffer),
7366                                      NULL, pusedDefaultChar);
7367        if (outsize > 0) {
7368            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7369            {
7370                pos++;
7371                memcpy(out, buffer, outsize);
7372                out += outsize;
7373                continue;
7374            }
7375        }
7376        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7377            PyErr_SetFromWindowsErr(0);
7378            goto error;
7379        }
7380
7381        rep = unicode_encode_call_errorhandler(
7382                  errors, &errorHandler, encoding, reason,
7383                  unicode, &exc,
7384                  pos, pos + 1, &newpos);
7385        if (rep == NULL)
7386            goto error;
7387        pos = newpos;
7388
7389        if (PyBytes_Check(rep)) {
7390            outsize = PyBytes_GET_SIZE(rep);
7391            if (outsize != 1) {
7392                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7393                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7394                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7395                    Py_DECREF(rep);
7396                    goto error;
7397                }
7398                out = PyBytes_AS_STRING(*outbytes) + offset;
7399            }
7400            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7401            out += outsize;
7402        }
7403        else {
7404            Py_ssize_t i;
7405            enum PyUnicode_Kind kind;
7406            void *data;
7407
7408            if (PyUnicode_READY(rep) == -1) {
7409                Py_DECREF(rep);
7410                goto error;
7411            }
7412
7413            outsize = PyUnicode_GET_LENGTH(rep);
7414            if (outsize != 1) {
7415                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7416                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7417                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7418                    Py_DECREF(rep);
7419                    goto error;
7420                }
7421                out = PyBytes_AS_STRING(*outbytes) + offset;
7422            }
7423            kind = PyUnicode_KIND(rep);
7424            data = PyUnicode_DATA(rep);
7425            for (i=0; i < outsize; i++) {
7426                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7427                if (ch > 127) {
7428                    raise_encode_exception(&exc,
7429                        encoding, unicode,
7430                        pos, pos + 1,
7431                        "unable to encode error handler result to ASCII");
7432                    Py_DECREF(rep);
7433                    goto error;
7434                }
7435                *out = (unsigned char)ch;
7436                out++;
7437            }
7438        }
7439        Py_DECREF(rep);
7440    }
7441    /* write a NUL byte */
7442    *out = 0;
7443    outsize = out - PyBytes_AS_STRING(*outbytes);
7444    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7445    if (_PyBytes_Resize(outbytes, outsize) < 0)
7446        goto error;
7447    ret = 0;
7448
7449error:
7450    Py_XDECREF(encoding_obj);
7451    Py_XDECREF(errorHandler);
7452    Py_XDECREF(exc);
7453    return ret;
7454}
7455
7456static PyObject *
7457encode_code_page(int code_page,
7458                 PyObject *unicode,
7459                 const char *errors)
7460{
7461    Py_ssize_t len;
7462    PyObject *outbytes = NULL;
7463    Py_ssize_t offset;
7464    int chunk_len, ret, done;
7465
7466    if (PyUnicode_READY(unicode) == -1)
7467        return NULL;
7468    len = PyUnicode_GET_LENGTH(unicode);
7469
7470    if (code_page < 0) {
7471        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7472        return NULL;
7473    }
7474
7475    if (len == 0)
7476        return PyBytes_FromStringAndSize(NULL, 0);
7477
7478    offset = 0;
7479    do
7480    {
7481#ifdef NEED_RETRY
7482        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7483           chunks. */
7484        if (len > INT_MAX/2) {
7485            chunk_len = INT_MAX/2;
7486            done = 0;
7487        }
7488        else
7489#endif
7490        {
7491            chunk_len = (int)len;
7492            done = 1;
7493        }
7494
7495        ret = encode_code_page_strict(code_page, &outbytes,
7496                                      unicode, offset, chunk_len,
7497                                      errors);
7498        if (ret == -2)
7499            ret = encode_code_page_errors(code_page, &outbytes,
7500                                          unicode, offset,
7501                                          chunk_len, errors);
7502        if (ret < 0) {
7503            Py_XDECREF(outbytes);
7504            return NULL;
7505        }
7506
7507        offset += chunk_len;
7508        len -= chunk_len;
7509    } while (!done);
7510
7511    return outbytes;
7512}
7513
7514PyObject *
7515PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7516                     Py_ssize_t size,
7517                     const char *errors)
7518{
7519    PyObject *unicode, *res;
7520    unicode = PyUnicode_FromUnicode(p, size);
7521    if (unicode == NULL)
7522        return NULL;
7523    res = encode_code_page(CP_ACP, unicode, errors);
7524    Py_DECREF(unicode);
7525    return res;
7526}
7527
7528PyObject *
7529PyUnicode_EncodeCodePage(int code_page,
7530                         PyObject *unicode,
7531                         const char *errors)
7532{
7533    return encode_code_page(code_page, unicode, errors);
7534}
7535
7536PyObject *
7537PyUnicode_AsMBCSString(PyObject *unicode)
7538{
7539    if (!PyUnicode_Check(unicode)) {
7540        PyErr_BadArgument();
7541        return NULL;
7542    }
7543    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7544}
7545
7546#undef NEED_RETRY
7547
7548#endif /* HAVE_MBCS */
7549
7550/* --- Character Mapping Codec -------------------------------------------- */
7551
7552static int
7553charmap_decode_string(const char *s,
7554                      Py_ssize_t size,
7555                      PyObject *mapping,
7556                      const char *errors,
7557                      _PyUnicodeWriter *writer)
7558{
7559    const char *starts = s;
7560    const char *e;
7561    Py_ssize_t startinpos, endinpos;
7562    PyObject *errorHandler = NULL, *exc = NULL;
7563    Py_ssize_t maplen;
7564    enum PyUnicode_Kind mapkind;
7565    void *mapdata;
7566    Py_UCS4 x;
7567    unsigned char ch;
7568
7569    if (PyUnicode_READY(mapping) == -1)
7570        return -1;
7571
7572    maplen = PyUnicode_GET_LENGTH(mapping);
7573    mapdata = PyUnicode_DATA(mapping);
7574    mapkind = PyUnicode_KIND(mapping);
7575
7576    e = s + size;
7577
7578    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7579        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7580         * is disabled in encoding aliases, latin1 is preferred because
7581         * its implementation is faster. */
7582        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7583        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7584        Py_UCS4 maxchar = writer->maxchar;
7585
7586        assert (writer->kind == PyUnicode_1BYTE_KIND);
7587        while (s < e) {
7588            ch = *s;
7589            x = mapdata_ucs1[ch];
7590            if (x > maxchar) {
7591                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7592                    goto onError;
7593                maxchar = writer->maxchar;
7594                outdata = (Py_UCS1 *)writer->data;
7595            }
7596            outdata[writer->pos] = x;
7597            writer->pos++;
7598            ++s;
7599        }
7600        return 0;
7601    }
7602
7603    while (s < e) {
7604        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7605            enum PyUnicode_Kind outkind = writer->kind;
7606            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7607            if (outkind == PyUnicode_1BYTE_KIND) {
7608                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7609                Py_UCS4 maxchar = writer->maxchar;
7610                while (s < e) {
7611                    ch = *s;
7612                    x = mapdata_ucs2[ch];
7613                    if (x > maxchar)
7614                        goto Error;
7615                    outdata[writer->pos] = x;
7616                    writer->pos++;
7617                    ++s;
7618                }
7619                break;
7620            }
7621            else if (outkind == PyUnicode_2BYTE_KIND) {
7622                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7623                while (s < e) {
7624                    ch = *s;
7625                    x = mapdata_ucs2[ch];
7626                    if (x == 0xFFFE)
7627                        goto Error;
7628                    outdata[writer->pos] = x;
7629                    writer->pos++;
7630                    ++s;
7631                }
7632                break;
7633            }
7634        }
7635        ch = *s;
7636
7637        if (ch < maplen)
7638            x = PyUnicode_READ(mapkind, mapdata, ch);
7639        else
7640            x = 0xfffe; /* invalid value */
7641Error:
7642        if (x == 0xfffe)
7643        {
7644            /* undefined mapping */
7645            startinpos = s-starts;
7646            endinpos = startinpos+1;
7647            if (unicode_decode_call_errorhandler_writer(
7648                    errors, &errorHandler,
7649                    "charmap", "character maps to <undefined>",
7650                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7651                    writer)) {
7652                goto onError;
7653            }
7654            continue;
7655        }
7656
7657        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7658            goto onError;
7659        ++s;
7660    }
7661    Py_XDECREF(errorHandler);
7662    Py_XDECREF(exc);
7663    return 0;
7664
7665onError:
7666    Py_XDECREF(errorHandler);
7667    Py_XDECREF(exc);
7668    return -1;
7669}
7670
7671static int
7672charmap_decode_mapping(const char *s,
7673                       Py_ssize_t size,
7674                       PyObject *mapping,
7675                       const char *errors,
7676                       _PyUnicodeWriter *writer)
7677{
7678    const char *starts = s;
7679    const char *e;
7680    Py_ssize_t startinpos, endinpos;
7681    PyObject *errorHandler = NULL, *exc = NULL;
7682    unsigned char ch;
7683    PyObject *key, *item = NULL;
7684
7685    e = s + size;
7686
7687    while (s < e) {
7688        ch = *s;
7689
7690        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7691        key = PyLong_FromLong((long)ch);
7692        if (key == NULL)
7693            goto onError;
7694
7695        item = PyObject_GetItem(mapping, key);
7696        Py_DECREF(key);
7697        if (item == NULL) {
7698            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7699                /* No mapping found means: mapping is undefined. */
7700                PyErr_Clear();
7701                goto Undefined;
7702            } else
7703                goto onError;
7704        }
7705
7706        /* Apply mapping */
7707        if (item == Py_None)
7708            goto Undefined;
7709        if (PyLong_Check(item)) {
7710            long value = PyLong_AS_LONG(item);
7711            if (value == 0xFFFE)
7712                goto Undefined;
7713            if (value < 0 || value > MAX_UNICODE) {
7714                PyErr_Format(PyExc_TypeError,
7715                             "character mapping must be in range(0x%lx)",
7716                             (unsigned long)MAX_UNICODE + 1);
7717                goto onError;
7718            }
7719
7720            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7721                goto onError;
7722        }
7723        else if (PyUnicode_Check(item)) {
7724            if (PyUnicode_READY(item) == -1)
7725                goto onError;
7726            if (PyUnicode_GET_LENGTH(item) == 1) {
7727                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7728                if (value == 0xFFFE)
7729                    goto Undefined;
7730                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7731                    goto onError;
7732            }
7733            else {
7734                writer->overallocate = 1;
7735                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7736                    goto onError;
7737            }
7738        }
7739        else {
7740            /* wrong return value */
7741            PyErr_SetString(PyExc_TypeError,
7742                            "character mapping must return integer, None or str");
7743            goto onError;
7744        }
7745        Py_CLEAR(item);
7746        ++s;
7747        continue;
7748
7749Undefined:
7750        /* undefined mapping */
7751        Py_CLEAR(item);
7752        startinpos = s-starts;
7753        endinpos = startinpos+1;
7754        if (unicode_decode_call_errorhandler_writer(
7755                errors, &errorHandler,
7756                "charmap", "character maps to <undefined>",
7757                &starts, &e, &startinpos, &endinpos, &exc, &s,
7758                writer)) {
7759            goto onError;
7760        }
7761    }
7762    Py_XDECREF(errorHandler);
7763    Py_XDECREF(exc);
7764    return 0;
7765
7766onError:
7767    Py_XDECREF(item);
7768    Py_XDECREF(errorHandler);
7769    Py_XDECREF(exc);
7770    return -1;
7771}
7772
7773PyObject *
7774PyUnicode_DecodeCharmap(const char *s,
7775                        Py_ssize_t size,
7776                        PyObject *mapping,
7777                        const char *errors)
7778{
7779    _PyUnicodeWriter writer;
7780
7781    /* Default to Latin-1 */
7782    if (mapping == NULL)
7783        return PyUnicode_DecodeLatin1(s, size, errors);
7784
7785    if (size == 0)
7786        _Py_RETURN_UNICODE_EMPTY();
7787    _PyUnicodeWriter_Init(&writer);
7788    writer.min_length = size;
7789    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7790        goto onError;
7791
7792    if (PyUnicode_CheckExact(mapping)) {
7793        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7794            goto onError;
7795    }
7796    else {
7797        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7798            goto onError;
7799    }
7800    return _PyUnicodeWriter_Finish(&writer);
7801
7802  onError:
7803    _PyUnicodeWriter_Dealloc(&writer);
7804    return NULL;
7805}
7806
7807/* Charmap encoding: the lookup table */
7808
7809struct encoding_map {
7810    PyObject_HEAD
7811    unsigned char level1[32];
7812    int count2, count3;
7813    unsigned char level23[1];
7814};
7815
7816static PyObject*
7817encoding_map_size(PyObject *obj, PyObject* args)
7818{
7819    struct encoding_map *map = (struct encoding_map*)obj;
7820    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7821                           128*map->count3);
7822}
7823
7824static PyMethodDef encoding_map_methods[] = {
7825    {"size", encoding_map_size, METH_NOARGS,
7826     PyDoc_STR("Return the size (in bytes) of this object") },
7827    { 0 }
7828};
7829
7830static void
7831encoding_map_dealloc(PyObject* o)
7832{
7833    PyObject_FREE(o);
7834}
7835
7836static PyTypeObject EncodingMapType = {
7837    PyVarObject_HEAD_INIT(NULL, 0)
7838    "EncodingMap",          /*tp_name*/
7839    sizeof(struct encoding_map),   /*tp_basicsize*/
7840    0,                      /*tp_itemsize*/
7841    /* methods */
7842    encoding_map_dealloc,   /*tp_dealloc*/
7843    0,                      /*tp_print*/
7844    0,                      /*tp_getattr*/
7845    0,                      /*tp_setattr*/
7846    0,                      /*tp_reserved*/
7847    0,                      /*tp_repr*/
7848    0,                      /*tp_as_number*/
7849    0,                      /*tp_as_sequence*/
7850    0,                      /*tp_as_mapping*/
7851    0,                      /*tp_hash*/
7852    0,                      /*tp_call*/
7853    0,                      /*tp_str*/
7854    0,                      /*tp_getattro*/
7855    0,                      /*tp_setattro*/
7856    0,                      /*tp_as_buffer*/
7857    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7858    0,                      /*tp_doc*/
7859    0,                      /*tp_traverse*/
7860    0,                      /*tp_clear*/
7861    0,                      /*tp_richcompare*/
7862    0,                      /*tp_weaklistoffset*/
7863    0,                      /*tp_iter*/
7864    0,                      /*tp_iternext*/
7865    encoding_map_methods,   /*tp_methods*/
7866    0,                      /*tp_members*/
7867    0,                      /*tp_getset*/
7868    0,                      /*tp_base*/
7869    0,                      /*tp_dict*/
7870    0,                      /*tp_descr_get*/
7871    0,                      /*tp_descr_set*/
7872    0,                      /*tp_dictoffset*/
7873    0,                      /*tp_init*/
7874    0,                      /*tp_alloc*/
7875    0,                      /*tp_new*/
7876    0,                      /*tp_free*/
7877    0,                      /*tp_is_gc*/
7878};
7879
7880PyObject*
7881PyUnicode_BuildEncodingMap(PyObject* string)
7882{
7883    PyObject *result;
7884    struct encoding_map *mresult;
7885    int i;
7886    int need_dict = 0;
7887    unsigned char level1[32];
7888    unsigned char level2[512];
7889    unsigned char *mlevel1, *mlevel2, *mlevel3;
7890    int count2 = 0, count3 = 0;
7891    int kind;
7892    void *data;
7893    Py_ssize_t length;
7894    Py_UCS4 ch;
7895
7896    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7897        PyErr_BadArgument();
7898        return NULL;
7899    }
7900    kind = PyUnicode_KIND(string);
7901    data = PyUnicode_DATA(string);
7902    length = PyUnicode_GET_LENGTH(string);
7903    length = Py_MIN(length, 256);
7904    memset(level1, 0xFF, sizeof level1);
7905    memset(level2, 0xFF, sizeof level2);
7906
7907    /* If there isn't a one-to-one mapping of NULL to \0,
7908       or if there are non-BMP characters, we need to use
7909       a mapping dictionary. */
7910    if (PyUnicode_READ(kind, data, 0) != 0)
7911        need_dict = 1;
7912    for (i = 1; i < length; i++) {
7913        int l1, l2;
7914        ch = PyUnicode_READ(kind, data, i);
7915        if (ch == 0 || ch > 0xFFFF) {
7916            need_dict = 1;
7917            break;
7918        }
7919        if (ch == 0xFFFE)
7920            /* unmapped character */
7921            continue;
7922        l1 = ch >> 11;
7923        l2 = ch >> 7;
7924        if (level1[l1] == 0xFF)
7925            level1[l1] = count2++;
7926        if (level2[l2] == 0xFF)
7927            level2[l2] = count3++;
7928    }
7929
7930    if (count2 >= 0xFF || count3 >= 0xFF)
7931        need_dict = 1;
7932
7933    if (need_dict) {
7934        PyObject *result = PyDict_New();
7935        PyObject *key, *value;
7936        if (!result)
7937            return NULL;
7938        for (i = 0; i < length; i++) {
7939            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7940            value = PyLong_FromLong(i);
7941            if (!key || !value)
7942                goto failed1;
7943            if (PyDict_SetItem(result, key, value) == -1)
7944                goto failed1;
7945            Py_DECREF(key);
7946            Py_DECREF(value);
7947        }
7948        return result;
7949      failed1:
7950        Py_XDECREF(key);
7951        Py_XDECREF(value);
7952        Py_DECREF(result);
7953        return NULL;
7954    }
7955
7956    /* Create a three-level trie */
7957    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7958                             16*count2 + 128*count3 - 1);
7959    if (!result)
7960        return PyErr_NoMemory();
7961    PyObject_Init(result, &EncodingMapType);
7962    mresult = (struct encoding_map*)result;
7963    mresult->count2 = count2;
7964    mresult->count3 = count3;
7965    mlevel1 = mresult->level1;
7966    mlevel2 = mresult->level23;
7967    mlevel3 = mresult->level23 + 16*count2;
7968    memcpy(mlevel1, level1, 32);
7969    memset(mlevel2, 0xFF, 16*count2);
7970    memset(mlevel3, 0, 128*count3);
7971    count3 = 0;
7972    for (i = 1; i < length; i++) {
7973        int o1, o2, o3, i2, i3;
7974        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7975        if (ch == 0xFFFE)
7976            /* unmapped character */
7977            continue;
7978        o1 = ch>>11;
7979        o2 = (ch>>7) & 0xF;
7980        i2 = 16*mlevel1[o1] + o2;
7981        if (mlevel2[i2] == 0xFF)
7982            mlevel2[i2] = count3++;
7983        o3 = ch & 0x7F;
7984        i3 = 128*mlevel2[i2] + o3;
7985        mlevel3[i3] = i;
7986    }
7987    return result;
7988}
7989
7990static int
7991encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7992{
7993    struct encoding_map *map = (struct encoding_map*)mapping;
7994    int l1 = c>>11;
7995    int l2 = (c>>7) & 0xF;
7996    int l3 = c & 0x7F;
7997    int i;
7998
7999    if (c > 0xFFFF)
8000        return -1;
8001    if (c == 0)
8002        return 0;
8003    /* level 1*/
8004    i = map->level1[l1];
8005    if (i == 0xFF) {
8006        return -1;
8007    }
8008    /* level 2*/
8009    i = map->level23[16*i+l2];
8010    if (i == 0xFF) {
8011        return -1;
8012    }
8013    /* level 3 */
8014    i = map->level23[16*map->count2 + 128*i + l3];
8015    if (i == 0) {
8016        return -1;
8017    }
8018    return i;
8019}
8020
8021/* Lookup the character ch in the mapping. If the character
8022   can't be found, Py_None is returned (or NULL, if another
8023   error occurred). */
8024static PyObject *
8025charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8026{
8027    PyObject *w = PyLong_FromLong((long)c);
8028    PyObject *x;
8029
8030    if (w == NULL)
8031        return NULL;
8032    x = PyObject_GetItem(mapping, w);
8033    Py_DECREF(w);
8034    if (x == NULL) {
8035        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8036            /* No mapping found means: mapping is undefined. */
8037            PyErr_Clear();
8038            x = Py_None;
8039            Py_INCREF(x);
8040            return x;
8041        } else
8042            return NULL;
8043    }
8044    else if (x == Py_None)
8045        return x;
8046    else if (PyLong_Check(x)) {
8047        long value = PyLong_AS_LONG(x);
8048        if (value < 0 || value > 255) {
8049            PyErr_SetString(PyExc_TypeError,
8050                            "character mapping must be in range(256)");
8051            Py_DECREF(x);
8052            return NULL;
8053        }
8054        return x;
8055    }
8056    else if (PyBytes_Check(x))
8057        return x;
8058    else {
8059        /* wrong return value */
8060        PyErr_Format(PyExc_TypeError,
8061                     "character mapping must return integer, bytes or None, not %.400s",
8062                     x->ob_type->tp_name);
8063        Py_DECREF(x);
8064        return NULL;
8065    }
8066}
8067
8068static int
8069charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8070{
8071    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8072    /* exponentially overallocate to minimize reallocations */
8073    if (requiredsize < 2*outsize)
8074        requiredsize = 2*outsize;
8075    if (_PyBytes_Resize(outobj, requiredsize))
8076        return -1;
8077    return 0;
8078}
8079
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the character has no mapping; nothing was written */
    enc_EXCEPTION   /* an error occurred and an exception is set */
} charmapencode_result;
8083/* lookup the character, put the result in the output string and adjust
8084   various state variables. Resize the output bytes object if not enough
8085   space is available. Return a new reference to the object that
8086   was put in the output buffer, or Py_None, if the mapping was undefined
8087   (in which case no character was written) or NULL, if a
8088   reallocation error occurred. The caller must decref the result */
8089static charmapencode_result
8090charmapencode_output(Py_UCS4 c, PyObject *mapping,
8091                     PyObject **outobj, Py_ssize_t *outpos)
8092{
8093    PyObject *rep;
8094    char *outstart;
8095    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8096
8097    if (Py_TYPE(mapping) == &EncodingMapType) {
8098        int res = encoding_map_lookup(c, mapping);
8099        Py_ssize_t requiredsize = *outpos+1;
8100        if (res == -1)
8101            return enc_FAILED;
8102        if (outsize<requiredsize)
8103            if (charmapencode_resize(outobj, outpos, requiredsize))
8104                return enc_EXCEPTION;
8105        outstart = PyBytes_AS_STRING(*outobj);
8106        outstart[(*outpos)++] = (char)res;
8107        return enc_SUCCESS;
8108    }
8109
8110    rep = charmapencode_lookup(c, mapping);
8111    if (rep==NULL)
8112        return enc_EXCEPTION;
8113    else if (rep==Py_None) {
8114        Py_DECREF(rep);
8115        return enc_FAILED;
8116    } else {
8117        if (PyLong_Check(rep)) {
8118            Py_ssize_t requiredsize = *outpos+1;
8119            if (outsize<requiredsize)
8120                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8121                    Py_DECREF(rep);
8122                    return enc_EXCEPTION;
8123                }
8124            outstart = PyBytes_AS_STRING(*outobj);
8125            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8126        }
8127        else {
8128            const char *repchars = PyBytes_AS_STRING(rep);
8129            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8130            Py_ssize_t requiredsize = *outpos+repsize;
8131            if (outsize<requiredsize)
8132                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8133                    Py_DECREF(rep);
8134                    return enc_EXCEPTION;
8135                }
8136            outstart = PyBytes_AS_STRING(*outobj);
8137            memcpy(outstart + *outpos, repchars, repsize);
8138            *outpos += repsize;
8139        }
8140    }
8141    Py_DECREF(rep);
8142    return enc_SUCCESS;
8143}
8144
8145/* handle an error in PyUnicode_EncodeCharmap
8146   Return 0 on success, -1 on error */
8147static int
8148charmap_encoding_error(
8149    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8150    PyObject **exceptionObject,
8151    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8152    PyObject **res, Py_ssize_t *respos)
8153{
8154    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8155    Py_ssize_t size, repsize;
8156    Py_ssize_t newpos;
8157    enum PyUnicode_Kind kind;
8158    void *data;
8159    Py_ssize_t index;
8160    /* startpos for collecting unencodable chars */
8161    Py_ssize_t collstartpos = *inpos;
8162    Py_ssize_t collendpos = *inpos+1;
8163    Py_ssize_t collpos;
8164    char *encoding = "charmap";
8165    char *reason = "character maps to <undefined>";
8166    charmapencode_result x;
8167    Py_UCS4 ch;
8168    int val;
8169
8170    if (PyUnicode_READY(unicode) == -1)
8171        return -1;
8172    size = PyUnicode_GET_LENGTH(unicode);
8173    /* find all unencodable characters */
8174    while (collendpos < size) {
8175        PyObject *rep;
8176        if (Py_TYPE(mapping) == &EncodingMapType) {
8177            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8178            val = encoding_map_lookup(ch, mapping);
8179            if (val != -1)
8180                break;
8181            ++collendpos;
8182            continue;
8183        }
8184
8185        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8186        rep = charmapencode_lookup(ch, mapping);
8187        if (rep==NULL)
8188            return -1;
8189        else if (rep!=Py_None) {
8190            Py_DECREF(rep);
8191            break;
8192        }
8193        Py_DECREF(rep);
8194        ++collendpos;
8195    }
8196    /* cache callback name lookup
8197     * (if not done yet, i.e. it's the first error) */
8198    if (*known_errorHandler==-1) {
8199        if ((errors==NULL) || (!strcmp(errors, "strict")))
8200            *known_errorHandler = 1;
8201        else if (!strcmp(errors, "replace"))
8202            *known_errorHandler = 2;
8203        else if (!strcmp(errors, "ignore"))
8204            *known_errorHandler = 3;
8205        else if (!strcmp(errors, "xmlcharrefreplace"))
8206            *known_errorHandler = 4;
8207        else
8208            *known_errorHandler = 0;
8209    }
8210    switch (*known_errorHandler) {
8211    case 1: /* strict */
8212        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8213        return -1;
8214    case 2: /* replace */
8215        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8216            x = charmapencode_output('?', mapping, res, respos);
8217            if (x==enc_EXCEPTION) {
8218                return -1;
8219            }
8220            else if (x==enc_FAILED) {
8221                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8222                return -1;
8223            }
8224        }
8225        /* fall through */
8226    case 3: /* ignore */
8227        *inpos = collendpos;
8228        break;
8229    case 4: /* xmlcharrefreplace */
8230        /* generate replacement (temporarily (mis)uses p) */
8231        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8232            char buffer[2+29+1+1];
8233            char *cp;
8234            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8235            for (cp = buffer; *cp; ++cp) {
8236                x = charmapencode_output(*cp, mapping, res, respos);
8237                if (x==enc_EXCEPTION)
8238                    return -1;
8239                else if (x==enc_FAILED) {
8240                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8241                    return -1;
8242                }
8243            }
8244        }
8245        *inpos = collendpos;
8246        break;
8247    default:
8248        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8249                                                      encoding, reason, unicode, exceptionObject,
8250                                                      collstartpos, collendpos, &newpos);
8251        if (repunicode == NULL)
8252            return -1;
8253        if (PyBytes_Check(repunicode)) {
8254            /* Directly copy bytes result to output. */
8255            Py_ssize_t outsize = PyBytes_Size(*res);
8256            Py_ssize_t requiredsize;
8257            repsize = PyBytes_Size(repunicode);
8258            requiredsize = *respos + repsize;
8259            if (requiredsize > outsize)
8260                /* Make room for all additional bytes. */
8261                if (charmapencode_resize(res, respos, requiredsize)) {
8262                    Py_DECREF(repunicode);
8263                    return -1;
8264                }
8265            memcpy(PyBytes_AsString(*res) + *respos,
8266                   PyBytes_AsString(repunicode),  repsize);
8267            *respos += repsize;
8268            *inpos = newpos;
8269            Py_DECREF(repunicode);
8270            break;
8271        }
8272        /* generate replacement  */
8273        if (PyUnicode_READY(repunicode) == -1) {
8274            Py_DECREF(repunicode);
8275            return -1;
8276        }
8277        repsize = PyUnicode_GET_LENGTH(repunicode);
8278        data = PyUnicode_DATA(repunicode);
8279        kind = PyUnicode_KIND(repunicode);
8280        for (index = 0; index < repsize; index++) {
8281            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8282            x = charmapencode_output(repch, mapping, res, respos);
8283            if (x==enc_EXCEPTION) {
8284                Py_DECREF(repunicode);
8285                return -1;
8286            }
8287            else if (x==enc_FAILED) {
8288                Py_DECREF(repunicode);
8289                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8290                return -1;
8291            }
8292        }
8293        *inpos = newpos;
8294        Py_DECREF(repunicode);
8295    }
8296    return 0;
8297}
8298
8299PyObject *
8300_PyUnicode_EncodeCharmap(PyObject *unicode,
8301                         PyObject *mapping,
8302                         const char *errors)
8303{
8304    /* output object */
8305    PyObject *res = NULL;
8306    /* current input position */
8307    Py_ssize_t inpos = 0;
8308    Py_ssize_t size;
8309    /* current output position */
8310    Py_ssize_t respos = 0;
8311    PyObject *errorHandler = NULL;
8312    PyObject *exc = NULL;
8313    /* the following variable is used for caching string comparisons
8314     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8315     * 3=ignore, 4=xmlcharrefreplace */
8316    int known_errorHandler = -1;
8317    void *data;
8318    int kind;
8319
8320    if (PyUnicode_READY(unicode) == -1)
8321        return NULL;
8322    size = PyUnicode_GET_LENGTH(unicode);
8323    data = PyUnicode_DATA(unicode);
8324    kind = PyUnicode_KIND(unicode);
8325
8326    /* Default to Latin-1 */
8327    if (mapping == NULL)
8328        return unicode_encode_ucs1(unicode, errors, 256);
8329
8330    /* allocate enough for a simple encoding without
8331       replacements, if we need more, we'll resize */
8332    res = PyBytes_FromStringAndSize(NULL, size);
8333    if (res == NULL)
8334        goto onError;
8335    if (size == 0)
8336        return res;
8337
8338    while (inpos<size) {
8339        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8340        /* try to encode it */
8341        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8342        if (x==enc_EXCEPTION) /* error */
8343            goto onError;
8344        if (x==enc_FAILED) { /* unencodable character */
8345            if (charmap_encoding_error(unicode, &inpos, mapping,
8346                                       &exc,
8347                                       &known_errorHandler, &errorHandler, errors,
8348                                       &res, &respos)) {
8349                goto onError;
8350            }
8351        }
8352        else
8353            /* done with this character => adjust input position */
8354            ++inpos;
8355    }
8356
8357    /* Resize if we allocated to much */
8358    if (respos<PyBytes_GET_SIZE(res))
8359        if (_PyBytes_Resize(&res, respos) < 0)
8360            goto onError;
8361
8362    Py_XDECREF(exc);
8363    Py_XDECREF(errorHandler);
8364    return res;
8365
8366  onError:
8367    Py_XDECREF(res);
8368    Py_XDECREF(exc);
8369    Py_XDECREF(errorHandler);
8370    return NULL;
8371}
8372
8373/* Deprecated */
8374PyObject *
8375PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8376                        Py_ssize_t size,
8377                        PyObject *mapping,
8378                        const char *errors)
8379{
8380    PyObject *result;
8381    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8382    if (unicode == NULL)
8383        return NULL;
8384    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8385    Py_DECREF(unicode);
8386    return result;
8387}
8388
8389PyObject *
8390PyUnicode_AsCharmapString(PyObject *unicode,
8391                          PyObject *mapping)
8392{
8393    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8394        PyErr_BadArgument();
8395        return NULL;
8396    }
8397    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8398}
8399
8400/* create or adjust a UnicodeTranslateError */
8401static void
8402make_translate_exception(PyObject **exceptionObject,
8403                         PyObject *unicode,
8404                         Py_ssize_t startpos, Py_ssize_t endpos,
8405                         const char *reason)
8406{
8407    if (*exceptionObject == NULL) {
8408        *exceptionObject = _PyUnicodeTranslateError_Create(
8409            unicode, startpos, endpos, reason);
8410    }
8411    else {
8412        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8413            goto onError;
8414        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8415            goto onError;
8416        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8417            goto onError;
8418        return;
8419      onError:
8420        Py_CLEAR(*exceptionObject);
8421    }
8422}
8423
8424/* error handling callback helper:
8425   build arguments, call the callback and check the arguments,
8426   put the result into newpos and return the replacement string, which
8427   has to be freed by the caller */
8428static PyObject *
8429unicode_translate_call_errorhandler(const char *errors,
8430                                    PyObject **errorHandler,
8431                                    const char *reason,
8432                                    PyObject *unicode, PyObject **exceptionObject,
8433                                    Py_ssize_t startpos, Py_ssize_t endpos,
8434                                    Py_ssize_t *newpos)
8435{
8436    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8437
8438    Py_ssize_t i_newpos;
8439    PyObject *restuple;
8440    PyObject *resunicode;
8441
8442    if (*errorHandler == NULL) {
8443        *errorHandler = PyCodec_LookupError(errors);
8444        if (*errorHandler == NULL)
8445            return NULL;
8446    }
8447
8448    make_translate_exception(exceptionObject,
8449                             unicode, startpos, endpos, reason);
8450    if (*exceptionObject == NULL)
8451        return NULL;
8452
8453    restuple = PyObject_CallFunctionObjArgs(
8454        *errorHandler, *exceptionObject, NULL);
8455    if (restuple == NULL)
8456        return NULL;
8457    if (!PyTuple_Check(restuple)) {
8458        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8459        Py_DECREF(restuple);
8460        return NULL;
8461    }
8462    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8463                          &resunicode, &i_newpos)) {
8464        Py_DECREF(restuple);
8465        return NULL;
8466    }
8467    if (i_newpos<0)
8468        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8469    else
8470        *newpos = i_newpos;
8471    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8472        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8473        Py_DECREF(restuple);
8474        return NULL;
8475    }
8476    Py_INCREF(resunicode);
8477    Py_DECREF(restuple);
8478    return resunicode;
8479}
8480
8481/* Lookup the character ch in the mapping and put the result in result,
8482   which must be decrefed by the caller.
8483   Return 0 on success, -1 on error */
8484static int
8485charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8486{
8487    PyObject *w = PyLong_FromLong((long)c);
8488    PyObject *x;
8489
8490    if (w == NULL)
8491        return -1;
8492    x = PyObject_GetItem(mapping, w);
8493    Py_DECREF(w);
8494    if (x == NULL) {
8495        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8496            /* No mapping found means: use 1:1 mapping. */
8497            PyErr_Clear();
8498            *result = NULL;
8499            return 0;
8500        } else
8501            return -1;
8502    }
8503    else if (x == Py_None) {
8504        *result = x;
8505        return 0;
8506    }
8507    else if (PyLong_Check(x)) {
8508        long value = PyLong_AS_LONG(x);
8509        if (value < 0 || value > MAX_UNICODE) {
8510            PyErr_Format(PyExc_ValueError,
8511                         "character mapping must be in range(0x%x)",
8512                         MAX_UNICODE+1);
8513            Py_DECREF(x);
8514            return -1;
8515        }
8516        *result = x;
8517        return 0;
8518    }
8519    else if (PyUnicode_Check(x)) {
8520        *result = x;
8521        return 0;
8522    }
8523    else {
8524        /* wrong return value */
8525        PyErr_SetString(PyExc_TypeError,
8526                        "character mapping must return integer, None or str");
8527        Py_DECREF(x);
8528        return -1;
8529    }
8530}
8531
8532/* lookup the character, write the result into the writer.
8533   Return 1 if the result was written into the writer, return 0 if the mapping
8534   was undefined, raise an exception return -1 on error. */
8535static int
8536charmaptranslate_output(Py_UCS4 ch, PyObject *mapping,
8537                        _PyUnicodeWriter *writer)
8538{
8539    PyObject *item;
8540
8541    if (charmaptranslate_lookup(ch, mapping, &item))
8542        return -1;
8543
8544    if (item == NULL) {
8545        /* not found => default to 1:1 mapping */
8546        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8547            return -1;
8548        }
8549        return 1;
8550    }
8551
8552    if (item == Py_None) {
8553        Py_DECREF(item);
8554        return 0;
8555    }
8556
8557    if (PyLong_Check(item)) {
8558        long ch = (Py_UCS4)PyLong_AS_LONG(item);
8559        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8560           used it */
8561        if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0) {
8562            Py_DECREF(item);
8563            return -1;
8564        }
8565        Py_DECREF(item);
8566        return 1;
8567    }
8568
8569    if (!PyUnicode_Check(item)) {
8570        Py_DECREF(item);
8571        return -1;
8572    }
8573
8574    if (_PyUnicodeWriter_WriteStr(writer, item) < 0) {
8575        Py_DECREF(item);
8576        return -1;
8577    }
8578
8579    Py_DECREF(item);
8580    return 1;
8581}
8582
8583static int
8584unicode_fast_translate_lookup(PyObject *mapping, Py_UCS1 ch,
8585                              Py_UCS1 *translate)
8586{
8587    PyObject *item = NULL;
8588    int ret = 0;
8589
8590    if (charmaptranslate_lookup(ch, mapping, &item)) {
8591        return -1;
8592    }
8593
8594    if (item == Py_None) {
8595        /* deletion */
8596        translate[ch] = 0xfe;
8597    }
8598    else if (item == NULL) {
8599        /* not found => default to 1:1 mapping */
8600        translate[ch] = ch;
8601        return 1;
8602    }
8603    else if (PyLong_Check(item)) {
8604        long replace = PyLong_AS_LONG(item);
8605        /* PyLong_AS_LONG() cannot fail, charmaptranslate_lookup() already
8606           used it */
8607        if (127 < replace) {
8608            /* invalid character or character outside ASCII:
8609               skip the fast translate */
8610            goto exit;
8611        }
8612        translate[ch] = (Py_UCS1)replace;
8613    }
8614    else if (PyUnicode_Check(item)) {
8615        Py_UCS4 replace;
8616
8617        if (PyUnicode_READY(item) == -1) {
8618            Py_DECREF(item);
8619            return -1;
8620        }
8621        if (PyUnicode_GET_LENGTH(item) != 1)
8622            goto exit;
8623
8624        replace = PyUnicode_READ_CHAR(item, 0);
8625        if (replace > 127)
8626            goto exit;
8627        translate[ch] = (Py_UCS1)replace;
8628    }
8629    else {
8630        /* not None, NULL, long or unicode */
8631        goto exit;
8632    }
8633    ret = 1;
8634
8635  exit:
8636    Py_DECREF(item);
8637    return ret;
8638}
8639
8640/* Fast path for ascii => ascii translation. Return 1 if the whole string
8641   was translated into writer, return 0 if the input string was partially
8642   translated into writer, raise an exception and return -1 on error. */
8643static int
8644unicode_fast_translate(PyObject *input, PyObject *mapping,
8645                       _PyUnicodeWriter *writer, int ignore)
8646{
8647    Py_UCS1 ascii_table[128], ch, ch2;
8648    Py_ssize_t len;
8649    Py_UCS1 *in, *end, *out;
8650    int res = 0;
8651
8652    if (PyUnicode_READY(input) == -1)
8653        return -1;
8654    if (!PyUnicode_IS_ASCII(input))
8655        return 0;
8656    len = PyUnicode_GET_LENGTH(input);
8657
8658    memset(ascii_table, 0xff, 128);
8659
8660    in = PyUnicode_1BYTE_DATA(input);
8661    end = in + len;
8662
8663    assert(PyUnicode_IS_ASCII(writer->buffer));
8664    assert(PyUnicode_GET_LENGTH(writer->buffer) == len);
8665    out = PyUnicode_1BYTE_DATA(writer->buffer);
8666
8667    for (; in < end; in++) {
8668        ch = *in;
8669        ch2 = ascii_table[ch];
8670        if (ch2 == 0xff) {
8671            int translate = unicode_fast_translate_lookup(mapping, ch,
8672                                                          ascii_table);
8673            if (translate < 0)
8674                return -1;
8675            if (translate == 0)
8676                goto exit;
8677            ch2 = ascii_table[ch];
8678        }
8679        if (ch2 == 0xfe) {
8680            if (ignore)
8681                continue;
8682            goto exit;
8683        }
8684        assert(ch2 < 128);
8685        *out = ch2;
8686        out++;
8687    }
8688    res = 1;
8689
8690exit:
8691    writer->pos = out - PyUnicode_1BYTE_DATA(writer->buffer);
8692    return res;
8693}
8694
8695PyObject *
8696_PyUnicode_TranslateCharmap(PyObject *input,
8697                            PyObject *mapping,
8698                            const char *errors)
8699{
8700    /* input object */
8701    char *data;
8702    Py_ssize_t size, i;
8703    int kind;
8704    /* output buffer */
8705    _PyUnicodeWriter writer;
8706    /* error handler */
8707    char *reason = "character maps to <undefined>";
8708    PyObject *errorHandler = NULL;
8709    PyObject *exc = NULL;
8710    int ignore;
8711    int res;
8712
8713    if (mapping == NULL) {
8714        PyErr_BadArgument();
8715        return NULL;
8716    }
8717
8718    if (PyUnicode_READY(input) == -1)
8719        return NULL;
8720    data = (char*)PyUnicode_DATA(input);
8721    kind = PyUnicode_KIND(input);
8722    size = PyUnicode_GET_LENGTH(input);
8723
8724    if (size == 0) {
8725        Py_INCREF(input);
8726        return input;
8727    }
8728
8729    /* allocate enough for a simple 1:1 translation without
8730       replacements, if we need more, we'll resize */
8731    _PyUnicodeWriter_Init(&writer);
8732    if (_PyUnicodeWriter_Prepare(&writer, size, 127) == -1)
8733        goto onError;
8734
8735    ignore = (errors != NULL && strcmp(errors, "ignore") == 0);
8736
8737    res = unicode_fast_translate(input, mapping, &writer, ignore);
8738    if (res < 0) {
8739        _PyUnicodeWriter_Dealloc(&writer);
8740        return NULL;
8741    }
8742    if (res == 1)
8743        return _PyUnicodeWriter_Finish(&writer);
8744
8745    i = writer.pos;
8746    while (i<size) {
8747        /* try to encode it */
8748        int translate;
8749        PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8750        Py_ssize_t newpos;
8751        /* startpos for collecting untranslatable chars */
8752        Py_ssize_t collstart;
8753        Py_ssize_t collend;
8754        Py_UCS4 ch;
8755
8756        ch = PyUnicode_READ(kind, data, i);
8757        translate = charmaptranslate_output(ch, mapping, &writer);
8758        if (translate < 0)
8759            goto onError;
8760
8761        if (translate != 0) {
8762            /* it worked => adjust input pointer */
8763            ++i;
8764            continue;
8765        }
8766
8767        /* untranslatable character */
8768        collstart = i;
8769        collend = i+1;
8770
8771        /* find all untranslatable characters */
8772        while (collend < size) {
8773            PyObject *x;
8774            ch = PyUnicode_READ(kind, data, collend);
8775            if (charmaptranslate_lookup(ch, mapping, &x))
8776                goto onError;
8777            Py_XDECREF(x);
8778            if (x != Py_None)
8779                break;
8780            ++collend;
8781        }
8782
8783        if (ignore) {
8784            i = collend;
8785        }
8786        else {
8787            repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8788                                                             reason, input, &exc,
8789                                                             collstart, collend, &newpos);
8790            if (repunicode == NULL)
8791                goto onError;
8792            if (_PyUnicodeWriter_WriteStr(&writer, repunicode) < 0) {
8793                Py_DECREF(repunicode);
8794                goto onError;
8795            }
8796            Py_DECREF(repunicode);
8797            i = newpos;
8798        }
8799    }
8800    Py_XDECREF(exc);
8801    Py_XDECREF(errorHandler);
8802    return _PyUnicodeWriter_Finish(&writer);
8803
8804  onError:
8805    _PyUnicodeWriter_Dealloc(&writer);
8806    Py_XDECREF(exc);
8807    Py_XDECREF(errorHandler);
8808    return NULL;
8809}
8810
8811/* Deprecated. Use PyUnicode_Translate instead. */
8812PyObject *
8813PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8814                           Py_ssize_t size,
8815                           PyObject *mapping,
8816                           const char *errors)
8817{
8818    PyObject *result;
8819    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8820    if (!unicode)
8821        return NULL;
8822    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8823    Py_DECREF(unicode);
8824    return result;
8825}
8826
8827PyObject *
8828PyUnicode_Translate(PyObject *str,
8829                    PyObject *mapping,
8830                    const char *errors)
8831{
8832    PyObject *result;
8833
8834    str = PyUnicode_FromObject(str);
8835    if (str == NULL)
8836        return NULL;
8837    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8838    Py_DECREF(str);
8839    return result;
8840}
8841
8842static Py_UCS4
8843fix_decimal_and_space_to_ascii(PyObject *self)
8844{
8845    /* No need to call PyUnicode_READY(self) because this function is only
8846       called as a callback from fixup() which does it already. */
8847    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8848    const int kind = PyUnicode_KIND(self);
8849    void *data = PyUnicode_DATA(self);
8850    Py_UCS4 maxchar = 127, ch, fixed;
8851    int modified = 0;
8852    Py_ssize_t i;
8853
8854    for (i = 0; i < len; ++i) {
8855        ch = PyUnicode_READ(kind, data, i);
8856        fixed = 0;
8857        if (ch > 127) {
8858            if (Py_UNICODE_ISSPACE(ch))
8859                fixed = ' ';
8860            else {
8861                const int decimal = Py_UNICODE_TODECIMAL(ch);
8862                if (decimal >= 0)
8863                    fixed = '0' + decimal;
8864            }
8865            if (fixed != 0) {
8866                modified = 1;
8867                maxchar = Py_MAX(maxchar, fixed);
8868                PyUnicode_WRITE(kind, data, i, fixed);
8869            }
8870            else
8871                maxchar = Py_MAX(maxchar, ch);
8872        }
8873    }
8874
8875    return (modified) ? maxchar : 0;
8876}
8877
8878PyObject *
8879_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8880{
8881    if (!PyUnicode_Check(unicode)) {
8882        PyErr_BadInternalCall();
8883        return NULL;
8884    }
8885    if (PyUnicode_READY(unicode) == -1)
8886        return NULL;
8887    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8888        /* If the string is already ASCII, just return the same string */
8889        Py_INCREF(unicode);
8890        return unicode;
8891    }
8892    return fixup(unicode, fix_decimal_and_space_to_ascii);
8893}
8894
8895PyObject *
8896PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8897                                  Py_ssize_t length)
8898{
8899    PyObject *decimal;
8900    Py_ssize_t i;
8901    Py_UCS4 maxchar;
8902    enum PyUnicode_Kind kind;
8903    void *data;
8904
8905    maxchar = 127;
8906    for (i = 0; i < length; i++) {
8907        Py_UCS4 ch = s[i];
8908        if (ch > 127) {
8909            int decimal = Py_UNICODE_TODECIMAL(ch);
8910            if (decimal >= 0)
8911                ch = '0' + decimal;
8912            maxchar = Py_MAX(maxchar, ch);
8913        }
8914    }
8915
8916    /* Copy to a new string */
8917    decimal = PyUnicode_New(length, maxchar);
8918    if (decimal == NULL)
8919        return decimal;
8920    kind = PyUnicode_KIND(decimal);
8921    data = PyUnicode_DATA(decimal);
8922    /* Iterate over code points */
8923    for (i = 0; i < length; i++) {
8924        Py_UCS4 ch = s[i];
8925        if (ch > 127) {
8926            int decimal = Py_UNICODE_TODECIMAL(ch);
8927            if (decimal >= 0)
8928                ch = '0' + decimal;
8929        }
8930        PyUnicode_WRITE(kind, data, i, ch);
8931    }
8932    return unicode_result(decimal);
8933}
8934/* --- Decimal Encoder ---------------------------------------------------- */
8935
8936int
8937PyUnicode_EncodeDecimal(Py_UNICODE *s,
8938                        Py_ssize_t length,
8939                        char *output,
8940                        const char *errors)
8941{
8942    PyObject *unicode;
8943    Py_ssize_t i;
8944    enum PyUnicode_Kind kind;
8945    void *data;
8946
8947    if (output == NULL) {
8948        PyErr_BadArgument();
8949        return -1;
8950    }
8951
8952    unicode = PyUnicode_FromUnicode(s, length);
8953    if (unicode == NULL)
8954        return -1;
8955
8956    if (PyUnicode_READY(unicode) == -1) {
8957        Py_DECREF(unicode);
8958        return -1;
8959    }
8960    kind = PyUnicode_KIND(unicode);
8961    data = PyUnicode_DATA(unicode);
8962
8963    for (i=0; i < length; ) {
8964        PyObject *exc;
8965        Py_UCS4 ch;
8966        int decimal;
8967        Py_ssize_t startpos;
8968
8969        ch = PyUnicode_READ(kind, data, i);
8970
8971        if (Py_UNICODE_ISSPACE(ch)) {
8972            *output++ = ' ';
8973            i++;
8974            continue;
8975        }
8976        decimal = Py_UNICODE_TODECIMAL(ch);
8977        if (decimal >= 0) {
8978            *output++ = '0' + decimal;
8979            i++;
8980            continue;
8981        }
8982        if (0 < ch && ch < 256) {
8983            *output++ = (char)ch;
8984            i++;
8985            continue;
8986        }
8987
8988        startpos = i;
8989        exc = NULL;
8990        raise_encode_exception(&exc, "decimal", unicode,
8991                               startpos, startpos+1,
8992                               "invalid decimal Unicode string");
8993        Py_XDECREF(exc);
8994        Py_DECREF(unicode);
8995        return -1;
8996    }
8997    /* 0-terminate the output string */
8998    *output++ = '\0';
8999    Py_DECREF(unicode);
9000    return 0;
9001}
9002
9003/* --- Helpers ------------------------------------------------------------ */
9004
9005static Py_ssize_t
9006any_find_slice(int direction, PyObject* s1, PyObject* s2,
9007               Py_ssize_t start,
9008               Py_ssize_t end)
9009{
9010    int kind1, kind2, kind;
9011    void *buf1, *buf2;
9012    Py_ssize_t len1, len2, result;
9013
9014    kind1 = PyUnicode_KIND(s1);
9015    kind2 = PyUnicode_KIND(s2);
9016    kind = kind1 > kind2 ? kind1 : kind2;
9017    buf1 = PyUnicode_DATA(s1);
9018    buf2 = PyUnicode_DATA(s2);
9019    if (kind1 != kind)
9020        buf1 = _PyUnicode_AsKind(s1, kind);
9021    if (!buf1)
9022        return -2;
9023    if (kind2 != kind)
9024        buf2 = _PyUnicode_AsKind(s2, kind);
9025    if (!buf2) {
9026        if (kind1 != kind) PyMem_Free(buf1);
9027        return -2;
9028    }
9029    len1 = PyUnicode_GET_LENGTH(s1);
9030    len2 = PyUnicode_GET_LENGTH(s2);
9031
9032    if (direction > 0) {
9033        switch (kind) {
9034        case PyUnicode_1BYTE_KIND:
9035            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9036                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9037            else
9038                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9039            break;
9040        case PyUnicode_2BYTE_KIND:
9041            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9042            break;
9043        case PyUnicode_4BYTE_KIND:
9044            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9045            break;
9046        default:
9047            assert(0); result = -2;
9048        }
9049    }
9050    else {
9051        switch (kind) {
9052        case PyUnicode_1BYTE_KIND:
9053            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9054                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9055            else
9056                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9057            break;
9058        case PyUnicode_2BYTE_KIND:
9059            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9060            break;
9061        case PyUnicode_4BYTE_KIND:
9062            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9063            break;
9064        default:
9065            assert(0); result = -2;
9066        }
9067    }
9068
9069    if (kind1 != kind)
9070        PyMem_Free(buf1);
9071    if (kind2 != kind)
9072        PyMem_Free(buf2);
9073
9074    return result;
9075}
9076
9077Py_ssize_t
9078_PyUnicode_InsertThousandsGrouping(
9079    PyObject *unicode, Py_ssize_t index,
9080    Py_ssize_t n_buffer,
9081    void *digits, Py_ssize_t n_digits,
9082    Py_ssize_t min_width,
9083    const char *grouping, PyObject *thousands_sep,
9084    Py_UCS4 *maxchar)
9085{
9086    unsigned int kind, thousands_sep_kind;
9087    char *data, *thousands_sep_data;
9088    Py_ssize_t thousands_sep_len;
9089    Py_ssize_t len;
9090
9091    if (unicode != NULL) {
9092        kind = PyUnicode_KIND(unicode);
9093        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9094    }
9095    else {
9096        kind = PyUnicode_1BYTE_KIND;
9097        data = NULL;
9098    }
9099    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9100    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9101    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9102    if (unicode != NULL && thousands_sep_kind != kind) {
9103        if (thousands_sep_kind < kind) {
9104            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9105            if (!thousands_sep_data)
9106                return -1;
9107        }
9108        else {
9109            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9110            if (!data)
9111                return -1;
9112        }
9113    }
9114
9115    switch (kind) {
9116    case PyUnicode_1BYTE_KIND:
9117        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9118            len = asciilib_InsertThousandsGrouping(
9119                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9120                min_width, grouping,
9121                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9122        else
9123            len = ucs1lib_InsertThousandsGrouping(
9124                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9125                min_width, grouping,
9126                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9127        break;
9128    case PyUnicode_2BYTE_KIND:
9129        len = ucs2lib_InsertThousandsGrouping(
9130            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9131            min_width, grouping,
9132            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9133        break;
9134    case PyUnicode_4BYTE_KIND:
9135        len = ucs4lib_InsertThousandsGrouping(
9136            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9137            min_width, grouping,
9138            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9139        break;
9140    default:
9141        assert(0);
9142        return -1;
9143    }
9144    if (unicode != NULL && thousands_sep_kind != kind) {
9145        if (thousands_sep_kind < kind)
9146            PyMem_Free(thousands_sep_data);
9147        else
9148            PyMem_Free(data);
9149    }
9150    if (unicode == NULL) {
9151        *maxchar = 127;
9152        if (len != n_digits) {
9153            *maxchar = Py_MAX(*maxchar,
9154                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9155        }
9156    }
9157    return len;
9158}
9159
9160
/* helper macro to fixup start/end slice values:
   - end greater than len is clamped to len;
   - a negative start or end is taken relative to len, and set to 0 if it
     is still negative afterwards.
   start and end must be modifiable lvalues.  Every argument is evaluated
   more than once, so pass expressions without side effects.  The arguments
   are parenthesized so that expressions (e.g. "c ? a : b") can be passed
   for len.  The macro expands to plain if statements, not to a single
   statement: do not use it as the unbraced body of an if/else or a loop. */
#define ADJUST_INDICES(start, end, len)         \
    if ((end) > (len))                          \
        (end) = (len);                          \
    else if ((end) < 0) {                       \
        (end) += (len);                         \
        if ((end) < 0)                          \
            (end) = 0;                          \
    }                                           \
    if ((start) < 0) {                          \
        (start) += (len);                       \
        if ((start) < 0)                        \
            (start) = 0;                        \
    }
9175
9176Py_ssize_t
9177PyUnicode_Count(PyObject *str,
9178                PyObject *substr,
9179                Py_ssize_t start,
9180                Py_ssize_t end)
9181{
9182    Py_ssize_t result;
9183    PyObject* str_obj;
9184    PyObject* sub_obj;
9185    int kind1, kind2, kind;
9186    void *buf1 = NULL, *buf2 = NULL;
9187    Py_ssize_t len1, len2;
9188
9189    str_obj = PyUnicode_FromObject(str);
9190    if (!str_obj)
9191        return -1;
9192    sub_obj = PyUnicode_FromObject(substr);
9193    if (!sub_obj) {
9194        Py_DECREF(str_obj);
9195        return -1;
9196    }
9197    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9198        Py_DECREF(sub_obj);
9199        Py_DECREF(str_obj);
9200        return -1;
9201    }
9202
9203    kind1 = PyUnicode_KIND(str_obj);
9204    kind2 = PyUnicode_KIND(sub_obj);
9205    kind = kind1;
9206    buf1 = PyUnicode_DATA(str_obj);
9207    buf2 = PyUnicode_DATA(sub_obj);
9208    if (kind2 != kind) {
9209        if (kind2 > kind) {
9210            Py_DECREF(sub_obj);
9211            Py_DECREF(str_obj);
9212            return 0;
9213        }
9214        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9215    }
9216    if (!buf2)
9217        goto onError;
9218    len1 = PyUnicode_GET_LENGTH(str_obj);
9219    len2 = PyUnicode_GET_LENGTH(sub_obj);
9220
9221    ADJUST_INDICES(start, end, len1);
9222    switch (kind) {
9223    case PyUnicode_1BYTE_KIND:
9224        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9225            result = asciilib_count(
9226                ((Py_UCS1*)buf1) + start, end - start,
9227                buf2, len2, PY_SSIZE_T_MAX
9228                );
9229        else
9230            result = ucs1lib_count(
9231                ((Py_UCS1*)buf1) + start, end - start,
9232                buf2, len2, PY_SSIZE_T_MAX
9233                );
9234        break;
9235    case PyUnicode_2BYTE_KIND:
9236        result = ucs2lib_count(
9237            ((Py_UCS2*)buf1) + start, end - start,
9238            buf2, len2, PY_SSIZE_T_MAX
9239            );
9240        break;
9241    case PyUnicode_4BYTE_KIND:
9242        result = ucs4lib_count(
9243            ((Py_UCS4*)buf1) + start, end - start,
9244            buf2, len2, PY_SSIZE_T_MAX
9245            );
9246        break;
9247    default:
9248        assert(0); result = 0;
9249    }
9250
9251    Py_DECREF(sub_obj);
9252    Py_DECREF(str_obj);
9253
9254    if (kind2 != kind)
9255        PyMem_Free(buf2);
9256
9257    return result;
9258  onError:
9259    Py_DECREF(sub_obj);
9260    Py_DECREF(str_obj);
9261    if (kind2 != kind && buf2)
9262        PyMem_Free(buf2);
9263    return -1;
9264}
9265
9266Py_ssize_t
9267PyUnicode_Find(PyObject *str,
9268               PyObject *sub,
9269               Py_ssize_t start,
9270               Py_ssize_t end,
9271               int direction)
9272{
9273    Py_ssize_t result;
9274
9275    str = PyUnicode_FromObject(str);
9276    if (!str)
9277        return -2;
9278    sub = PyUnicode_FromObject(sub);
9279    if (!sub) {
9280        Py_DECREF(str);
9281        return -2;
9282    }
9283    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9284        Py_DECREF(sub);
9285        Py_DECREF(str);
9286        return -2;
9287    }
9288
9289    result = any_find_slice(direction,
9290        str, sub, start, end
9291        );
9292
9293    Py_DECREF(str);
9294    Py_DECREF(sub);
9295
9296    return result;
9297}
9298
9299Py_ssize_t
9300PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9301                   Py_ssize_t start, Py_ssize_t end,
9302                   int direction)
9303{
9304    int kind;
9305    Py_ssize_t result;
9306    if (PyUnicode_READY(str) == -1)
9307        return -2;
9308    if (start < 0 || end < 0) {
9309        PyErr_SetString(PyExc_IndexError, "string index out of range");
9310        return -2;
9311    }
9312    if (end > PyUnicode_GET_LENGTH(str))
9313        end = PyUnicode_GET_LENGTH(str);
9314    kind = PyUnicode_KIND(str);
9315    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9316                      kind, end-start, ch, direction);
9317    if (result == -1)
9318        return -1;
9319    else
9320        return start + result;
9321}
9322
9323static int
9324tailmatch(PyObject *self,
9325          PyObject *substring,
9326          Py_ssize_t start,
9327          Py_ssize_t end,
9328          int direction)
9329{
9330    int kind_self;
9331    int kind_sub;
9332    void *data_self;
9333    void *data_sub;
9334    Py_ssize_t offset;
9335    Py_ssize_t i;
9336    Py_ssize_t end_sub;
9337
9338    if (PyUnicode_READY(self) == -1 ||
9339        PyUnicode_READY(substring) == -1)
9340        return -1;
9341
9342    if (PyUnicode_GET_LENGTH(substring) == 0)
9343        return 1;
9344
9345    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9346    end -= PyUnicode_GET_LENGTH(substring);
9347    if (end < start)
9348        return 0;
9349
9350    kind_self = PyUnicode_KIND(self);
9351    data_self = PyUnicode_DATA(self);
9352    kind_sub = PyUnicode_KIND(substring);
9353    data_sub = PyUnicode_DATA(substring);
9354    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9355
9356    if (direction > 0)
9357        offset = end;
9358    else
9359        offset = start;
9360
9361    if (PyUnicode_READ(kind_self, data_self, offset) ==
9362        PyUnicode_READ(kind_sub, data_sub, 0) &&
9363        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9364        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9365        /* If both are of the same kind, memcmp is sufficient */
9366        if (kind_self == kind_sub) {
9367            return ! memcmp((char *)data_self +
9368                                (offset * PyUnicode_KIND(substring)),
9369                            data_sub,
9370                            PyUnicode_GET_LENGTH(substring) *
9371                                PyUnicode_KIND(substring));
9372        }
9373        /* otherwise we have to compare each character by first accesing it */
9374        else {
9375            /* We do not need to compare 0 and len(substring)-1 because
9376               the if statement above ensured already that they are equal
9377               when we end up here. */
9378            for (i = 1; i < end_sub; ++i) {
9379                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9380                    PyUnicode_READ(kind_sub, data_sub, i))
9381                    return 0;
9382            }
9383            return 1;
9384        }
9385    }
9386
9387    return 0;
9388}
9389
9390Py_ssize_t
9391PyUnicode_Tailmatch(PyObject *str,
9392                    PyObject *substr,
9393                    Py_ssize_t start,
9394                    Py_ssize_t end,
9395                    int direction)
9396{
9397    Py_ssize_t result;
9398
9399    str = PyUnicode_FromObject(str);
9400    if (str == NULL)
9401        return -1;
9402    substr = PyUnicode_FromObject(substr);
9403    if (substr == NULL) {
9404        Py_DECREF(str);
9405        return -1;
9406    }
9407
9408    result = tailmatch(str, substr,
9409                       start, end, direction);
9410    Py_DECREF(str);
9411    Py_DECREF(substr);
9412    return result;
9413}
9414
9415/* Apply fixfct filter to the Unicode object self and return a
9416   reference to the modified object */
9417
9418static PyObject *
9419fixup(PyObject *self,
9420      Py_UCS4 (*fixfct)(PyObject *s))
9421{
9422    PyObject *u;
9423    Py_UCS4 maxchar_old, maxchar_new = 0;
9424    PyObject *v;
9425
9426    u = _PyUnicode_Copy(self);
9427    if (u == NULL)
9428        return NULL;
9429    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9430
9431    /* fix functions return the new maximum character in a string,
9432       if the kind of the resulting unicode object does not change,
9433       everything is fine.  Otherwise we need to change the string kind
9434       and re-run the fix function. */
9435    maxchar_new = fixfct(u);
9436
9437    if (maxchar_new == 0) {
9438        /* no changes */;
9439        if (PyUnicode_CheckExact(self)) {
9440            Py_DECREF(u);
9441            Py_INCREF(self);
9442            return self;
9443        }
9444        else
9445            return u;
9446    }
9447
9448    maxchar_new = align_maxchar(maxchar_new);
9449
9450    if (maxchar_new == maxchar_old)
9451        return u;
9452
9453    /* In case the maximum character changed, we need to
9454       convert the string to the new category. */
9455    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9456    if (v == NULL) {
9457        Py_DECREF(u);
9458        return NULL;
9459    }
9460    if (maxchar_new > maxchar_old) {
9461        /* If the maxchar increased so that the kind changed, not all
9462           characters are representable anymore and we need to fix the
9463           string again. This only happens in very few cases. */
9464        _PyUnicode_FastCopyCharacters(v, 0,
9465                                      self, 0, PyUnicode_GET_LENGTH(self));
9466        maxchar_old = fixfct(v);
9467        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9468    }
9469    else {
9470        _PyUnicode_FastCopyCharacters(v, 0,
9471                                      u, 0, PyUnicode_GET_LENGTH(self));
9472    }
9473    Py_DECREF(u);
9474    assert(_PyUnicode_CheckConsistency(v, 1));
9475    return v;
9476}
9477
9478static PyObject *
9479ascii_upper_or_lower(PyObject *self, int lower)
9480{
9481    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9482    char *resdata, *data = PyUnicode_DATA(self);
9483    PyObject *res;
9484
9485    res = PyUnicode_New(len, 127);
9486    if (res == NULL)
9487        return NULL;
9488    resdata = PyUnicode_DATA(res);
9489    if (lower)
9490        _Py_bytes_lower(resdata, data, len);
9491    else
9492        _Py_bytes_upper(resdata, data, len);
9493    return res;
9494}
9495
9496static Py_UCS4
9497handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9498{
9499    Py_ssize_t j;
9500    int final_sigma;
9501    Py_UCS4 c;
9502    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9503
9504     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9505
9506    where ! is a negation and \p{xxx} is a character with property xxx.
9507    */
9508    for (j = i - 1; j >= 0; j--) {
9509        c = PyUnicode_READ(kind, data, j);
9510        if (!_PyUnicode_IsCaseIgnorable(c))
9511            break;
9512    }
9513    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9514    if (final_sigma) {
9515        for (j = i + 1; j < length; j++) {
9516            c = PyUnicode_READ(kind, data, j);
9517            if (!_PyUnicode_IsCaseIgnorable(c))
9518                break;
9519        }
9520        final_sigma = j == length || !_PyUnicode_IsCased(c);
9521    }
9522    return (final_sigma) ? 0x3C2 : 0x3C3;
9523}
9524
9525static int
9526lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9527           Py_UCS4 c, Py_UCS4 *mapped)
9528{
9529    /* Obscure special case. */
9530    if (c == 0x3A3) {
9531        mapped[0] = handle_capital_sigma(kind, data, length, i);
9532        return 1;
9533    }
9534    return _PyUnicode_ToLowerFull(c, mapped);
9535}
9536
9537static Py_ssize_t
9538do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9539{
9540    Py_ssize_t i, k = 0;
9541    int n_res, j;
9542    Py_UCS4 c, mapped[3];
9543
9544    c = PyUnicode_READ(kind, data, 0);
9545    n_res = _PyUnicode_ToUpperFull(c, mapped);
9546    for (j = 0; j < n_res; j++) {
9547        *maxchar = Py_MAX(*maxchar, mapped[j]);
9548        res[k++] = mapped[j];
9549    }
9550    for (i = 1; i < length; i++) {
9551        c = PyUnicode_READ(kind, data, i);
9552        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9553        for (j = 0; j < n_res; j++) {
9554            *maxchar = Py_MAX(*maxchar, mapped[j]);
9555            res[k++] = mapped[j];
9556        }
9557    }
9558    return k;
9559}
9560
9561static Py_ssize_t
9562do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9563    Py_ssize_t i, k = 0;
9564
9565    for (i = 0; i < length; i++) {
9566        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9567        int n_res, j;
9568        if (Py_UNICODE_ISUPPER(c)) {
9569            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9570        }
9571        else if (Py_UNICODE_ISLOWER(c)) {
9572            n_res = _PyUnicode_ToUpperFull(c, mapped);
9573        }
9574        else {
9575            n_res = 1;
9576            mapped[0] = c;
9577        }
9578        for (j = 0; j < n_res; j++) {
9579            *maxchar = Py_MAX(*maxchar, mapped[j]);
9580            res[k++] = mapped[j];
9581        }
9582    }
9583    return k;
9584}
9585
9586static Py_ssize_t
9587do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9588                  Py_UCS4 *maxchar, int lower)
9589{
9590    Py_ssize_t i, k = 0;
9591
9592    for (i = 0; i < length; i++) {
9593        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9594        int n_res, j;
9595        if (lower)
9596            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9597        else
9598            n_res = _PyUnicode_ToUpperFull(c, mapped);
9599        for (j = 0; j < n_res; j++) {
9600            *maxchar = Py_MAX(*maxchar, mapped[j]);
9601            res[k++] = mapped[j];
9602        }
9603    }
9604    return k;
9605}
9606
9607static Py_ssize_t
9608do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9609{
9610    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9611}
9612
9613static Py_ssize_t
9614do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9615{
9616    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9617}
9618
9619static Py_ssize_t
9620do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9621{
9622    Py_ssize_t i, k = 0;
9623
9624    for (i = 0; i < length; i++) {
9625        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9626        Py_UCS4 mapped[3];
9627        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9628        for (j = 0; j < n_res; j++) {
9629            *maxchar = Py_MAX(*maxchar, mapped[j]);
9630            res[k++] = mapped[j];
9631        }
9632    }
9633    return k;
9634}
9635
9636static Py_ssize_t
9637do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9638{
9639    Py_ssize_t i, k = 0;
9640    int previous_is_cased;
9641
9642    previous_is_cased = 0;
9643    for (i = 0; i < length; i++) {
9644        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9645        Py_UCS4 mapped[3];
9646        int n_res, j;
9647
9648        if (previous_is_cased)
9649            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9650        else
9651            n_res = _PyUnicode_ToTitleFull(c, mapped);
9652
9653        for (j = 0; j < n_res; j++) {
9654            *maxchar = Py_MAX(*maxchar, mapped[j]);
9655            res[k++] = mapped[j];
9656        }
9657
9658        previous_is_cased = _PyUnicode_IsCased(c);
9659    }
9660    return k;
9661}
9662
9663static PyObject *
9664case_operation(PyObject *self,
9665               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9666{
9667    PyObject *res = NULL;
9668    Py_ssize_t length, newlength = 0;
9669    int kind, outkind;
9670    void *data, *outdata;
9671    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9672
9673    assert(PyUnicode_IS_READY(self));
9674
9675    kind = PyUnicode_KIND(self);
9676    data = PyUnicode_DATA(self);
9677    length = PyUnicode_GET_LENGTH(self);
9678    if (length > PY_SSIZE_T_MAX / 3 ||
9679        length > PY_SIZE_MAX / (3 * sizeof(Py_UCS4))) {
9680        PyErr_SetString(PyExc_OverflowError, "string is too long");
9681        return NULL;
9682    }
9683    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * (size_t)length);
9684    if (tmp == NULL)
9685        return PyErr_NoMemory();
9686    newlength = perform(kind, data, length, tmp, &maxchar);
9687    res = PyUnicode_New(newlength, maxchar);
9688    if (res == NULL)
9689        goto leave;
9690    tmpend = tmp + newlength;
9691    outdata = PyUnicode_DATA(res);
9692    outkind = PyUnicode_KIND(res);
9693    switch (outkind) {
9694    case PyUnicode_1BYTE_KIND:
9695        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9696        break;
9697    case PyUnicode_2BYTE_KIND:
9698        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9699        break;
9700    case PyUnicode_4BYTE_KIND:
9701        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9702        break;
9703    default:
9704        assert(0);
9705        break;
9706    }
9707  leave:
9708    PyMem_FREE(tmp);
9709    return res;
9710}
9711
9712PyObject *
9713PyUnicode_Join(PyObject *separator, PyObject *seq)
9714{
9715    PyObject *sep = NULL;
9716    Py_ssize_t seplen;
9717    PyObject *res = NULL; /* the result */
9718    PyObject *fseq;          /* PySequence_Fast(seq) */
9719    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9720    PyObject **items;
9721    PyObject *item;
9722    Py_ssize_t sz, i, res_offset;
9723    Py_UCS4 maxchar;
9724    Py_UCS4 item_maxchar;
9725    int use_memcpy;
9726    unsigned char *res_data = NULL, *sep_data = NULL;
9727    PyObject *last_obj;
9728    unsigned int kind = 0;
9729
9730    fseq = PySequence_Fast(seq, "can only join an iterable");
9731    if (fseq == NULL) {
9732        return NULL;
9733    }
9734
9735    /* NOTE: the following code can't call back into Python code,
9736     * so we are sure that fseq won't be mutated.
9737     */
9738
9739    seqlen = PySequence_Fast_GET_SIZE(fseq);
9740    /* If empty sequence, return u"". */
9741    if (seqlen == 0) {
9742        Py_DECREF(fseq);
9743        _Py_RETURN_UNICODE_EMPTY();
9744    }
9745
9746    /* If singleton sequence with an exact Unicode, return that. */
9747    last_obj = NULL;
9748    items = PySequence_Fast_ITEMS(fseq);
9749    if (seqlen == 1) {
9750        if (PyUnicode_CheckExact(items[0])) {
9751            res = items[0];
9752            Py_INCREF(res);
9753            Py_DECREF(fseq);
9754            return res;
9755        }
9756        seplen = 0;
9757        maxchar = 0;
9758    }
9759    else {
9760        /* Set up sep and seplen */
9761        if (separator == NULL) {
9762            /* fall back to a blank space separator */
9763            sep = PyUnicode_FromOrdinal(' ');
9764            if (!sep)
9765                goto onError;
9766            seplen = 1;
9767            maxchar = 32;
9768        }
9769        else {
9770            if (!PyUnicode_Check(separator)) {
9771                PyErr_Format(PyExc_TypeError,
9772                             "separator: expected str instance,"
9773                             " %.80s found",
9774                             Py_TYPE(separator)->tp_name);
9775                goto onError;
9776            }
9777            if (PyUnicode_READY(separator))
9778                goto onError;
9779            sep = separator;
9780            seplen = PyUnicode_GET_LENGTH(separator);
9781            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9782            /* inc refcount to keep this code path symmetric with the
9783               above case of a blank separator */
9784            Py_INCREF(sep);
9785        }
9786        last_obj = sep;
9787    }
9788
9789    /* There are at least two things to join, or else we have a subclass
9790     * of str in the sequence.
9791     * Do a pre-pass to figure out the total amount of space we'll
9792     * need (sz), and see whether all argument are strings.
9793     */
9794    sz = 0;
9795#ifdef Py_DEBUG
9796    use_memcpy = 0;
9797#else
9798    use_memcpy = 1;
9799#endif
9800    for (i = 0; i < seqlen; i++) {
9801        const Py_ssize_t old_sz = sz;
9802        item = items[i];
9803        if (!PyUnicode_Check(item)) {
9804            PyErr_Format(PyExc_TypeError,
9805                         "sequence item %zd: expected str instance,"
9806                         " %.80s found",
9807                         i, Py_TYPE(item)->tp_name);
9808            goto onError;
9809        }
9810        if (PyUnicode_READY(item) == -1)
9811            goto onError;
9812        sz += PyUnicode_GET_LENGTH(item);
9813        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9814        maxchar = Py_MAX(maxchar, item_maxchar);
9815        if (i != 0)
9816            sz += seplen;
9817        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9818            PyErr_SetString(PyExc_OverflowError,
9819                            "join() result is too long for a Python string");
9820            goto onError;
9821        }
9822        if (use_memcpy && last_obj != NULL) {
9823            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9824                use_memcpy = 0;
9825        }
9826        last_obj = item;
9827    }
9828
9829    res = PyUnicode_New(sz, maxchar);
9830    if (res == NULL)
9831        goto onError;
9832
9833    /* Catenate everything. */
9834#ifdef Py_DEBUG
9835    use_memcpy = 0;
9836#else
9837    if (use_memcpy) {
9838        res_data = PyUnicode_1BYTE_DATA(res);
9839        kind = PyUnicode_KIND(res);
9840        if (seplen != 0)
9841            sep_data = PyUnicode_1BYTE_DATA(sep);
9842    }
9843#endif
9844    if (use_memcpy) {
9845        for (i = 0; i < seqlen; ++i) {
9846            Py_ssize_t itemlen;
9847            item = items[i];
9848
9849            /* Copy item, and maybe the separator. */
9850            if (i && seplen != 0) {
9851                Py_MEMCPY(res_data,
9852                          sep_data,
9853                          kind * seplen);
9854                res_data += kind * seplen;
9855            }
9856
9857            itemlen = PyUnicode_GET_LENGTH(item);
9858            if (itemlen != 0) {
9859                Py_MEMCPY(res_data,
9860                          PyUnicode_DATA(item),
9861                          kind * itemlen);
9862                res_data += kind * itemlen;
9863            }
9864        }
9865        assert(res_data == PyUnicode_1BYTE_DATA(res)
9866                           + kind * PyUnicode_GET_LENGTH(res));
9867    }
9868    else {
9869        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9870            Py_ssize_t itemlen;
9871            item = items[i];
9872
9873            /* Copy item, and maybe the separator. */
9874            if (i && seplen != 0) {
9875                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9876                res_offset += seplen;
9877            }
9878
9879            itemlen = PyUnicode_GET_LENGTH(item);
9880            if (itemlen != 0) {
9881                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9882                res_offset += itemlen;
9883            }
9884        }
9885        assert(res_offset == PyUnicode_GET_LENGTH(res));
9886    }
9887
9888    Py_DECREF(fseq);
9889    Py_XDECREF(sep);
9890    assert(_PyUnicode_CheckConsistency(res, 1));
9891    return res;
9892
9893  onError:
9894    Py_DECREF(fseq);
9895    Py_XDECREF(sep);
9896    Py_XDECREF(res);
9897    return NULL;
9898}
9899
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive characters of the character
   buffer `data`, beginning at character index `start`.  `kind` selects
   the element width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND is not supported.

   Every argument is parenthesized in the expansion so that compound
   expressions (for instance a conditional expression passed as `value`)
   keep their meaning.  `value` and `length` may still be evaluated more
   than once, so they must be free of side effects. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9923
9924void
9925_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9926                    Py_UCS4 fill_char)
9927{
9928    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9929    const void *data = PyUnicode_DATA(unicode);
9930    assert(PyUnicode_IS_READY(unicode));
9931    assert(unicode_modifiable(unicode));
9932    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9933    assert(start >= 0);
9934    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9935    FILL(kind, data, fill_char, start, length);
9936}
9937
9938Py_ssize_t
9939PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9940               Py_UCS4 fill_char)
9941{
9942    Py_ssize_t maxlen;
9943
9944    if (!PyUnicode_Check(unicode)) {
9945        PyErr_BadInternalCall();
9946        return -1;
9947    }
9948    if (PyUnicode_READY(unicode) == -1)
9949        return -1;
9950    if (unicode_check_modifiable(unicode))
9951        return -1;
9952
9953    if (start < 0) {
9954        PyErr_SetString(PyExc_IndexError, "string index out of range");
9955        return -1;
9956    }
9957    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9958        PyErr_SetString(PyExc_ValueError,
9959                         "fill character is bigger than "
9960                         "the string maximum character");
9961        return -1;
9962    }
9963
9964    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9965    length = Py_MIN(maxlen, length);
9966    if (length <= 0)
9967        return 0;
9968
9969    _PyUnicode_FastFill(unicode, start, length, fill_char);
9970    return length;
9971}
9972
9973static PyObject *
9974pad(PyObject *self,
9975    Py_ssize_t left,
9976    Py_ssize_t right,
9977    Py_UCS4 fill)
9978{
9979    PyObject *u;
9980    Py_UCS4 maxchar;
9981    int kind;
9982    void *data;
9983
9984    if (left < 0)
9985        left = 0;
9986    if (right < 0)
9987        right = 0;
9988
9989    if (left == 0 && right == 0)
9990        return unicode_result_unchanged(self);
9991
9992    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9993        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9994        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9995        return NULL;
9996    }
9997    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9998    maxchar = Py_MAX(maxchar, fill);
9999    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
10000    if (!u)
10001        return NULL;
10002
10003    kind = PyUnicode_KIND(u);
10004    data = PyUnicode_DATA(u);
10005    if (left)
10006        FILL(kind, data, fill, 0, left);
10007    if (right)
10008        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
10009    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
10010    assert(_PyUnicode_CheckConsistency(u, 1));
10011    return u;
10012}
10013
10014PyObject *
10015PyUnicode_Splitlines(PyObject *string, int keepends)
10016{
10017    PyObject *list;
10018
10019    string = PyUnicode_FromObject(string);
10020    if (string == NULL)
10021        return NULL;
10022    if (PyUnicode_READY(string) == -1) {
10023        Py_DECREF(string);
10024        return NULL;
10025    }
10026
10027    switch (PyUnicode_KIND(string)) {
10028    case PyUnicode_1BYTE_KIND:
10029        if (PyUnicode_IS_ASCII(string))
10030            list = asciilib_splitlines(
10031                string, PyUnicode_1BYTE_DATA(string),
10032                PyUnicode_GET_LENGTH(string), keepends);
10033        else
10034            list = ucs1lib_splitlines(
10035                string, PyUnicode_1BYTE_DATA(string),
10036                PyUnicode_GET_LENGTH(string), keepends);
10037        break;
10038    case PyUnicode_2BYTE_KIND:
10039        list = ucs2lib_splitlines(
10040            string, PyUnicode_2BYTE_DATA(string),
10041            PyUnicode_GET_LENGTH(string), keepends);
10042        break;
10043    case PyUnicode_4BYTE_KIND:
10044        list = ucs4lib_splitlines(
10045            string, PyUnicode_4BYTE_DATA(string),
10046            PyUnicode_GET_LENGTH(string), keepends);
10047        break;
10048    default:
10049        assert(0);
10050        list = 0;
10051    }
10052    Py_DECREF(string);
10053    return list;
10054}
10055
10056static PyObject *
10057split(PyObject *self,
10058      PyObject *substring,
10059      Py_ssize_t maxcount)
10060{
10061    int kind1, kind2, kind;
10062    void *buf1, *buf2;
10063    Py_ssize_t len1, len2;
10064    PyObject* out;
10065
10066    if (maxcount < 0)
10067        maxcount = PY_SSIZE_T_MAX;
10068
10069    if (PyUnicode_READY(self) == -1)
10070        return NULL;
10071
10072    if (substring == NULL)
10073        switch (PyUnicode_KIND(self)) {
10074        case PyUnicode_1BYTE_KIND:
10075            if (PyUnicode_IS_ASCII(self))
10076                return asciilib_split_whitespace(
10077                    self,  PyUnicode_1BYTE_DATA(self),
10078                    PyUnicode_GET_LENGTH(self), maxcount
10079                    );
10080            else
10081                return ucs1lib_split_whitespace(
10082                    self,  PyUnicode_1BYTE_DATA(self),
10083                    PyUnicode_GET_LENGTH(self), maxcount
10084                    );
10085        case PyUnicode_2BYTE_KIND:
10086            return ucs2lib_split_whitespace(
10087                self,  PyUnicode_2BYTE_DATA(self),
10088                PyUnicode_GET_LENGTH(self), maxcount
10089                );
10090        case PyUnicode_4BYTE_KIND:
10091            return ucs4lib_split_whitespace(
10092                self,  PyUnicode_4BYTE_DATA(self),
10093                PyUnicode_GET_LENGTH(self), maxcount
10094                );
10095        default:
10096            assert(0);
10097            return NULL;
10098        }
10099
10100    if (PyUnicode_READY(substring) == -1)
10101        return NULL;
10102
10103    kind1 = PyUnicode_KIND(self);
10104    kind2 = PyUnicode_KIND(substring);
10105    kind = kind1 > kind2 ? kind1 : kind2;
10106    buf1 = PyUnicode_DATA(self);
10107    buf2 = PyUnicode_DATA(substring);
10108    if (kind1 != kind)
10109        buf1 = _PyUnicode_AsKind(self, kind);
10110    if (!buf1)
10111        return NULL;
10112    if (kind2 != kind)
10113        buf2 = _PyUnicode_AsKind(substring, kind);
10114    if (!buf2) {
10115        if (kind1 != kind) PyMem_Free(buf1);
10116        return NULL;
10117    }
10118    len1 = PyUnicode_GET_LENGTH(self);
10119    len2 = PyUnicode_GET_LENGTH(substring);
10120
10121    switch (kind) {
10122    case PyUnicode_1BYTE_KIND:
10123        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10124            out = asciilib_split(
10125                self,  buf1, len1, buf2, len2, maxcount);
10126        else
10127            out = ucs1lib_split(
10128                self,  buf1, len1, buf2, len2, maxcount);
10129        break;
10130    case PyUnicode_2BYTE_KIND:
10131        out = ucs2lib_split(
10132            self,  buf1, len1, buf2, len2, maxcount);
10133        break;
10134    case PyUnicode_4BYTE_KIND:
10135        out = ucs4lib_split(
10136            self,  buf1, len1, buf2, len2, maxcount);
10137        break;
10138    default:
10139        out = NULL;
10140    }
10141    if (kind1 != kind)
10142        PyMem_Free(buf1);
10143    if (kind2 != kind)
10144        PyMem_Free(buf2);
10145    return out;
10146}
10147
10148static PyObject *
10149rsplit(PyObject *self,
10150       PyObject *substring,
10151       Py_ssize_t maxcount)
10152{
10153    int kind1, kind2, kind;
10154    void *buf1, *buf2;
10155    Py_ssize_t len1, len2;
10156    PyObject* out;
10157
10158    if (maxcount < 0)
10159        maxcount = PY_SSIZE_T_MAX;
10160
10161    if (PyUnicode_READY(self) == -1)
10162        return NULL;
10163
10164    if (substring == NULL)
10165        switch (PyUnicode_KIND(self)) {
10166        case PyUnicode_1BYTE_KIND:
10167            if (PyUnicode_IS_ASCII(self))
10168                return asciilib_rsplit_whitespace(
10169                    self,  PyUnicode_1BYTE_DATA(self),
10170                    PyUnicode_GET_LENGTH(self), maxcount
10171                    );
10172            else
10173                return ucs1lib_rsplit_whitespace(
10174                    self,  PyUnicode_1BYTE_DATA(self),
10175                    PyUnicode_GET_LENGTH(self), maxcount
10176                    );
10177        case PyUnicode_2BYTE_KIND:
10178            return ucs2lib_rsplit_whitespace(
10179                self,  PyUnicode_2BYTE_DATA(self),
10180                PyUnicode_GET_LENGTH(self), maxcount
10181                );
10182        case PyUnicode_4BYTE_KIND:
10183            return ucs4lib_rsplit_whitespace(
10184                self,  PyUnicode_4BYTE_DATA(self),
10185                PyUnicode_GET_LENGTH(self), maxcount
10186                );
10187        default:
10188            assert(0);
10189            return NULL;
10190        }
10191
10192    if (PyUnicode_READY(substring) == -1)
10193        return NULL;
10194
10195    kind1 = PyUnicode_KIND(self);
10196    kind2 = PyUnicode_KIND(substring);
10197    kind = kind1 > kind2 ? kind1 : kind2;
10198    buf1 = PyUnicode_DATA(self);
10199    buf2 = PyUnicode_DATA(substring);
10200    if (kind1 != kind)
10201        buf1 = _PyUnicode_AsKind(self, kind);
10202    if (!buf1)
10203        return NULL;
10204    if (kind2 != kind)
10205        buf2 = _PyUnicode_AsKind(substring, kind);
10206    if (!buf2) {
10207        if (kind1 != kind) PyMem_Free(buf1);
10208        return NULL;
10209    }
10210    len1 = PyUnicode_GET_LENGTH(self);
10211    len2 = PyUnicode_GET_LENGTH(substring);
10212
10213    switch (kind) {
10214    case PyUnicode_1BYTE_KIND:
10215        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10216            out = asciilib_rsplit(
10217                self,  buf1, len1, buf2, len2, maxcount);
10218        else
10219            out = ucs1lib_rsplit(
10220                self,  buf1, len1, buf2, len2, maxcount);
10221        break;
10222    case PyUnicode_2BYTE_KIND:
10223        out = ucs2lib_rsplit(
10224            self,  buf1, len1, buf2, len2, maxcount);
10225        break;
10226    case PyUnicode_4BYTE_KIND:
10227        out = ucs4lib_rsplit(
10228            self,  buf1, len1, buf2, len2, maxcount);
10229        break;
10230    default:
10231        out = NULL;
10232    }
10233    if (kind1 != kind)
10234        PyMem_Free(buf1);
10235    if (kind2 != kind)
10236        PyMem_Free(buf2);
10237    return out;
10238}
10239
10240static Py_ssize_t
10241anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10242            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10243{
10244    switch (kind) {
10245    case PyUnicode_1BYTE_KIND:
10246        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10247            return asciilib_find(buf1, len1, buf2, len2, offset);
10248        else
10249            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10250    case PyUnicode_2BYTE_KIND:
10251        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10252    case PyUnicode_4BYTE_KIND:
10253        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10254    }
10255    assert(0);
10256    return -1;
10257}
10258
10259static Py_ssize_t
10260anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10261             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10262{
10263    switch (kind) {
10264    case PyUnicode_1BYTE_KIND:
10265        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10266            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10267        else
10268            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10269    case PyUnicode_2BYTE_KIND:
10270        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10271    case PyUnicode_4BYTE_KIND:
10272        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10273    }
10274    assert(0);
10275    return 0;
10276}
10277
10278static void
10279replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10280                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10281{
10282    int kind = PyUnicode_KIND(u);
10283    void *data = PyUnicode_DATA(u);
10284    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10285    if (kind == PyUnicode_1BYTE_KIND) {
10286        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10287                                      (Py_UCS1 *)data + len,
10288                                      u1, u2, maxcount);
10289    }
10290    else if (kind == PyUnicode_2BYTE_KIND) {
10291        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10292                                      (Py_UCS2 *)data + len,
10293                                      u1, u2, maxcount);
10294    }
10295    else {
10296        assert(kind == PyUnicode_4BYTE_KIND);
10297        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10298                                      (Py_UCS4 *)data + len,
10299                                      u1, u2, maxcount);
10300    }
10301}
10302
10303static PyObject *
10304replace(PyObject *self, PyObject *str1,
10305        PyObject *str2, Py_ssize_t maxcount)
10306{
10307    PyObject *u;
10308    char *sbuf = PyUnicode_DATA(self);
10309    char *buf1 = PyUnicode_DATA(str1);
10310    char *buf2 = PyUnicode_DATA(str2);
10311    int srelease = 0, release1 = 0, release2 = 0;
10312    int skind = PyUnicode_KIND(self);
10313    int kind1 = PyUnicode_KIND(str1);
10314    int kind2 = PyUnicode_KIND(str2);
10315    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10316    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10317    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10318    int mayshrink;
10319    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10320
10321    if (maxcount < 0)
10322        maxcount = PY_SSIZE_T_MAX;
10323    else if (maxcount == 0 || slen == 0)
10324        goto nothing;
10325
10326    if (str1 == str2)
10327        goto nothing;
10328
10329    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10330    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10331    if (maxchar < maxchar_str1)
10332        /* substring too wide to be present */
10333        goto nothing;
10334    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10335    /* Replacing str1 with str2 may cause a maxchar reduction in the
10336       result string. */
10337    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10338    maxchar = Py_MAX(maxchar, maxchar_str2);
10339
10340    if (len1 == len2) {
10341        /* same length */
10342        if (len1 == 0)
10343            goto nothing;
10344        if (len1 == 1) {
10345            /* replace characters */
10346            Py_UCS4 u1, u2;
10347            Py_ssize_t pos;
10348
10349            u1 = PyUnicode_READ(kind1, buf1, 0);
10350            pos = findchar(sbuf, skind, slen, u1, 1);
10351            if (pos < 0)
10352                goto nothing;
10353            u2 = PyUnicode_READ(kind2, buf2, 0);
10354            u = PyUnicode_New(slen, maxchar);
10355            if (!u)
10356                goto error;
10357
10358            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10359            replace_1char_inplace(u, pos, u1, u2, maxcount);
10360        }
10361        else {
10362            int rkind = skind;
10363            char *res;
10364            Py_ssize_t i;
10365
10366            if (kind1 < rkind) {
10367                /* widen substring */
10368                buf1 = _PyUnicode_AsKind(str1, rkind);
10369                if (!buf1) goto error;
10370                release1 = 1;
10371            }
10372            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10373            if (i < 0)
10374                goto nothing;
10375            if (rkind > kind2) {
10376                /* widen replacement */
10377                buf2 = _PyUnicode_AsKind(str2, rkind);
10378                if (!buf2) goto error;
10379                release2 = 1;
10380            }
10381            else if (rkind < kind2) {
10382                /* widen self and buf1 */
10383                rkind = kind2;
10384                if (release1) PyMem_Free(buf1);
10385                release1 = 0;
10386                sbuf = _PyUnicode_AsKind(self, rkind);
10387                if (!sbuf) goto error;
10388                srelease = 1;
10389                buf1 = _PyUnicode_AsKind(str1, rkind);
10390                if (!buf1) goto error;
10391                release1 = 1;
10392            }
10393            u = PyUnicode_New(slen, maxchar);
10394            if (!u)
10395                goto error;
10396            assert(PyUnicode_KIND(u) == rkind);
10397            res = PyUnicode_DATA(u);
10398
10399            memcpy(res, sbuf, rkind * slen);
10400            /* change everything in-place, starting with this one */
10401            memcpy(res + rkind * i,
10402                   buf2,
10403                   rkind * len2);
10404            i += len1;
10405
10406            while ( --maxcount > 0) {
10407                i = anylib_find(rkind, self,
10408                                sbuf+rkind*i, slen-i,
10409                                str1, buf1, len1, i);
10410                if (i == -1)
10411                    break;
10412                memcpy(res + rkind * i,
10413                       buf2,
10414                       rkind * len2);
10415                i += len1;
10416            }
10417        }
10418    }
10419    else {
10420        Py_ssize_t n, i, j, ires;
10421        Py_ssize_t new_size;
10422        int rkind = skind;
10423        char *res;
10424
10425        if (kind1 < rkind) {
10426            /* widen substring */
10427            buf1 = _PyUnicode_AsKind(str1, rkind);
10428            if (!buf1) goto error;
10429            release1 = 1;
10430        }
10431        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10432        if (n == 0)
10433            goto nothing;
10434        if (kind2 < rkind) {
10435            /* widen replacement */
10436            buf2 = _PyUnicode_AsKind(str2, rkind);
10437            if (!buf2) goto error;
10438            release2 = 1;
10439        }
10440        else if (kind2 > rkind) {
10441            /* widen self and buf1 */
10442            rkind = kind2;
10443            sbuf = _PyUnicode_AsKind(self, rkind);
10444            if (!sbuf) goto error;
10445            srelease = 1;
10446            if (release1) PyMem_Free(buf1);
10447            release1 = 0;
10448            buf1 = _PyUnicode_AsKind(str1, rkind);
10449            if (!buf1) goto error;
10450            release1 = 1;
10451        }
10452        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10453           PyUnicode_GET_LENGTH(str1))); */
10454        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10455                PyErr_SetString(PyExc_OverflowError,
10456                                "replace string is too long");
10457                goto error;
10458        }
10459        new_size = slen + n * (len2 - len1);
10460        if (new_size == 0) {
10461            _Py_INCREF_UNICODE_EMPTY();
10462            if (!unicode_empty)
10463                goto error;
10464            u = unicode_empty;
10465            goto done;
10466        }
10467        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10468            PyErr_SetString(PyExc_OverflowError,
10469                            "replace string is too long");
10470            goto error;
10471        }
10472        u = PyUnicode_New(new_size, maxchar);
10473        if (!u)
10474            goto error;
10475        assert(PyUnicode_KIND(u) == rkind);
10476        res = PyUnicode_DATA(u);
10477        ires = i = 0;
10478        if (len1 > 0) {
10479            while (n-- > 0) {
10480                /* look for next match */
10481                j = anylib_find(rkind, self,
10482                                sbuf + rkind * i, slen-i,
10483                                str1, buf1, len1, i);
10484                if (j == -1)
10485                    break;
10486                else if (j > i) {
10487                    /* copy unchanged part [i:j] */
10488                    memcpy(res + rkind * ires,
10489                           sbuf + rkind * i,
10490                           rkind * (j-i));
10491                    ires += j - i;
10492                }
10493                /* copy substitution string */
10494                if (len2 > 0) {
10495                    memcpy(res + rkind * ires,
10496                           buf2,
10497                           rkind * len2);
10498                    ires += len2;
10499                }
10500                i = j + len1;
10501            }
10502            if (i < slen)
10503                /* copy tail [i:] */
10504                memcpy(res + rkind * ires,
10505                       sbuf + rkind * i,
10506                       rkind * (slen-i));
10507        }
10508        else {
10509            /* interleave */
10510            while (n > 0) {
10511                memcpy(res + rkind * ires,
10512                       buf2,
10513                       rkind * len2);
10514                ires += len2;
10515                if (--n <= 0)
10516                    break;
10517                memcpy(res + rkind * ires,
10518                       sbuf + rkind * i,
10519                       rkind);
10520                ires++;
10521                i++;
10522            }
10523            memcpy(res + rkind * ires,
10524                   sbuf + rkind * i,
10525                   rkind * (slen-i));
10526        }
10527    }
10528
10529    if (mayshrink) {
10530        unicode_adjust_maxchar(&u);
10531        if (u == NULL)
10532            goto error;
10533    }
10534
10535  done:
10536    if (srelease)
10537        PyMem_FREE(sbuf);
10538    if (release1)
10539        PyMem_FREE(buf1);
10540    if (release2)
10541        PyMem_FREE(buf2);
10542    assert(_PyUnicode_CheckConsistency(u, 1));
10543    return u;
10544
10545  nothing:
10546    /* nothing to replace; return original string (when possible) */
10547    if (srelease)
10548        PyMem_FREE(sbuf);
10549    if (release1)
10550        PyMem_FREE(buf1);
10551    if (release2)
10552        PyMem_FREE(buf2);
10553    return unicode_result_unchanged(self);
10554
10555  error:
10556    if (srelease && sbuf)
10557        PyMem_FREE(sbuf);
10558    if (release1 && buf1)
10559        PyMem_FREE(buf1);
10560    if (release2 && buf2)
10561        PyMem_FREE(buf2);
10562    return NULL;
10563}
10564
10565/* --- Unicode Object Methods --------------------------------------------- */
10566
10567PyDoc_STRVAR(title__doc__,
10568             "S.title() -> str\n\
10569\n\
10570Return a titlecased version of S, i.e. words start with title case\n\
10571characters, all remaining cased characters have lower case.");
10572
10573static PyObject*
10574unicode_title(PyObject *self)
10575{
10576    if (PyUnicode_READY(self) == -1)
10577        return NULL;
10578    return case_operation(self, do_title);
10579}
10580
10581PyDoc_STRVAR(capitalize__doc__,
10582             "S.capitalize() -> str\n\
10583\n\
10584Return a capitalized version of S, i.e. make the first character\n\
10585have upper case and the rest lower case.");
10586
10587static PyObject*
10588unicode_capitalize(PyObject *self)
10589{
10590    if (PyUnicode_READY(self) == -1)
10591        return NULL;
10592    if (PyUnicode_GET_LENGTH(self) == 0)
10593        return unicode_result_unchanged(self);
10594    return case_operation(self, do_capitalize);
10595}
10596
10597PyDoc_STRVAR(casefold__doc__,
10598             "S.casefold() -> str\n\
10599\n\
10600Return a version of S suitable for caseless comparisons.");
10601
10602static PyObject *
10603unicode_casefold(PyObject *self)
10604{
10605    if (PyUnicode_READY(self) == -1)
10606        return NULL;
10607    if (PyUnicode_IS_ASCII(self))
10608        return ascii_upper_or_lower(self, 1);
10609    return case_operation(self, do_casefold);
10610}
10611
10612
10613/* Argument converter.  Coerces to a single unicode character */
10614
10615static int
10616convert_uc(PyObject *obj, void *addr)
10617{
10618    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10619    PyObject *uniobj;
10620
10621    uniobj = PyUnicode_FromObject(obj);
10622    if (uniobj == NULL) {
10623        PyErr_SetString(PyExc_TypeError,
10624                        "The fill character cannot be converted to Unicode");
10625        return 0;
10626    }
10627    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10628        PyErr_SetString(PyExc_TypeError,
10629                        "The fill character must be exactly one character long");
10630        Py_DECREF(uniobj);
10631        return 0;
10632    }
10633    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10634    Py_DECREF(uniobj);
10635    return 1;
10636}
10637
10638PyDoc_STRVAR(center__doc__,
10639             "S.center(width[, fillchar]) -> str\n\
10640\n\
10641Return S centered in a string of length width. Padding is\n\
10642done using the specified fill character (default is a space)");
10643
10644static PyObject *
10645unicode_center(PyObject *self, PyObject *args)
10646{
10647    Py_ssize_t marg, left;
10648    Py_ssize_t width;
10649    Py_UCS4 fillchar = ' ';
10650
10651    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10652        return NULL;
10653
10654    if (PyUnicode_READY(self) == -1)
10655        return NULL;
10656
10657    if (PyUnicode_GET_LENGTH(self) >= width)
10658        return unicode_result_unchanged(self);
10659
10660    marg = width - PyUnicode_GET_LENGTH(self);
10661    left = marg / 2 + (marg & width & 1);
10662
10663    return pad(self, left, marg - left, fillchar);
10664}
10665
10666/* This function assumes that str1 and str2 are readied by the caller. */
10667
10668static int
10669unicode_compare(PyObject *str1, PyObject *str2)
10670{
10671#define COMPARE(TYPE1, TYPE2) \
10672    do { \
10673        TYPE1* p1 = (TYPE1 *)data1; \
10674        TYPE2* p2 = (TYPE2 *)data2; \
10675        TYPE1* end = p1 + len; \
10676        Py_UCS4 c1, c2; \
10677        for (; p1 != end; p1++, p2++) { \
10678            c1 = *p1; \
10679            c2 = *p2; \
10680            if (c1 != c2) \
10681                return (c1 < c2) ? -1 : 1; \
10682        } \
10683    } \
10684    while (0)
10685
10686    int kind1, kind2;
10687    void *data1, *data2;
10688    Py_ssize_t len1, len2, len;
10689
10690    kind1 = PyUnicode_KIND(str1);
10691    kind2 = PyUnicode_KIND(str2);
10692    data1 = PyUnicode_DATA(str1);
10693    data2 = PyUnicode_DATA(str2);
10694    len1 = PyUnicode_GET_LENGTH(str1);
10695    len2 = PyUnicode_GET_LENGTH(str2);
10696    len = Py_MIN(len1, len2);
10697
10698    switch(kind1) {
10699    case PyUnicode_1BYTE_KIND:
10700    {
10701        switch(kind2) {
10702        case PyUnicode_1BYTE_KIND:
10703        {
10704            int cmp = memcmp(data1, data2, len);
10705            /* normalize result of memcmp() into the range [-1; 1] */
10706            if (cmp < 0)
10707                return -1;
10708            if (cmp > 0)
10709                return 1;
10710            break;
10711        }
10712        case PyUnicode_2BYTE_KIND:
10713            COMPARE(Py_UCS1, Py_UCS2);
10714            break;
10715        case PyUnicode_4BYTE_KIND:
10716            COMPARE(Py_UCS1, Py_UCS4);
10717            break;
10718        default:
10719            assert(0);
10720        }
10721        break;
10722    }
10723    case PyUnicode_2BYTE_KIND:
10724    {
10725        switch(kind2) {
10726        case PyUnicode_1BYTE_KIND:
10727            COMPARE(Py_UCS2, Py_UCS1);
10728            break;
10729        case PyUnicode_2BYTE_KIND:
10730        {
10731            COMPARE(Py_UCS2, Py_UCS2);
10732            break;
10733        }
10734        case PyUnicode_4BYTE_KIND:
10735            COMPARE(Py_UCS2, Py_UCS4);
10736            break;
10737        default:
10738            assert(0);
10739        }
10740        break;
10741    }
10742    case PyUnicode_4BYTE_KIND:
10743    {
10744        switch(kind2) {
10745        case PyUnicode_1BYTE_KIND:
10746            COMPARE(Py_UCS4, Py_UCS1);
10747            break;
10748        case PyUnicode_2BYTE_KIND:
10749            COMPARE(Py_UCS4, Py_UCS2);
10750            break;
10751        case PyUnicode_4BYTE_KIND:
10752        {
10753#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10754            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10755            /* normalize result of wmemcmp() into the range [-1; 1] */
10756            if (cmp < 0)
10757                return -1;
10758            if (cmp > 0)
10759                return 1;
10760#else
10761            COMPARE(Py_UCS4, Py_UCS4);
10762#endif
10763            break;
10764        }
10765        default:
10766            assert(0);
10767        }
10768        break;
10769    }
10770    default:
10771        assert(0);
10772    }
10773
10774    if (len1 == len2)
10775        return 0;
10776    if (len1 < len2)
10777        return -1;
10778    else
10779        return 1;
10780
10781#undef COMPARE
10782}
10783
10784Py_LOCAL(int)
10785unicode_compare_eq(PyObject *str1, PyObject *str2)
10786{
10787    int kind;
10788    void *data1, *data2;
10789    Py_ssize_t len;
10790    int cmp;
10791
10792    len = PyUnicode_GET_LENGTH(str1);
10793    if (PyUnicode_GET_LENGTH(str2) != len)
10794        return 0;
10795    kind = PyUnicode_KIND(str1);
10796    if (PyUnicode_KIND(str2) != kind)
10797        return 0;
10798    data1 = PyUnicode_DATA(str1);
10799    data2 = PyUnicode_DATA(str2);
10800
10801    cmp = memcmp(data1, data2, len * kind);
10802    return (cmp == 0);
10803}
10804
10805
10806int
10807PyUnicode_Compare(PyObject *left, PyObject *right)
10808{
10809    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10810        if (PyUnicode_READY(left) == -1 ||
10811            PyUnicode_READY(right) == -1)
10812            return -1;
10813
10814        /* a string is equal to itself */
10815        if (left == right)
10816            return 0;
10817
10818        return unicode_compare(left, right);
10819    }
10820    PyErr_Format(PyExc_TypeError,
10821                 "Can't compare %.100s and %.100s",
10822                 left->ob_type->tp_name,
10823                 right->ob_type->tp_name);
10824    return -1;
10825}
10826
10827int
10828_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10829{
10830    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10831    if (right_str == NULL)
10832        return -1;
10833    return PyUnicode_Compare(left, right_str);
10834}
10835
10836int
10837PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10838{
10839    Py_ssize_t i;
10840    int kind;
10841    Py_UCS4 chr;
10842
10843    assert(_PyUnicode_CHECK(uni));
10844    if (PyUnicode_READY(uni) == -1)
10845        return -1;
10846    kind = PyUnicode_KIND(uni);
10847    if (kind == PyUnicode_1BYTE_KIND) {
10848        const void *data = PyUnicode_1BYTE_DATA(uni);
10849        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10850        size_t len, len2 = strlen(str);
10851        int cmp;
10852
10853        len = Py_MIN(len1, len2);
10854        cmp = memcmp(data, str, len);
10855        if (cmp != 0) {
10856            if (cmp < 0)
10857                return -1;
10858            else
10859                return 1;
10860        }
10861        if (len1 > len2)
10862            return 1; /* uni is longer */
10863        if (len2 > len1)
10864            return -1; /* str is longer */
10865        return 0;
10866    }
10867    else {
10868        void *data = PyUnicode_DATA(uni);
10869        /* Compare Unicode string and source character set string */
10870        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10871            if (chr != (unsigned char)str[i])
10872                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10873        /* This check keeps Python strings that end in '\0' from comparing equal
10874         to C strings identical up to that point. */
10875        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10876            return 1; /* uni is longer */
10877        if (str[i])
10878            return -1; /* str is longer */
10879        return 0;
10880    }
10881}
10882
10883
/* Map a C truth value to Py_True or Py_False.  The macro only selects the
   pointer; it does not touch the reference count. */
#define TEST_COND(cond)                         \
    ((cond) ? Py_True : Py_False)
10886
10887PyObject *
10888PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10889{
10890    int result;
10891    PyObject *v;
10892
10893    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10894        Py_RETURN_NOTIMPLEMENTED;
10895
10896    if (PyUnicode_READY(left) == -1 ||
10897        PyUnicode_READY(right) == -1)
10898        return NULL;
10899
10900    if (left == right) {
10901        switch (op) {
10902        case Py_EQ:
10903        case Py_LE:
10904        case Py_GE:
10905            /* a string is equal to itself */
10906            v = Py_True;
10907            break;
10908        case Py_NE:
10909        case Py_LT:
10910        case Py_GT:
10911            v = Py_False;
10912            break;
10913        default:
10914            PyErr_BadArgument();
10915            return NULL;
10916        }
10917    }
10918    else if (op == Py_EQ || op == Py_NE) {
10919        result = unicode_compare_eq(left, right);
10920        result ^= (op == Py_NE);
10921        v = TEST_COND(result);
10922    }
10923    else {
10924        result = unicode_compare(left, right);
10925
10926        /* Convert the return value to a Boolean */
10927        switch (op) {
10928        case Py_LE:
10929            v = TEST_COND(result <= 0);
10930            break;
10931        case Py_GE:
10932            v = TEST_COND(result >= 0);
10933            break;
10934        case Py_LT:
10935            v = TEST_COND(result == -1);
10936            break;
10937        case Py_GT:
10938            v = TEST_COND(result == 1);
10939            break;
10940        default:
10941            PyErr_BadArgument();
10942            return NULL;
10943        }
10944    }
10945    Py_INCREF(v);
10946    return v;
10947}
10948
10949int
10950PyUnicode_Contains(PyObject *container, PyObject *element)
10951{
10952    PyObject *str, *sub;
10953    int kind1, kind2;
10954    void *buf1, *buf2;
10955    Py_ssize_t len1, len2;
10956    int result;
10957
10958    /* Coerce the two arguments */
10959    sub = PyUnicode_FromObject(element);
10960    if (!sub) {
10961        PyErr_Format(PyExc_TypeError,
10962                     "'in <string>' requires string as left operand, not %s",
10963                     element->ob_type->tp_name);
10964        return -1;
10965    }
10966
10967    str = PyUnicode_FromObject(container);
10968    if (!str) {
10969        Py_DECREF(sub);
10970        return -1;
10971    }
10972
10973    kind1 = PyUnicode_KIND(str);
10974    kind2 = PyUnicode_KIND(sub);
10975    buf1 = PyUnicode_DATA(str);
10976    buf2 = PyUnicode_DATA(sub);
10977    if (kind2 != kind1) {
10978        if (kind2 > kind1) {
10979            Py_DECREF(sub);
10980            Py_DECREF(str);
10981            return 0;
10982        }
10983        buf2 = _PyUnicode_AsKind(sub, kind1);
10984    }
10985    if (!buf2) {
10986        Py_DECREF(sub);
10987        Py_DECREF(str);
10988        return -1;
10989    }
10990    len1 = PyUnicode_GET_LENGTH(str);
10991    len2 = PyUnicode_GET_LENGTH(sub);
10992
10993    switch (kind1) {
10994    case PyUnicode_1BYTE_KIND:
10995        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10996        break;
10997    case PyUnicode_2BYTE_KIND:
10998        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10999        break;
11000    case PyUnicode_4BYTE_KIND:
11001        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
11002        break;
11003    default:
11004        result = -1;
11005        assert(0);
11006    }
11007
11008    Py_DECREF(str);
11009    Py_DECREF(sub);
11010
11011    if (kind2 != kind1)
11012        PyMem_Free(buf2);
11013
11014    return result;
11015}
11016
11017/* Concat to string or Unicode object giving a new Unicode object. */
11018
11019PyObject *
11020PyUnicode_Concat(PyObject *left, PyObject *right)
11021{
11022    PyObject *u = NULL, *v = NULL, *w;
11023    Py_UCS4 maxchar, maxchar2;
11024    Py_ssize_t u_len, v_len, new_len;
11025
11026    /* Coerce the two arguments */
11027    u = PyUnicode_FromObject(left);
11028    if (u == NULL)
11029        goto onError;
11030    v = PyUnicode_FromObject(right);
11031    if (v == NULL)
11032        goto onError;
11033
11034    /* Shortcuts */
11035    if (v == unicode_empty) {
11036        Py_DECREF(v);
11037        return u;
11038    }
11039    if (u == unicode_empty) {
11040        Py_DECREF(u);
11041        return v;
11042    }
11043
11044    u_len = PyUnicode_GET_LENGTH(u);
11045    v_len = PyUnicode_GET_LENGTH(v);
11046    if (u_len > PY_SSIZE_T_MAX - v_len) {
11047        PyErr_SetString(PyExc_OverflowError,
11048                        "strings are too large to concat");
11049        goto onError;
11050    }
11051    new_len = u_len + v_len;
11052
11053    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11054    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11055    maxchar = Py_MAX(maxchar, maxchar2);
11056
11057    /* Concat the two Unicode strings */
11058    w = PyUnicode_New(new_len, maxchar);
11059    if (w == NULL)
11060        goto onError;
11061    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11062    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11063    Py_DECREF(u);
11064    Py_DECREF(v);
11065    assert(_PyUnicode_CheckConsistency(w, 1));
11066    return w;
11067
11068  onError:
11069    Py_XDECREF(u);
11070    Py_XDECREF(v);
11071    return NULL;
11072}
11073
11074void
11075PyUnicode_Append(PyObject **p_left, PyObject *right)
11076{
11077    PyObject *left, *res;
11078    Py_UCS4 maxchar, maxchar2;
11079    Py_ssize_t left_len, right_len, new_len;
11080
11081    if (p_left == NULL) {
11082        if (!PyErr_Occurred())
11083            PyErr_BadInternalCall();
11084        return;
11085    }
11086    left = *p_left;
11087    if (right == NULL || left == NULL
11088        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11089        if (!PyErr_Occurred())
11090            PyErr_BadInternalCall();
11091        goto error;
11092    }
11093
11094    if (PyUnicode_READY(left) == -1)
11095        goto error;
11096    if (PyUnicode_READY(right) == -1)
11097        goto error;
11098
11099    /* Shortcuts */
11100    if (left == unicode_empty) {
11101        Py_DECREF(left);
11102        Py_INCREF(right);
11103        *p_left = right;
11104        return;
11105    }
11106    if (right == unicode_empty)
11107        return;
11108
11109    left_len = PyUnicode_GET_LENGTH(left);
11110    right_len = PyUnicode_GET_LENGTH(right);
11111    if (left_len > PY_SSIZE_T_MAX - right_len) {
11112        PyErr_SetString(PyExc_OverflowError,
11113                        "strings are too large to concat");
11114        goto error;
11115    }
11116    new_len = left_len + right_len;
11117
11118    if (unicode_modifiable(left)
11119        && PyUnicode_CheckExact(right)
11120        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11121        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11122           to change the structure size, but characters are stored just after
11123           the structure, and so it requires to move all characters which is
11124           not so different than duplicating the string. */
11125        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11126    {
11127        /* append inplace */
11128        if (unicode_resize(p_left, new_len) != 0)
11129            goto error;
11130
11131        /* copy 'right' into the newly allocated area of 'left' */
11132        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11133    }
11134    else {
11135        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11136        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11137        maxchar = Py_MAX(maxchar, maxchar2);
11138
11139        /* Concat the two Unicode strings */
11140        res = PyUnicode_New(new_len, maxchar);
11141        if (res == NULL)
11142            goto error;
11143        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11144        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11145        Py_DECREF(left);
11146        *p_left = res;
11147    }
11148    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11149    return;
11150
11151error:
11152    Py_CLEAR(*p_left);
11153}
11154
11155void
11156PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11157{
11158    PyUnicode_Append(pleft, right);
11159    Py_XDECREF(right);
11160}
11161
11162PyDoc_STRVAR(count__doc__,
11163             "S.count(sub[, start[, end]]) -> int\n\
11164\n\
11165Return the number of non-overlapping occurrences of substring sub in\n\
11166string S[start:end].  Optional arguments start and end are\n\
11167interpreted as in slice notation.");
11168
11169static PyObject *
11170unicode_count(PyObject *self, PyObject *args)
11171{
11172    PyObject *substring;
11173    Py_ssize_t start = 0;
11174    Py_ssize_t end = PY_SSIZE_T_MAX;
11175    PyObject *result;
11176    int kind1, kind2, kind;
11177    void *buf1, *buf2;
11178    Py_ssize_t len1, len2, iresult;
11179
11180    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11181                                            &start, &end))
11182        return NULL;
11183
11184    kind1 = PyUnicode_KIND(self);
11185    kind2 = PyUnicode_KIND(substring);
11186    if (kind2 > kind1) {
11187        Py_DECREF(substring);
11188        return PyLong_FromLong(0);
11189    }
11190    kind = kind1;
11191    buf1 = PyUnicode_DATA(self);
11192    buf2 = PyUnicode_DATA(substring);
11193    if (kind2 != kind)
11194        buf2 = _PyUnicode_AsKind(substring, kind);
11195    if (!buf2) {
11196        Py_DECREF(substring);
11197        return NULL;
11198    }
11199    len1 = PyUnicode_GET_LENGTH(self);
11200    len2 = PyUnicode_GET_LENGTH(substring);
11201
11202    ADJUST_INDICES(start, end, len1);
11203    switch (kind) {
11204    case PyUnicode_1BYTE_KIND:
11205        iresult = ucs1lib_count(
11206            ((Py_UCS1*)buf1) + start, end - start,
11207            buf2, len2, PY_SSIZE_T_MAX
11208            );
11209        break;
11210    case PyUnicode_2BYTE_KIND:
11211        iresult = ucs2lib_count(
11212            ((Py_UCS2*)buf1) + start, end - start,
11213            buf2, len2, PY_SSIZE_T_MAX
11214            );
11215        break;
11216    case PyUnicode_4BYTE_KIND:
11217        iresult = ucs4lib_count(
11218            ((Py_UCS4*)buf1) + start, end - start,
11219            buf2, len2, PY_SSIZE_T_MAX
11220            );
11221        break;
11222    default:
11223        assert(0); iresult = 0;
11224    }
11225
11226    result = PyLong_FromSsize_t(iresult);
11227
11228    if (kind2 != kind)
11229        PyMem_Free(buf2);
11230
11231    Py_DECREF(substring);
11232
11233    return result;
11234}
11235
11236PyDoc_STRVAR(encode__doc__,
11237             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11238\n\
11239Encode S using the codec registered for encoding. Default encoding\n\
11240is 'utf-8'. errors may be given to set a different error\n\
11241handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11242a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11243'xmlcharrefreplace' as well as any other name registered with\n\
11244codecs.register_error that can handle UnicodeEncodeErrors.");
11245
11246static PyObject *
11247unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11248{
11249    static char *kwlist[] = {"encoding", "errors", 0};
11250    char *encoding = NULL;
11251    char *errors = NULL;
11252
11253    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11254                                     kwlist, &encoding, &errors))
11255        return NULL;
11256    return PyUnicode_AsEncodedString(self, encoding, errors);
11257}
11258
11259PyDoc_STRVAR(expandtabs__doc__,
11260             "S.expandtabs(tabsize=8) -> str\n\
11261\n\
11262Return a copy of S where all tab characters are expanded using spaces.\n\
11263If tabsize is not given, a tab size of 8 characters is assumed.");
11264
11265static PyObject*
11266unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11267{
11268    Py_ssize_t i, j, line_pos, src_len, incr;
11269    Py_UCS4 ch;
11270    PyObject *u;
11271    void *src_data, *dest_data;
11272    static char *kwlist[] = {"tabsize", 0};
11273    int tabsize = 8;
11274    int kind;
11275    int found;
11276
11277    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11278                                     kwlist, &tabsize))
11279        return NULL;
11280
11281    if (PyUnicode_READY(self) == -1)
11282        return NULL;
11283
11284    /* First pass: determine size of output string */
11285    src_len = PyUnicode_GET_LENGTH(self);
11286    i = j = line_pos = 0;
11287    kind = PyUnicode_KIND(self);
11288    src_data = PyUnicode_DATA(self);
11289    found = 0;
11290    for (; i < src_len; i++) {
11291        ch = PyUnicode_READ(kind, src_data, i);
11292        if (ch == '\t') {
11293            found = 1;
11294            if (tabsize > 0) {
11295                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11296                if (j > PY_SSIZE_T_MAX - incr)
11297                    goto overflow;
11298                line_pos += incr;
11299                j += incr;
11300            }
11301        }
11302        else {
11303            if (j > PY_SSIZE_T_MAX - 1)
11304                goto overflow;
11305            line_pos++;
11306            j++;
11307            if (ch == '\n' || ch == '\r')
11308                line_pos = 0;
11309        }
11310    }
11311    if (!found)
11312        return unicode_result_unchanged(self);
11313
11314    /* Second pass: create output string and fill it */
11315    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11316    if (!u)
11317        return NULL;
11318    dest_data = PyUnicode_DATA(u);
11319
11320    i = j = line_pos = 0;
11321
11322    for (; i < src_len; i++) {
11323        ch = PyUnicode_READ(kind, src_data, i);
11324        if (ch == '\t') {
11325            if (tabsize > 0) {
11326                incr = tabsize - (line_pos % tabsize);
11327                line_pos += incr;
11328                FILL(kind, dest_data, ' ', j, incr);
11329                j += incr;
11330            }
11331        }
11332        else {
11333            line_pos++;
11334            PyUnicode_WRITE(kind, dest_data, j, ch);
11335            j++;
11336            if (ch == '\n' || ch == '\r')
11337                line_pos = 0;
11338        }
11339    }
11340    assert (j == PyUnicode_GET_LENGTH(u));
11341    return unicode_result(u);
11342
11343  overflow:
11344    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11345    return NULL;
11346}
11347
11348PyDoc_STRVAR(find__doc__,
11349             "S.find(sub[, start[, end]]) -> int\n\
11350\n\
11351Return the lowest index in S where substring sub is found,\n\
11352such that sub is contained within S[start:end].  Optional\n\
11353arguments start and end are interpreted as in slice notation.\n\
11354\n\
11355Return -1 on failure.");
11356
11357static PyObject *
11358unicode_find(PyObject *self, PyObject *args)
11359{
11360    PyObject *substring;
11361    Py_ssize_t start;
11362    Py_ssize_t end;
11363    Py_ssize_t result;
11364
11365    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11366                                            &start, &end))
11367        return NULL;
11368
11369    if (PyUnicode_READY(self) == -1) {
11370        Py_DECREF(substring);
11371        return NULL;
11372    }
11373    if (PyUnicode_READY(substring) == -1) {
11374        Py_DECREF(substring);
11375        return NULL;
11376    }
11377
11378    result = any_find_slice(1, self, substring, start, end);
11379
11380    Py_DECREF(substring);
11381
11382    if (result == -2)
11383        return NULL;
11384
11385    return PyLong_FromSsize_t(result);
11386}
11387
11388static PyObject *
11389unicode_getitem(PyObject *self, Py_ssize_t index)
11390{
11391    void *data;
11392    enum PyUnicode_Kind kind;
11393    Py_UCS4 ch;
11394
11395    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11396        PyErr_BadArgument();
11397        return NULL;
11398    }
11399    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11400        PyErr_SetString(PyExc_IndexError, "string index out of range");
11401        return NULL;
11402    }
11403    kind = PyUnicode_KIND(self);
11404    data = PyUnicode_DATA(self);
11405    ch = PyUnicode_READ(kind, data, index);
11406    return unicode_char(ch);
11407}
11408
11409/* Believe it or not, this produces the same value for ASCII strings
11410   as bytes_hash(). */
11411static Py_hash_t
11412unicode_hash(PyObject *self)
11413{
11414    Py_ssize_t len;
11415    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11416
11417#ifdef Py_DEBUG
11418    assert(_Py_HashSecret_Initialized);
11419#endif
11420    if (_PyUnicode_HASH(self) != -1)
11421        return _PyUnicode_HASH(self);
11422    if (PyUnicode_READY(self) == -1)
11423        return -1;
11424    len = PyUnicode_GET_LENGTH(self);
11425    /*
11426      We make the hash of the empty string be 0, rather than using
11427      (prefix ^ suffix), since this slightly obfuscates the hash secret
11428    */
11429    if (len == 0) {
11430        _PyUnicode_HASH(self) = 0;
11431        return 0;
11432    }
11433    x = _Py_HashBytes(PyUnicode_DATA(self),
11434                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11435    _PyUnicode_HASH(self) = x;
11436    return x;
11437}
11438
11439PyDoc_STRVAR(index__doc__,
11440             "S.index(sub[, start[, end]]) -> int\n\
11441\n\
11442Like S.find() but raise ValueError when the substring is not found.");
11443
11444static PyObject *
11445unicode_index(PyObject *self, PyObject *args)
11446{
11447    Py_ssize_t result;
11448    PyObject *substring;
11449    Py_ssize_t start;
11450    Py_ssize_t end;
11451
11452    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11453                                            &start, &end))
11454        return NULL;
11455
11456    if (PyUnicode_READY(self) == -1) {
11457        Py_DECREF(substring);
11458        return NULL;
11459    }
11460    if (PyUnicode_READY(substring) == -1) {
11461        Py_DECREF(substring);
11462        return NULL;
11463    }
11464
11465    result = any_find_slice(1, self, substring, start, end);
11466
11467    Py_DECREF(substring);
11468
11469    if (result == -2)
11470        return NULL;
11471
11472    if (result < 0) {
11473        PyErr_SetString(PyExc_ValueError, "substring not found");
11474        return NULL;
11475    }
11476
11477    return PyLong_FromSsize_t(result);
11478}
11479
11480PyDoc_STRVAR(islower__doc__,
11481             "S.islower() -> bool\n\
11482\n\
11483Return True if all cased characters in S are lowercase and there is\n\
11484at least one cased character in S, False otherwise.");
11485
11486static PyObject*
11487unicode_islower(PyObject *self)
11488{
11489    Py_ssize_t i, length;
11490    int kind;
11491    void *data;
11492    int cased;
11493
11494    if (PyUnicode_READY(self) == -1)
11495        return NULL;
11496    length = PyUnicode_GET_LENGTH(self);
11497    kind = PyUnicode_KIND(self);
11498    data = PyUnicode_DATA(self);
11499
11500    /* Shortcut for single character strings */
11501    if (length == 1)
11502        return PyBool_FromLong(
11503            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11504
11505    /* Special case for empty strings */
11506    if (length == 0)
11507        return PyBool_FromLong(0);
11508
11509    cased = 0;
11510    for (i = 0; i < length; i++) {
11511        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11512
11513        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11514            return PyBool_FromLong(0);
11515        else if (!cased && Py_UNICODE_ISLOWER(ch))
11516            cased = 1;
11517    }
11518    return PyBool_FromLong(cased);
11519}
11520
11521PyDoc_STRVAR(isupper__doc__,
11522             "S.isupper() -> bool\n\
11523\n\
11524Return True if all cased characters in S are uppercase and there is\n\
11525at least one cased character in S, False otherwise.");
11526
11527static PyObject*
11528unicode_isupper(PyObject *self)
11529{
11530    Py_ssize_t i, length;
11531    int kind;
11532    void *data;
11533    int cased;
11534
11535    if (PyUnicode_READY(self) == -1)
11536        return NULL;
11537    length = PyUnicode_GET_LENGTH(self);
11538    kind = PyUnicode_KIND(self);
11539    data = PyUnicode_DATA(self);
11540
11541    /* Shortcut for single character strings */
11542    if (length == 1)
11543        return PyBool_FromLong(
11544            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11545
11546    /* Special case for empty strings */
11547    if (length == 0)
11548        return PyBool_FromLong(0);
11549
11550    cased = 0;
11551    for (i = 0; i < length; i++) {
11552        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11553
11554        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11555            return PyBool_FromLong(0);
11556        else if (!cased && Py_UNICODE_ISUPPER(ch))
11557            cased = 1;
11558    }
11559    return PyBool_FromLong(cased);
11560}
11561
11562PyDoc_STRVAR(istitle__doc__,
11563             "S.istitle() -> bool\n\
11564\n\
11565Return True if S is a titlecased string and there is at least one\n\
11566character in S, i.e. upper- and titlecase characters may only\n\
11567follow uncased characters and lowercase characters only cased ones.\n\
11568Return False otherwise.");
11569
11570static PyObject*
11571unicode_istitle(PyObject *self)
11572{
11573    Py_ssize_t i, length;
11574    int kind;
11575    void *data;
11576    int cased, previous_is_cased;
11577
11578    if (PyUnicode_READY(self) == -1)
11579        return NULL;
11580    length = PyUnicode_GET_LENGTH(self);
11581    kind = PyUnicode_KIND(self);
11582    data = PyUnicode_DATA(self);
11583
11584    /* Shortcut for single character strings */
11585    if (length == 1) {
11586        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11587        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11588                               (Py_UNICODE_ISUPPER(ch) != 0));
11589    }
11590
11591    /* Special case for empty strings */
11592    if (length == 0)
11593        return PyBool_FromLong(0);
11594
11595    cased = 0;
11596    previous_is_cased = 0;
11597    for (i = 0; i < length; i++) {
11598        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11599
11600        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11601            if (previous_is_cased)
11602                return PyBool_FromLong(0);
11603            previous_is_cased = 1;
11604            cased = 1;
11605        }
11606        else if (Py_UNICODE_ISLOWER(ch)) {
11607            if (!previous_is_cased)
11608                return PyBool_FromLong(0);
11609            previous_is_cased = 1;
11610            cased = 1;
11611        }
11612        else
11613            previous_is_cased = 0;
11614    }
11615    return PyBool_FromLong(cased);
11616}
11617
11618PyDoc_STRVAR(isspace__doc__,
11619             "S.isspace() -> bool\n\
11620\n\
11621Return True if all characters in S are whitespace\n\
11622and there is at least one character in S, False otherwise.");
11623
11624static PyObject*
11625unicode_isspace(PyObject *self)
11626{
11627    Py_ssize_t i, length;
11628    int kind;
11629    void *data;
11630
11631    if (PyUnicode_READY(self) == -1)
11632        return NULL;
11633    length = PyUnicode_GET_LENGTH(self);
11634    kind = PyUnicode_KIND(self);
11635    data = PyUnicode_DATA(self);
11636
11637    /* Shortcut for single character strings */
11638    if (length == 1)
11639        return PyBool_FromLong(
11640            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11641
11642    /* Special case for empty strings */
11643    if (length == 0)
11644        return PyBool_FromLong(0);
11645
11646    for (i = 0; i < length; i++) {
11647        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11648        if (!Py_UNICODE_ISSPACE(ch))
11649            return PyBool_FromLong(0);
11650    }
11651    return PyBool_FromLong(1);
11652}
11653
11654PyDoc_STRVAR(isalpha__doc__,
11655             "S.isalpha() -> bool\n\
11656\n\
11657Return True if all characters in S are alphabetic\n\
11658and there is at least one character in S, False otherwise.");
11659
11660static PyObject*
11661unicode_isalpha(PyObject *self)
11662{
11663    Py_ssize_t i, length;
11664    int kind;
11665    void *data;
11666
11667    if (PyUnicode_READY(self) == -1)
11668        return NULL;
11669    length = PyUnicode_GET_LENGTH(self);
11670    kind = PyUnicode_KIND(self);
11671    data = PyUnicode_DATA(self);
11672
11673    /* Shortcut for single character strings */
11674    if (length == 1)
11675        return PyBool_FromLong(
11676            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11677
11678    /* Special case for empty strings */
11679    if (length == 0)
11680        return PyBool_FromLong(0);
11681
11682    for (i = 0; i < length; i++) {
11683        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11684            return PyBool_FromLong(0);
11685    }
11686    return PyBool_FromLong(1);
11687}
11688
11689PyDoc_STRVAR(isalnum__doc__,
11690             "S.isalnum() -> bool\n\
11691\n\
11692Return True if all characters in S are alphanumeric\n\
11693and there is at least one character in S, False otherwise.");
11694
11695static PyObject*
11696unicode_isalnum(PyObject *self)
11697{
11698    int kind;
11699    void *data;
11700    Py_ssize_t len, i;
11701
11702    if (PyUnicode_READY(self) == -1)
11703        return NULL;
11704
11705    kind = PyUnicode_KIND(self);
11706    data = PyUnicode_DATA(self);
11707    len = PyUnicode_GET_LENGTH(self);
11708
11709    /* Shortcut for single character strings */
11710    if (len == 1) {
11711        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11712        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11713    }
11714
11715    /* Special case for empty strings */
11716    if (len == 0)
11717        return PyBool_FromLong(0);
11718
11719    for (i = 0; i < len; i++) {
11720        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11721        if (!Py_UNICODE_ISALNUM(ch))
11722            return PyBool_FromLong(0);
11723    }
11724    return PyBool_FromLong(1);
11725}
11726
11727PyDoc_STRVAR(isdecimal__doc__,
11728             "S.isdecimal() -> bool\n\
11729\n\
11730Return True if there are only decimal characters in S,\n\
11731False otherwise.");
11732
11733static PyObject*
11734unicode_isdecimal(PyObject *self)
11735{
11736    Py_ssize_t i, length;
11737    int kind;
11738    void *data;
11739
11740    if (PyUnicode_READY(self) == -1)
11741        return NULL;
11742    length = PyUnicode_GET_LENGTH(self);
11743    kind = PyUnicode_KIND(self);
11744    data = PyUnicode_DATA(self);
11745
11746    /* Shortcut for single character strings */
11747    if (length == 1)
11748        return PyBool_FromLong(
11749            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11750
11751    /* Special case for empty strings */
11752    if (length == 0)
11753        return PyBool_FromLong(0);
11754
11755    for (i = 0; i < length; i++) {
11756        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11757            return PyBool_FromLong(0);
11758    }
11759    return PyBool_FromLong(1);
11760}
11761
11762PyDoc_STRVAR(isdigit__doc__,
11763             "S.isdigit() -> bool\n\
11764\n\
11765Return True if all characters in S are digits\n\
11766and there is at least one character in S, False otherwise.");
11767
11768static PyObject*
11769unicode_isdigit(PyObject *self)
11770{
11771    Py_ssize_t i, length;
11772    int kind;
11773    void *data;
11774
11775    if (PyUnicode_READY(self) == -1)
11776        return NULL;
11777    length = PyUnicode_GET_LENGTH(self);
11778    kind = PyUnicode_KIND(self);
11779    data = PyUnicode_DATA(self);
11780
11781    /* Shortcut for single character strings */
11782    if (length == 1) {
11783        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11784        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11785    }
11786
11787    /* Special case for empty strings */
11788    if (length == 0)
11789        return PyBool_FromLong(0);
11790
11791    for (i = 0; i < length; i++) {
11792        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11793            return PyBool_FromLong(0);
11794    }
11795    return PyBool_FromLong(1);
11796}
11797
11798PyDoc_STRVAR(isnumeric__doc__,
11799             "S.isnumeric() -> bool\n\
11800\n\
11801Return True if there are only numeric characters in S,\n\
11802False otherwise.");
11803
11804static PyObject*
11805unicode_isnumeric(PyObject *self)
11806{
11807    Py_ssize_t i, length;
11808    int kind;
11809    void *data;
11810
11811    if (PyUnicode_READY(self) == -1)
11812        return NULL;
11813    length = PyUnicode_GET_LENGTH(self);
11814    kind = PyUnicode_KIND(self);
11815    data = PyUnicode_DATA(self);
11816
11817    /* Shortcut for single character strings */
11818    if (length == 1)
11819        return PyBool_FromLong(
11820            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11821
11822    /* Special case for empty strings */
11823    if (length == 0)
11824        return PyBool_FromLong(0);
11825
11826    for (i = 0; i < length; i++) {
11827        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11828            return PyBool_FromLong(0);
11829    }
11830    return PyBool_FromLong(1);
11831}
11832
11833int
11834PyUnicode_IsIdentifier(PyObject *self)
11835{
11836    int kind;
11837    void *data;
11838    Py_ssize_t i;
11839    Py_UCS4 first;
11840
11841    if (PyUnicode_READY(self) == -1) {
11842        Py_FatalError("identifier not ready");
11843        return 0;
11844    }
11845
11846    /* Special case for empty strings */
11847    if (PyUnicode_GET_LENGTH(self) == 0)
11848        return 0;
11849    kind = PyUnicode_KIND(self);
11850    data = PyUnicode_DATA(self);
11851
11852    /* PEP 3131 says that the first character must be in
11853       XID_Start and subsequent characters in XID_Continue,
11854       and for the ASCII range, the 2.x rules apply (i.e
11855       start with letters and underscore, continue with
11856       letters, digits, underscore). However, given the current
11857       definition of XID_Start and XID_Continue, it is sufficient
11858       to check just for these, except that _ must be allowed
11859       as starting an identifier.  */
11860    first = PyUnicode_READ(kind, data, 0);
11861    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11862        return 0;
11863
11864    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11865        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11866            return 0;
11867    return 1;
11868}
11869
11870PyDoc_STRVAR(isidentifier__doc__,
11871             "S.isidentifier() -> bool\n\
11872\n\
11873Return True if S is a valid identifier according\n\
11874to the language definition.\n\
11875\n\
11876Use keyword.iskeyword() to test for reserved identifiers\n\
11877such as \"def\" and \"class\".\n");
11878
11879static PyObject*
11880unicode_isidentifier(PyObject *self)
11881{
11882    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11883}
11884
11885PyDoc_STRVAR(isprintable__doc__,
11886             "S.isprintable() -> bool\n\
11887\n\
11888Return True if all characters in S are considered\n\
11889printable in repr() or S is empty, False otherwise.");
11890
11891static PyObject*
11892unicode_isprintable(PyObject *self)
11893{
11894    Py_ssize_t i, length;
11895    int kind;
11896    void *data;
11897
11898    if (PyUnicode_READY(self) == -1)
11899        return NULL;
11900    length = PyUnicode_GET_LENGTH(self);
11901    kind = PyUnicode_KIND(self);
11902    data = PyUnicode_DATA(self);
11903
11904    /* Shortcut for single character strings */
11905    if (length == 1)
11906        return PyBool_FromLong(
11907            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11908
11909    for (i = 0; i < length; i++) {
11910        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11911            Py_RETURN_FALSE;
11912        }
11913    }
11914    Py_RETURN_TRUE;
11915}
11916
11917PyDoc_STRVAR(join__doc__,
11918             "S.join(iterable) -> str\n\
11919\n\
11920Return a string which is the concatenation of the strings in the\n\
11921iterable.  The separator between elements is S.");
11922
11923static PyObject*
11924unicode_join(PyObject *self, PyObject *data)
11925{
11926    return PyUnicode_Join(self, data);
11927}
11928
11929static Py_ssize_t
11930unicode_length(PyObject *self)
11931{
11932    if (PyUnicode_READY(self) == -1)
11933        return -1;
11934    return PyUnicode_GET_LENGTH(self);
11935}
11936
11937PyDoc_STRVAR(ljust__doc__,
11938             "S.ljust(width[, fillchar]) -> str\n\
11939\n\
11940Return S left-justified in a Unicode string of length width. Padding is\n\
11941done using the specified fill character (default is a space).");
11942
11943static PyObject *
11944unicode_ljust(PyObject *self, PyObject *args)
11945{
11946    Py_ssize_t width;
11947    Py_UCS4 fillchar = ' ';
11948
11949    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11950        return NULL;
11951
11952    if (PyUnicode_READY(self) == -1)
11953        return NULL;
11954
11955    if (PyUnicode_GET_LENGTH(self) >= width)
11956        return unicode_result_unchanged(self);
11957
11958    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11959}
11960
11961PyDoc_STRVAR(lower__doc__,
11962             "S.lower() -> str\n\
11963\n\
11964Return a copy of the string S converted to lowercase.");
11965
11966static PyObject*
11967unicode_lower(PyObject *self)
11968{
11969    if (PyUnicode_READY(self) == -1)
11970        return NULL;
11971    if (PyUnicode_IS_ASCII(self))
11972        return ascii_upper_or_lower(self, 1);
11973    return case_operation(self, do_lower);
11974}
11975
/* Strip modes.  Each value doubles as the index of the matching entry
   in stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by strip mode. */
static const char *stripformat[] = {
    "|O:lstrip",
    "|O:rstrip",
    "|O:strip"
};

/* Method name for strip mode i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11984
11985/* externally visible for str.strip(unicode) */
11986PyObject *
11987_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11988{
11989    void *data;
11990    int kind;
11991    Py_ssize_t i, j, len;
11992    BLOOM_MASK sepmask;
11993    Py_ssize_t seplen;
11994
11995    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11996        return NULL;
11997
11998    kind = PyUnicode_KIND(self);
11999    data = PyUnicode_DATA(self);
12000    len = PyUnicode_GET_LENGTH(self);
12001    seplen = PyUnicode_GET_LENGTH(sepobj);
12002    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
12003                              PyUnicode_DATA(sepobj),
12004                              seplen);
12005
12006    i = 0;
12007    if (striptype != RIGHTSTRIP) {
12008        while (i < len) {
12009            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12010            if (!BLOOM(sepmask, ch))
12011                break;
12012            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12013                break;
12014            i++;
12015        }
12016    }
12017
12018    j = len;
12019    if (striptype != LEFTSTRIP) {
12020        j--;
12021        while (j >= i) {
12022            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12023            if (!BLOOM(sepmask, ch))
12024                break;
12025            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12026                break;
12027            j--;
12028        }
12029
12030        j++;
12031    }
12032
12033    return PyUnicode_Substring(self, i, j);
12034}
12035
12036PyObject*
12037PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12038{
12039    unsigned char *data;
12040    int kind;
12041    Py_ssize_t length;
12042
12043    if (PyUnicode_READY(self) == -1)
12044        return NULL;
12045
12046    length = PyUnicode_GET_LENGTH(self);
12047    end = Py_MIN(end, length);
12048
12049    if (start == 0 && end == length)
12050        return unicode_result_unchanged(self);
12051
12052    if (start < 0 || end < 0) {
12053        PyErr_SetString(PyExc_IndexError, "string index out of range");
12054        return NULL;
12055    }
12056    if (start >= length || end < start)
12057        _Py_RETURN_UNICODE_EMPTY();
12058
12059    length = end - start;
12060    if (PyUnicode_IS_ASCII(self)) {
12061        data = PyUnicode_1BYTE_DATA(self);
12062        return _PyUnicode_FromASCII((char*)(data + start), length);
12063    }
12064    else {
12065        kind = PyUnicode_KIND(self);
12066        data = PyUnicode_1BYTE_DATA(self);
12067        return PyUnicode_FromKindAndData(kind,
12068                                         data + kind * start,
12069                                         length);
12070    }
12071}
12072
12073static PyObject *
12074do_strip(PyObject *self, int striptype)
12075{
12076    Py_ssize_t len, i, j;
12077
12078    if (PyUnicode_READY(self) == -1)
12079        return NULL;
12080
12081    len = PyUnicode_GET_LENGTH(self);
12082
12083    if (PyUnicode_IS_ASCII(self)) {
12084        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12085
12086        i = 0;
12087        if (striptype != RIGHTSTRIP) {
12088            while (i < len) {
12089                Py_UCS1 ch = data[i];
12090                if (!_Py_ascii_whitespace[ch])
12091                    break;
12092                i++;
12093            }
12094        }
12095
12096        j = len;
12097        if (striptype != LEFTSTRIP) {
12098            j--;
12099            while (j >= i) {
12100                Py_UCS1 ch = data[j];
12101                if (!_Py_ascii_whitespace[ch])
12102                    break;
12103                j--;
12104            }
12105            j++;
12106        }
12107    }
12108    else {
12109        int kind = PyUnicode_KIND(self);
12110        void *data = PyUnicode_DATA(self);
12111
12112        i = 0;
12113        if (striptype != RIGHTSTRIP) {
12114            while (i < len) {
12115                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12116                if (!Py_UNICODE_ISSPACE(ch))
12117                    break;
12118                i++;
12119            }
12120        }
12121
12122        j = len;
12123        if (striptype != LEFTSTRIP) {
12124            j--;
12125            while (j >= i) {
12126                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12127                if (!Py_UNICODE_ISSPACE(ch))
12128                    break;
12129                j--;
12130            }
12131            j++;
12132        }
12133    }
12134
12135    return PyUnicode_Substring(self, i, j);
12136}
12137
12138
12139static PyObject *
12140do_argstrip(PyObject *self, int striptype, PyObject *args)
12141{
12142    PyObject *sep = NULL;
12143
12144    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12145        return NULL;
12146
12147    if (sep != NULL && sep != Py_None) {
12148        if (PyUnicode_Check(sep))
12149            return _PyUnicode_XStrip(self, striptype, sep);
12150        else {
12151            PyErr_Format(PyExc_TypeError,
12152                         "%s arg must be None or str",
12153                         STRIPNAME(striptype));
12154            return NULL;
12155        }
12156    }
12157
12158    return do_strip(self, striptype);
12159}
12160
12161
12162PyDoc_STRVAR(strip__doc__,
12163             "S.strip([chars]) -> str\n\
12164\n\
12165Return a copy of the string S with leading and trailing\n\
12166whitespace removed.\n\
12167If chars is given and not None, remove characters in chars instead.");
12168
12169static PyObject *
12170unicode_strip(PyObject *self, PyObject *args)
12171{
12172    if (PyTuple_GET_SIZE(args) == 0)
12173        return do_strip(self, BOTHSTRIP); /* Common case */
12174    else
12175        return do_argstrip(self, BOTHSTRIP, args);
12176}
12177
12178
12179PyDoc_STRVAR(lstrip__doc__,
12180             "S.lstrip([chars]) -> str\n\
12181\n\
12182Return a copy of the string S with leading whitespace removed.\n\
12183If chars is given and not None, remove characters in chars instead.");
12184
12185static PyObject *
12186unicode_lstrip(PyObject *self, PyObject *args)
12187{
12188    if (PyTuple_GET_SIZE(args) == 0)
12189        return do_strip(self, LEFTSTRIP); /* Common case */
12190    else
12191        return do_argstrip(self, LEFTSTRIP, args);
12192}
12193
12194
12195PyDoc_STRVAR(rstrip__doc__,
12196             "S.rstrip([chars]) -> str\n\
12197\n\
12198Return a copy of the string S with trailing whitespace removed.\n\
12199If chars is given and not None, remove characters in chars instead.");
12200
12201static PyObject *
12202unicode_rstrip(PyObject *self, PyObject *args)
12203{
12204    if (PyTuple_GET_SIZE(args) == 0)
12205        return do_strip(self, RIGHTSTRIP); /* Common case */
12206    else
12207        return do_argstrip(self, RIGHTSTRIP, args);
12208}
12209
12210
12211static PyObject*
12212unicode_repeat(PyObject *str, Py_ssize_t len)
12213{
12214    PyObject *u;
12215    Py_ssize_t nchars, n;
12216
12217    if (len < 1)
12218        _Py_RETURN_UNICODE_EMPTY();
12219
12220    /* no repeat, return original string */
12221    if (len == 1)
12222        return unicode_result_unchanged(str);
12223
12224    if (PyUnicode_READY(str) == -1)
12225        return NULL;
12226
12227    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12228        PyErr_SetString(PyExc_OverflowError,
12229                        "repeated string is too long");
12230        return NULL;
12231    }
12232    nchars = len * PyUnicode_GET_LENGTH(str);
12233
12234    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12235    if (!u)
12236        return NULL;
12237    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12238
12239    if (PyUnicode_GET_LENGTH(str) == 1) {
12240        const int kind = PyUnicode_KIND(str);
12241        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12242        if (kind == PyUnicode_1BYTE_KIND) {
12243            void *to = PyUnicode_DATA(u);
12244            memset(to, (unsigned char)fill_char, len);
12245        }
12246        else if (kind == PyUnicode_2BYTE_KIND) {
12247            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12248            for (n = 0; n < len; ++n)
12249                ucs2[n] = fill_char;
12250        } else {
12251            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12252            assert(kind == PyUnicode_4BYTE_KIND);
12253            for (n = 0; n < len; ++n)
12254                ucs4[n] = fill_char;
12255        }
12256    }
12257    else {
12258        /* number of characters copied this far */
12259        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12260        const Py_ssize_t char_size = PyUnicode_KIND(str);
12261        char *to = (char *) PyUnicode_DATA(u);
12262        Py_MEMCPY(to, PyUnicode_DATA(str),
12263                  PyUnicode_GET_LENGTH(str) * char_size);
12264        while (done < nchars) {
12265            n = (done <= nchars-done) ? done : nchars-done;
12266            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12267            done += n;
12268        }
12269    }
12270
12271    assert(_PyUnicode_CheckConsistency(u, 1));
12272    return u;
12273}
12274
12275PyObject *
12276PyUnicode_Replace(PyObject *obj,
12277                  PyObject *subobj,
12278                  PyObject *replobj,
12279                  Py_ssize_t maxcount)
12280{
12281    PyObject *self;
12282    PyObject *str1;
12283    PyObject *str2;
12284    PyObject *result;
12285
12286    self = PyUnicode_FromObject(obj);
12287    if (self == NULL)
12288        return NULL;
12289    str1 = PyUnicode_FromObject(subobj);
12290    if (str1 == NULL) {
12291        Py_DECREF(self);
12292        return NULL;
12293    }
12294    str2 = PyUnicode_FromObject(replobj);
12295    if (str2 == NULL) {
12296        Py_DECREF(self);
12297        Py_DECREF(str1);
12298        return NULL;
12299    }
12300    if (PyUnicode_READY(self) == -1 ||
12301        PyUnicode_READY(str1) == -1 ||
12302        PyUnicode_READY(str2) == -1)
12303        result = NULL;
12304    else
12305        result = replace(self, str1, str2, maxcount);
12306    Py_DECREF(self);
12307    Py_DECREF(str1);
12308    Py_DECREF(str2);
12309    return result;
12310}
12311
12312PyDoc_STRVAR(replace__doc__,
12313             "S.replace(old, new[, count]) -> str\n\
12314\n\
12315Return a copy of S with all occurrences of substring\n\
12316old replaced by new.  If the optional argument count is\n\
12317given, only the first count occurrences are replaced.");
12318
12319static PyObject*
12320unicode_replace(PyObject *self, PyObject *args)
12321{
12322    PyObject *str1;
12323    PyObject *str2;
12324    Py_ssize_t maxcount = -1;
12325    PyObject *result;
12326
12327    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12328        return NULL;
12329    if (PyUnicode_READY(self) == -1)
12330        return NULL;
12331    str1 = PyUnicode_FromObject(str1);
12332    if (str1 == NULL)
12333        return NULL;
12334    str2 = PyUnicode_FromObject(str2);
12335    if (str2 == NULL) {
12336        Py_DECREF(str1);
12337        return NULL;
12338    }
12339    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12340        result = NULL;
12341    else
12342        result = replace(self, str1, str2, maxcount);
12343
12344    Py_DECREF(str1);
12345    Py_DECREF(str2);
12346    return result;
12347}
12348
12349static PyObject *
12350unicode_repr(PyObject *unicode)
12351{
12352    PyObject *repr;
12353    Py_ssize_t isize;
12354    Py_ssize_t osize, squote, dquote, i, o;
12355    Py_UCS4 max, quote;
12356    int ikind, okind, unchanged;
12357    void *idata, *odata;
12358
12359    if (PyUnicode_READY(unicode) == -1)
12360        return NULL;
12361
12362    isize = PyUnicode_GET_LENGTH(unicode);
12363    idata = PyUnicode_DATA(unicode);
12364
12365    /* Compute length of output, quote characters, and
12366       maximum character */
12367    osize = 0;
12368    max = 127;
12369    squote = dquote = 0;
12370    ikind = PyUnicode_KIND(unicode);
12371    for (i = 0; i < isize; i++) {
12372        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12373        Py_ssize_t incr = 1;
12374        switch (ch) {
12375        case '\'': squote++; break;
12376        case '"':  dquote++; break;
12377        case '\\': case '\t': case '\r': case '\n':
12378            incr = 2;
12379            break;
12380        default:
12381            /* Fast-path ASCII */
12382            if (ch < ' ' || ch == 0x7f)
12383                incr = 4; /* \xHH */
12384            else if (ch < 0x7f)
12385                ;
12386            else if (Py_UNICODE_ISPRINTABLE(ch))
12387                max = ch > max ? ch : max;
12388            else if (ch < 0x100)
12389                incr = 4; /* \xHH */
12390            else if (ch < 0x10000)
12391                incr = 6; /* \uHHHH */
12392            else
12393                incr = 10; /* \uHHHHHHHH */
12394        }
12395        if (osize > PY_SSIZE_T_MAX - incr) {
12396            PyErr_SetString(PyExc_OverflowError,
12397                            "string is too long to generate repr");
12398            return NULL;
12399        }
12400        osize += incr;
12401    }
12402
12403    quote = '\'';
12404    unchanged = (osize == isize);
12405    if (squote) {
12406        unchanged = 0;
12407        if (dquote)
12408            /* Both squote and dquote present. Use squote,
12409               and escape them */
12410            osize += squote;
12411        else
12412            quote = '"';
12413    }
12414    osize += 2;   /* quotes */
12415
12416    repr = PyUnicode_New(osize, max);
12417    if (repr == NULL)
12418        return NULL;
12419    okind = PyUnicode_KIND(repr);
12420    odata = PyUnicode_DATA(repr);
12421
12422    PyUnicode_WRITE(okind, odata, 0, quote);
12423    PyUnicode_WRITE(okind, odata, osize-1, quote);
12424    if (unchanged) {
12425        _PyUnicode_FastCopyCharacters(repr, 1,
12426                                      unicode, 0,
12427                                      isize);
12428    }
12429    else {
12430        for (i = 0, o = 1; i < isize; i++) {
12431            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12432
12433            /* Escape quotes and backslashes */
12434            if ((ch == quote) || (ch == '\\')) {
12435                PyUnicode_WRITE(okind, odata, o++, '\\');
12436                PyUnicode_WRITE(okind, odata, o++, ch);
12437                continue;
12438            }
12439
12440            /* Map special whitespace to '\t', \n', '\r' */
12441            if (ch == '\t') {
12442                PyUnicode_WRITE(okind, odata, o++, '\\');
12443                PyUnicode_WRITE(okind, odata, o++, 't');
12444            }
12445            else if (ch == '\n') {
12446                PyUnicode_WRITE(okind, odata, o++, '\\');
12447                PyUnicode_WRITE(okind, odata, o++, 'n');
12448            }
12449            else if (ch == '\r') {
12450                PyUnicode_WRITE(okind, odata, o++, '\\');
12451                PyUnicode_WRITE(okind, odata, o++, 'r');
12452            }
12453
12454            /* Map non-printable US ASCII to '\xhh' */
12455            else if (ch < ' ' || ch == 0x7F) {
12456                PyUnicode_WRITE(okind, odata, o++, '\\');
12457                PyUnicode_WRITE(okind, odata, o++, 'x');
12458                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12459                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12460            }
12461
12462            /* Copy ASCII characters as-is */
12463            else if (ch < 0x7F) {
12464                PyUnicode_WRITE(okind, odata, o++, ch);
12465            }
12466
12467            /* Non-ASCII characters */
12468            else {
12469                /* Map Unicode whitespace and control characters
12470                   (categories Z* and C* except ASCII space)
12471                */
12472                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12473                    PyUnicode_WRITE(okind, odata, o++, '\\');
12474                    /* Map 8-bit characters to '\xhh' */
12475                    if (ch <= 0xff) {
12476                        PyUnicode_WRITE(okind, odata, o++, 'x');
12477                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12478                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12479                    }
12480                    /* Map 16-bit characters to '\uxxxx' */
12481                    else if (ch <= 0xffff) {
12482                        PyUnicode_WRITE(okind, odata, o++, 'u');
12483                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12484                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12485                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12486                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12487                    }
12488                    /* Map 21-bit characters to '\U00xxxxxx' */
12489                    else {
12490                        PyUnicode_WRITE(okind, odata, o++, 'U');
12491                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12492                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12493                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12494                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12495                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12496                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12497                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12498                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12499                    }
12500                }
12501                /* Copy characters as-is */
12502                else {
12503                    PyUnicode_WRITE(okind, odata, o++, ch);
12504                }
12505            }
12506        }
12507    }
12508    /* Closing quote already added at the beginning */
12509    assert(_PyUnicode_CheckConsistency(repr, 1));
12510    return repr;
12511}
12512
12513PyDoc_STRVAR(rfind__doc__,
12514             "S.rfind(sub[, start[, end]]) -> int\n\
12515\n\
12516Return the highest index in S where substring sub is found,\n\
12517such that sub is contained within S[start:end].  Optional\n\
12518arguments start and end are interpreted as in slice notation.\n\
12519\n\
12520Return -1 on failure.");
12521
12522static PyObject *
12523unicode_rfind(PyObject *self, PyObject *args)
12524{
12525    PyObject *substring;
12526    Py_ssize_t start;
12527    Py_ssize_t end;
12528    Py_ssize_t result;
12529
12530    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12531                                            &start, &end))
12532        return NULL;
12533
12534    if (PyUnicode_READY(self) == -1) {
12535        Py_DECREF(substring);
12536        return NULL;
12537    }
12538    if (PyUnicode_READY(substring) == -1) {
12539        Py_DECREF(substring);
12540        return NULL;
12541    }
12542
12543    result = any_find_slice(-1, self, substring, start, end);
12544
12545    Py_DECREF(substring);
12546
12547    if (result == -2)
12548        return NULL;
12549
12550    return PyLong_FromSsize_t(result);
12551}
12552
12553PyDoc_STRVAR(rindex__doc__,
12554             "S.rindex(sub[, start[, end]]) -> int\n\
12555\n\
12556Like S.rfind() but raise ValueError when the substring is not found.");
12557
12558static PyObject *
12559unicode_rindex(PyObject *self, PyObject *args)
12560{
12561    PyObject *substring;
12562    Py_ssize_t start;
12563    Py_ssize_t end;
12564    Py_ssize_t result;
12565
12566    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12567                                            &start, &end))
12568        return NULL;
12569
12570    if (PyUnicode_READY(self) == -1) {
12571        Py_DECREF(substring);
12572        return NULL;
12573    }
12574    if (PyUnicode_READY(substring) == -1) {
12575        Py_DECREF(substring);
12576        return NULL;
12577    }
12578
12579    result = any_find_slice(-1, self, substring, start, end);
12580
12581    Py_DECREF(substring);
12582
12583    if (result == -2)
12584        return NULL;
12585
12586    if (result < 0) {
12587        PyErr_SetString(PyExc_ValueError, "substring not found");
12588        return NULL;
12589    }
12590
12591    return PyLong_FromSsize_t(result);
12592}
12593
12594PyDoc_STRVAR(rjust__doc__,
12595             "S.rjust(width[, fillchar]) -> str\n\
12596\n\
12597Return S right-justified in a string of length width. Padding is\n\
12598done using the specified fill character (default is a space).");
12599
12600static PyObject *
12601unicode_rjust(PyObject *self, PyObject *args)
12602{
12603    Py_ssize_t width;
12604    Py_UCS4 fillchar = ' ';
12605
12606    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12607        return NULL;
12608
12609    if (PyUnicode_READY(self) == -1)
12610        return NULL;
12611
12612    if (PyUnicode_GET_LENGTH(self) >= width)
12613        return unicode_result_unchanged(self);
12614
12615    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12616}
12617
12618PyObject *
12619PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12620{
12621    PyObject *result;
12622
12623    s = PyUnicode_FromObject(s);
12624    if (s == NULL)
12625        return NULL;
12626    if (sep != NULL) {
12627        sep = PyUnicode_FromObject(sep);
12628        if (sep == NULL) {
12629            Py_DECREF(s);
12630            return NULL;
12631        }
12632    }
12633
12634    result = split(s, sep, maxsplit);
12635
12636    Py_DECREF(s);
12637    Py_XDECREF(sep);
12638    return result;
12639}
12640
12641PyDoc_STRVAR(split__doc__,
12642             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12643\n\
12644Return a list of the words in S, using sep as the\n\
12645delimiter string.  If maxsplit is given, at most maxsplit\n\
12646splits are done. If sep is not specified or is None, any\n\
12647whitespace string is a separator and empty strings are\n\
12648removed from the result.");
12649
12650static PyObject*
12651unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12652{
12653    static char *kwlist[] = {"sep", "maxsplit", 0};
12654    PyObject *substring = Py_None;
12655    Py_ssize_t maxcount = -1;
12656
12657    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12658                                     kwlist, &substring, &maxcount))
12659        return NULL;
12660
12661    if (substring == Py_None)
12662        return split(self, NULL, maxcount);
12663    else if (PyUnicode_Check(substring))
12664        return split(self, substring, maxcount);
12665    else
12666        return PyUnicode_Split(self, substring, maxcount);
12667}
12668
12669PyObject *
12670PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12671{
12672    PyObject* str_obj;
12673    PyObject* sep_obj;
12674    PyObject* out;
12675    int kind1, kind2, kind;
12676    void *buf1 = NULL, *buf2 = NULL;
12677    Py_ssize_t len1, len2;
12678
12679    str_obj = PyUnicode_FromObject(str_in);
12680    if (!str_obj)
12681        return NULL;
12682    sep_obj = PyUnicode_FromObject(sep_in);
12683    if (!sep_obj) {
12684        Py_DECREF(str_obj);
12685        return NULL;
12686    }
12687    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12688        Py_DECREF(sep_obj);
12689        Py_DECREF(str_obj);
12690        return NULL;
12691    }
12692
12693    kind1 = PyUnicode_KIND(str_obj);
12694    kind2 = PyUnicode_KIND(sep_obj);
12695    kind = Py_MAX(kind1, kind2);
12696    buf1 = PyUnicode_DATA(str_obj);
12697    if (kind1 != kind)
12698        buf1 = _PyUnicode_AsKind(str_obj, kind);
12699    if (!buf1)
12700        goto onError;
12701    buf2 = PyUnicode_DATA(sep_obj);
12702    if (kind2 != kind)
12703        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12704    if (!buf2)
12705        goto onError;
12706    len1 = PyUnicode_GET_LENGTH(str_obj);
12707    len2 = PyUnicode_GET_LENGTH(sep_obj);
12708
12709    switch (PyUnicode_KIND(str_obj)) {
12710    case PyUnicode_1BYTE_KIND:
12711        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12712            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12713        else
12714            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12715        break;
12716    case PyUnicode_2BYTE_KIND:
12717        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12718        break;
12719    case PyUnicode_4BYTE_KIND:
12720        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12721        break;
12722    default:
12723        assert(0);
12724        out = 0;
12725    }
12726
12727    Py_DECREF(sep_obj);
12728    Py_DECREF(str_obj);
12729    if (kind1 != kind)
12730        PyMem_Free(buf1);
12731    if (kind2 != kind)
12732        PyMem_Free(buf2);
12733
12734    return out;
12735  onError:
12736    Py_DECREF(sep_obj);
12737    Py_DECREF(str_obj);
12738    if (kind1 != kind && buf1)
12739        PyMem_Free(buf1);
12740    if (kind2 != kind && buf2)
12741        PyMem_Free(buf2);
12742    return NULL;
12743}
12744
12745
12746PyObject *
12747PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12748{
12749    PyObject* str_obj;
12750    PyObject* sep_obj;
12751    PyObject* out;
12752    int kind1, kind2, kind;
12753    void *buf1 = NULL, *buf2 = NULL;
12754    Py_ssize_t len1, len2;
12755
12756    str_obj = PyUnicode_FromObject(str_in);
12757    if (!str_obj)
12758        return NULL;
12759    sep_obj = PyUnicode_FromObject(sep_in);
12760    if (!sep_obj) {
12761        Py_DECREF(str_obj);
12762        return NULL;
12763    }
12764
12765    kind1 = PyUnicode_KIND(str_in);
12766    kind2 = PyUnicode_KIND(sep_obj);
12767    kind = Py_MAX(kind1, kind2);
12768    buf1 = PyUnicode_DATA(str_in);
12769    if (kind1 != kind)
12770        buf1 = _PyUnicode_AsKind(str_in, kind);
12771    if (!buf1)
12772        goto onError;
12773    buf2 = PyUnicode_DATA(sep_obj);
12774    if (kind2 != kind)
12775        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12776    if (!buf2)
12777        goto onError;
12778    len1 = PyUnicode_GET_LENGTH(str_obj);
12779    len2 = PyUnicode_GET_LENGTH(sep_obj);
12780
12781    switch (PyUnicode_KIND(str_in)) {
12782    case PyUnicode_1BYTE_KIND:
12783        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12784            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12785        else
12786            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12787        break;
12788    case PyUnicode_2BYTE_KIND:
12789        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12790        break;
12791    case PyUnicode_4BYTE_KIND:
12792        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12793        break;
12794    default:
12795        assert(0);
12796        out = 0;
12797    }
12798
12799    Py_DECREF(sep_obj);
12800    Py_DECREF(str_obj);
12801    if (kind1 != kind)
12802        PyMem_Free(buf1);
12803    if (kind2 != kind)
12804        PyMem_Free(buf2);
12805
12806    return out;
12807  onError:
12808    Py_DECREF(sep_obj);
12809    Py_DECREF(str_obj);
12810    if (kind1 != kind && buf1)
12811        PyMem_Free(buf1);
12812    if (kind2 != kind && buf2)
12813        PyMem_Free(buf2);
12814    return NULL;
12815}
12816
12817PyDoc_STRVAR(partition__doc__,
12818             "S.partition(sep) -> (head, sep, tail)\n\
12819\n\
12820Search for the separator sep in S, and return the part before it,\n\
12821the separator itself, and the part after it.  If the separator is not\n\
12822found, return S and two empty strings.");
12823
12824static PyObject*
12825unicode_partition(PyObject *self, PyObject *separator)
12826{
12827    return PyUnicode_Partition(self, separator);
12828}
12829
12830PyDoc_STRVAR(rpartition__doc__,
12831             "S.rpartition(sep) -> (head, sep, tail)\n\
12832\n\
12833Search for the separator sep in S, starting at the end of S, and return\n\
12834the part before it, the separator itself, and the part after it.  If the\n\
12835separator is not found, return two empty strings and S.");
12836
12837static PyObject*
12838unicode_rpartition(PyObject *self, PyObject *separator)
12839{
12840    return PyUnicode_RPartition(self, separator);
12841}
12842
12843PyObject *
12844PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12845{
12846    PyObject *result;
12847
12848    s = PyUnicode_FromObject(s);
12849    if (s == NULL)
12850        return NULL;
12851    if (sep != NULL) {
12852        sep = PyUnicode_FromObject(sep);
12853        if (sep == NULL) {
12854            Py_DECREF(s);
12855            return NULL;
12856        }
12857    }
12858
12859    result = rsplit(s, sep, maxsplit);
12860
12861    Py_DECREF(s);
12862    Py_XDECREF(sep);
12863    return result;
12864}
12865
12866PyDoc_STRVAR(rsplit__doc__,
12867             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12868\n\
12869Return a list of the words in S, using sep as the\n\
12870delimiter string, starting at the end of the string and\n\
12871working to the front.  If maxsplit is given, at most maxsplit\n\
12872splits are done. If sep is not specified, any whitespace string\n\
12873is a separator.");
12874
12875static PyObject*
12876unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12877{
12878    static char *kwlist[] = {"sep", "maxsplit", 0};
12879    PyObject *substring = Py_None;
12880    Py_ssize_t maxcount = -1;
12881
12882    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12883                                     kwlist, &substring, &maxcount))
12884        return NULL;
12885
12886    if (substring == Py_None)
12887        return rsplit(self, NULL, maxcount);
12888    else if (PyUnicode_Check(substring))
12889        return rsplit(self, substring, maxcount);
12890    else
12891        return PyUnicode_RSplit(self, substring, maxcount);
12892}
12893
12894PyDoc_STRVAR(splitlines__doc__,
12895             "S.splitlines([keepends]) -> list of strings\n\
12896\n\
12897Return a list of the lines in S, breaking at line boundaries.\n\
12898Line breaks are not included in the resulting list unless keepends\n\
12899is given and true.");
12900
12901static PyObject*
12902unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12903{
12904    static char *kwlist[] = {"keepends", 0};
12905    int keepends = 0;
12906
12907    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12908                                     kwlist, &keepends))
12909        return NULL;
12910
12911    return PyUnicode_Splitlines(self, keepends);
12912}
12913
12914static
12915PyObject *unicode_str(PyObject *self)
12916{
12917    return unicode_result_unchanged(self);
12918}
12919
12920PyDoc_STRVAR(swapcase__doc__,
12921             "S.swapcase() -> str\n\
12922\n\
12923Return a copy of S with uppercase characters converted to lowercase\n\
12924and vice versa.");
12925
12926static PyObject*
12927unicode_swapcase(PyObject *self)
12928{
12929    if (PyUnicode_READY(self) == -1)
12930        return NULL;
12931    return case_operation(self, do_swapcase);
12932}
12933
12934/*[clinic input]
12935
12936@staticmethod
12937str.maketrans as unicode_maketrans
12938
12939  x: object
12940
12941  y: unicode=NULL
12942
12943  z: unicode=NULL
12944
12945  /
12946
12947Return a translation table usable for str.translate().
12948
12949If there is only one argument, it must be a dictionary mapping Unicode
12950ordinals (integers) or characters to Unicode ordinals, strings or None.
12951Character keys will be then converted to ordinals.
12952If there are two arguments, they must be strings of equal length, and
12953in the resulting dictionary, each character in x will be mapped to the
12954character at the same position in y. If there is a third argument, it
12955must be a string, whose characters will be mapped to None in the result.
12956[clinic start generated code]*/
12957
12958PyDoc_STRVAR(unicode_maketrans__doc__,
12959"maketrans(x, y=None, z=None, /)\n"
12960"--\n"
12961"\n"
12962"Return a translation table usable for str.translate().\n"
12963"\n"
12964"If there is only one argument, it must be a dictionary mapping Unicode\n"
12965"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12966"Character keys will be then converted to ordinals.\n"
12967"If there are two arguments, they must be strings of equal length, and\n"
12968"in the resulting dictionary, each character in x will be mapped to the\n"
12969"character at the same position in y. If there is a third argument, it\n"
12970"must be a string, whose characters will be mapped to None in the result.");
12971
12972#define UNICODE_MAKETRANS_METHODDEF    \
12973    {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12974
12975static PyObject *
12976unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12977
12978static PyObject *
12979unicode_maketrans(void *null, PyObject *args)
12980{
12981    PyObject *return_value = NULL;
12982    PyObject *x;
12983    PyObject *y = NULL;
12984    PyObject *z = NULL;
12985
12986    if (!PyArg_ParseTuple(args,
12987        "O|UU:maketrans",
12988        &x, &y, &z))
12989        goto exit;
12990    return_value = unicode_maketrans_impl(x, y, z);
12991
12992exit:
12993    return return_value;
12994}
12995
12996static PyObject *
12997unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12998/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
12999{
13000    PyObject *new = NULL, *key, *value;
13001    Py_ssize_t i = 0;
13002    int res;
13003
13004    new = PyDict_New();
13005    if (!new)
13006        return NULL;
13007    if (y != NULL) {
13008        int x_kind, y_kind, z_kind;
13009        void *x_data, *y_data, *z_data;
13010
13011        /* x must be a string too, of equal length */
13012        if (!PyUnicode_Check(x)) {
13013            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
13014                            "be a string if there is a second argument");
13015            goto err;
13016        }
13017        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
13018            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
13019                            "arguments must have equal length");
13020            goto err;
13021        }
13022        /* create entries for translating chars in x to those in y */
13023        x_kind = PyUnicode_KIND(x);
13024        y_kind = PyUnicode_KIND(y);
13025        x_data = PyUnicode_DATA(x);
13026        y_data = PyUnicode_DATA(y);
13027        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13028            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13029            if (!key)
13030                goto err;
13031            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13032            if (!value) {
13033                Py_DECREF(key);
13034                goto err;
13035            }
13036            res = PyDict_SetItem(new, key, value);
13037            Py_DECREF(key);
13038            Py_DECREF(value);
13039            if (res < 0)
13040                goto err;
13041        }
13042        /* create entries for deleting chars in z */
13043        if (z != NULL) {
13044            z_kind = PyUnicode_KIND(z);
13045            z_data = PyUnicode_DATA(z);
13046            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13047                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13048                if (!key)
13049                    goto err;
13050                res = PyDict_SetItem(new, key, Py_None);
13051                Py_DECREF(key);
13052                if (res < 0)
13053                    goto err;
13054            }
13055        }
13056    } else {
13057        int kind;
13058        void *data;
13059
13060        /* x must be a dict */
13061        if (!PyDict_CheckExact(x)) {
13062            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13063                            "to maketrans it must be a dict");
13064            goto err;
13065        }
13066        /* copy entries into the new dict, converting string keys to int keys */
13067        while (PyDict_Next(x, &i, &key, &value)) {
13068            if (PyUnicode_Check(key)) {
13069                /* convert string keys to integer keys */
13070                PyObject *newkey;
13071                if (PyUnicode_GET_LENGTH(key) != 1) {
13072                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13073                                    "table must be of length 1");
13074                    goto err;
13075                }
13076                kind = PyUnicode_KIND(key);
13077                data = PyUnicode_DATA(key);
13078                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13079                if (!newkey)
13080                    goto err;
13081                res = PyDict_SetItem(new, newkey, value);
13082                Py_DECREF(newkey);
13083                if (res < 0)
13084                    goto err;
13085            } else if (PyLong_Check(key)) {
13086                /* just keep integer keys */
13087                if (PyDict_SetItem(new, key, value) < 0)
13088                    goto err;
13089            } else {
13090                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13091                                "be strings or integers");
13092                goto err;
13093            }
13094        }
13095    }
13096    return new;
13097  err:
13098    Py_DECREF(new);
13099    return NULL;
13100}
13101
13102PyDoc_STRVAR(translate__doc__,
13103             "S.translate(table) -> str\n\
13104\n\
13105Return a copy of the string S, where all characters have been mapped\n\
13106through the given translation table, which must be a mapping of\n\
13107Unicode ordinals to Unicode ordinals, strings, or None.\n\
13108Unmapped characters are left untouched. Characters mapped to None\n\
13109are deleted.");
13110
13111static PyObject*
13112unicode_translate(PyObject *self, PyObject *table)
13113{
13114    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13115}
13116
13117PyDoc_STRVAR(upper__doc__,
13118             "S.upper() -> str\n\
13119\n\
13120Return a copy of S converted to uppercase.");
13121
13122static PyObject*
13123unicode_upper(PyObject *self)
13124{
13125    if (PyUnicode_READY(self) == -1)
13126        return NULL;
13127    if (PyUnicode_IS_ASCII(self))
13128        return ascii_upper_or_lower(self, 0);
13129    return case_operation(self, do_upper);
13130}
13131
13132PyDoc_STRVAR(zfill__doc__,
13133             "S.zfill(width) -> str\n\
13134\n\
13135Pad a numeric string S with zeros on the left, to fill a field\n\
13136of the specified width. The string S is never truncated.");
13137
13138static PyObject *
13139unicode_zfill(PyObject *self, PyObject *args)
13140{
13141    Py_ssize_t fill;
13142    PyObject *u;
13143    Py_ssize_t width;
13144    int kind;
13145    void *data;
13146    Py_UCS4 chr;
13147
13148    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13149        return NULL;
13150
13151    if (PyUnicode_READY(self) == -1)
13152        return NULL;
13153
13154    if (PyUnicode_GET_LENGTH(self) >= width)
13155        return unicode_result_unchanged(self);
13156
13157    fill = width - PyUnicode_GET_LENGTH(self);
13158
13159    u = pad(self, fill, 0, '0');
13160
13161    if (u == NULL)
13162        return NULL;
13163
13164    kind = PyUnicode_KIND(u);
13165    data = PyUnicode_DATA(u);
13166    chr = PyUnicode_READ(kind, data, fill);
13167
13168    if (chr == '+' || chr == '-') {
13169        /* move sign to beginning of string */
13170        PyUnicode_WRITE(kind, data, 0, chr);
13171        PyUnicode_WRITE(kind, data, fill, '0');
13172    }
13173
13174    assert(_PyUnicode_CheckConsistency(u, 1));
13175    return u;
13176}
13177
#if 0
/* Compiled out by the enclosing "#if 0".  Forwards self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
13185
13186PyDoc_STRVAR(startswith__doc__,
13187             "S.startswith(prefix[, start[, end]]) -> bool\n\
13188\n\
13189Return True if S starts with the specified prefix, False otherwise.\n\
13190With optional start, test S beginning at that position.\n\
13191With optional end, stop comparing S at that position.\n\
13192prefix can also be a tuple of strings to try.");
13193
13194static PyObject *
13195unicode_startswith(PyObject *self,
13196                   PyObject *args)
13197{
13198    PyObject *subobj;
13199    PyObject *substring;
13200    Py_ssize_t start = 0;
13201    Py_ssize_t end = PY_SSIZE_T_MAX;
13202    int result;
13203
13204    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13205        return NULL;
13206    if (PyTuple_Check(subobj)) {
13207        Py_ssize_t i;
13208        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13209            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13210            if (substring == NULL)
13211                return NULL;
13212            result = tailmatch(self, substring, start, end, -1);
13213            Py_DECREF(substring);
13214            if (result == -1)
13215                return NULL;
13216            if (result) {
13217                Py_RETURN_TRUE;
13218            }
13219        }
13220        /* nothing matched */
13221        Py_RETURN_FALSE;
13222    }
13223    substring = PyUnicode_FromObject(subobj);
13224    if (substring == NULL) {
13225        if (PyErr_ExceptionMatches(PyExc_TypeError))
13226            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13227                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13228        return NULL;
13229    }
13230    result = tailmatch(self, substring, start, end, -1);
13231    Py_DECREF(substring);
13232    if (result == -1)
13233        return NULL;
13234    return PyBool_FromLong(result);
13235}
13236
13237
13238PyDoc_STRVAR(endswith__doc__,
13239             "S.endswith(suffix[, start[, end]]) -> bool\n\
13240\n\
13241Return True if S ends with the specified suffix, False otherwise.\n\
13242With optional start, test S beginning at that position.\n\
13243With optional end, stop comparing S at that position.\n\
13244suffix can also be a tuple of strings to try.");
13245
13246static PyObject *
13247unicode_endswith(PyObject *self,
13248                 PyObject *args)
13249{
13250    PyObject *subobj;
13251    PyObject *substring;
13252    Py_ssize_t start = 0;
13253    Py_ssize_t end = PY_SSIZE_T_MAX;
13254    int result;
13255
13256    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13257        return NULL;
13258    if (PyTuple_Check(subobj)) {
13259        Py_ssize_t i;
13260        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13261            substring = PyUnicode_FromObject(
13262                PyTuple_GET_ITEM(subobj, i));
13263            if (substring == NULL)
13264                return NULL;
13265            result = tailmatch(self, substring, start, end, +1);
13266            Py_DECREF(substring);
13267            if (result == -1)
13268                return NULL;
13269            if (result) {
13270                Py_RETURN_TRUE;
13271            }
13272        }
13273        Py_RETURN_FALSE;
13274    }
13275    substring = PyUnicode_FromObject(subobj);
13276    if (substring == NULL) {
13277        if (PyErr_ExceptionMatches(PyExc_TypeError))
13278            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13279                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13280        return NULL;
13281    }
13282    result = tailmatch(self, substring, start, end, +1);
13283    Py_DECREF(substring);
13284    if (result == -1)
13285        return NULL;
13286    return PyBool_FromLong(result);
13287}
13288
13289Py_LOCAL_INLINE(void)
13290_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13291{
13292    if (!writer->readonly)
13293        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13294    else {
13295        /* Copy-on-write mode: set buffer size to 0 so
13296         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13297         * next write. */
13298        writer->size = 0;
13299    }
13300    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13301    writer->data = PyUnicode_DATA(writer->buffer);
13302    writer->kind = PyUnicode_KIND(writer->buffer);
13303}
13304
13305void
13306_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13307{
13308    memset(writer, 0, sizeof(*writer));
13309#ifdef Py_DEBUG
13310    writer->kind = 5;    /* invalid kind */
13311#endif
13312    writer->min_char = 127;
13313}
13314
13315int
13316_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13317                                 Py_ssize_t length, Py_UCS4 maxchar)
13318{
13319#ifdef MS_WINDOWS
13320   /* On Windows, overallocate by 50% is the best factor */
13321#  define OVERALLOCATE_FACTOR 2
13322#else
13323   /* On Linux, overallocate by 25% is the best factor */
13324#  define OVERALLOCATE_FACTOR 4
13325#endif
13326    Py_ssize_t newlen;
13327    PyObject *newbuffer;
13328
13329    assert(length > 0);
13330
13331    if (length > PY_SSIZE_T_MAX - writer->pos) {
13332        PyErr_NoMemory();
13333        return -1;
13334    }
13335    newlen = writer->pos + length;
13336
13337    maxchar = Py_MAX(maxchar, writer->min_char);
13338
13339    if (writer->buffer == NULL) {
13340        assert(!writer->readonly);
13341        if (writer->overallocate
13342            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13343            /* overallocate to limit the number of realloc() */
13344            newlen += newlen / OVERALLOCATE_FACTOR;
13345        }
13346        if (newlen < writer->min_length)
13347            newlen = writer->min_length;
13348
13349        writer->buffer = PyUnicode_New(newlen, maxchar);
13350        if (writer->buffer == NULL)
13351            return -1;
13352    }
13353    else if (newlen > writer->size) {
13354        if (writer->overallocate
13355            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13356            /* overallocate to limit the number of realloc() */
13357            newlen += newlen / OVERALLOCATE_FACTOR;
13358        }
13359        if (newlen < writer->min_length)
13360            newlen = writer->min_length;
13361
13362        if (maxchar > writer->maxchar || writer->readonly) {
13363            /* resize + widen */
13364            newbuffer = PyUnicode_New(newlen, maxchar);
13365            if (newbuffer == NULL)
13366                return -1;
13367            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13368                                          writer->buffer, 0, writer->pos);
13369            Py_DECREF(writer->buffer);
13370            writer->readonly = 0;
13371        }
13372        else {
13373            newbuffer = resize_compact(writer->buffer, newlen);
13374            if (newbuffer == NULL)
13375                return -1;
13376        }
13377        writer->buffer = newbuffer;
13378    }
13379    else if (maxchar > writer->maxchar) {
13380        assert(!writer->readonly);
13381        newbuffer = PyUnicode_New(writer->size, maxchar);
13382        if (newbuffer == NULL)
13383            return -1;
13384        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13385                                      writer->buffer, 0, writer->pos);
13386        Py_DECREF(writer->buffer);
13387        writer->buffer = newbuffer;
13388    }
13389    _PyUnicodeWriter_Update(writer);
13390    return 0;
13391
13392#undef OVERALLOCATE_FACTOR
13393}
13394
13395Py_LOCAL_INLINE(int)
13396_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13397{
13398    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13399        return -1;
13400    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13401    writer->pos++;
13402    return 0;
13403}
13404
13405int
13406_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13407{
13408    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13409}
13410
13411int
13412_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13413{
13414    Py_UCS4 maxchar;
13415    Py_ssize_t len;
13416
13417    if (PyUnicode_READY(str) == -1)
13418        return -1;
13419    len = PyUnicode_GET_LENGTH(str);
13420    if (len == 0)
13421        return 0;
13422    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13423    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13424        if (writer->buffer == NULL && !writer->overallocate) {
13425            writer->readonly = 1;
13426            Py_INCREF(str);
13427            writer->buffer = str;
13428            _PyUnicodeWriter_Update(writer);
13429            writer->pos += len;
13430            return 0;
13431        }
13432        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13433            return -1;
13434    }
13435    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13436                                  str, 0, len);
13437    writer->pos += len;
13438    return 0;
13439}
13440
13441int
13442_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13443                                Py_ssize_t start, Py_ssize_t end)
13444{
13445    Py_UCS4 maxchar;
13446    Py_ssize_t len;
13447
13448    if (PyUnicode_READY(str) == -1)
13449        return -1;
13450
13451    assert(0 <= start);
13452    assert(end <= PyUnicode_GET_LENGTH(str));
13453    assert(start <= end);
13454
13455    if (end == 0)
13456        return 0;
13457
13458    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13459        return _PyUnicodeWriter_WriteStr(writer, str);
13460
13461    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13462        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13463    else
13464        maxchar = writer->maxchar;
13465    len = end - start;
13466
13467    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13468        return -1;
13469
13470    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13471                                  str, start, len);
13472    writer->pos += len;
13473    return 0;
13474}
13475
13476int
13477_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13478                                  const char *ascii, Py_ssize_t len)
13479{
13480    if (len == -1)
13481        len = strlen(ascii);
13482
13483    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13484
13485    if (writer->buffer == NULL && !writer->overallocate) {
13486        PyObject *str;
13487
13488        str = _PyUnicode_FromASCII(ascii, len);
13489        if (str == NULL)
13490            return -1;
13491
13492        writer->readonly = 1;
13493        writer->buffer = str;
13494        _PyUnicodeWriter_Update(writer);
13495        writer->pos += len;
13496        return 0;
13497    }
13498
13499    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13500        return -1;
13501
13502    switch (writer->kind)
13503    {
13504    case PyUnicode_1BYTE_KIND:
13505    {
13506        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13507        Py_UCS1 *data = writer->data;
13508
13509        Py_MEMCPY(data + writer->pos, str, len);
13510        break;
13511    }
13512    case PyUnicode_2BYTE_KIND:
13513    {
13514        _PyUnicode_CONVERT_BYTES(
13515            Py_UCS1, Py_UCS2,
13516            ascii, ascii + len,
13517            (Py_UCS2 *)writer->data + writer->pos);
13518        break;
13519    }
13520    case PyUnicode_4BYTE_KIND:
13521    {
13522        _PyUnicode_CONVERT_BYTES(
13523            Py_UCS1, Py_UCS4,
13524            ascii, ascii + len,
13525            (Py_UCS4 *)writer->data + writer->pos);
13526        break;
13527    }
13528    default:
13529        assert(0);
13530    }
13531
13532    writer->pos += len;
13533    return 0;
13534}
13535
13536int
13537_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13538                                   const char *str, Py_ssize_t len)
13539{
13540    Py_UCS4 maxchar;
13541
13542    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13543    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13544        return -1;
13545    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13546    writer->pos += len;
13547    return 0;
13548}
13549
13550PyObject *
13551_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13552{
13553    PyObject *str;
13554    if (writer->pos == 0) {
13555        Py_CLEAR(writer->buffer);
13556        _Py_RETURN_UNICODE_EMPTY();
13557    }
13558    if (writer->readonly) {
13559        str = writer->buffer;
13560        writer->buffer = NULL;
13561        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13562        return str;
13563    }
13564    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13565        PyObject *newbuffer;
13566        newbuffer = resize_compact(writer->buffer, writer->pos);
13567        if (newbuffer == NULL) {
13568            Py_CLEAR(writer->buffer);
13569            return NULL;
13570        }
13571        writer->buffer = newbuffer;
13572    }
13573    str = writer->buffer;
13574    writer->buffer = NULL;
13575    assert(_PyUnicode_CheckConsistency(str, 1));
13576    return unicode_result_ready(str);
13577}
13578
13579void
13580_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13581{
13582    Py_CLEAR(writer->buffer);
13583}
13584
13585#include "stringlib/unicode_format.h"
13586
13587PyDoc_STRVAR(format__doc__,
13588             "S.format(*args, **kwargs) -> str\n\
13589\n\
13590Return a formatted version of S, using substitutions from args and kwargs.\n\
13591The substitutions are identified by braces ('{' and '}').");
13592
13593PyDoc_STRVAR(format_map__doc__,
13594             "S.format_map(mapping) -> str\n\
13595\n\
13596Return a formatted version of S, using substitutions from mapping.\n\
13597The substitutions are identified by braces ('{' and '}').");
13598
13599static PyObject *
13600unicode__format__(PyObject* self, PyObject* args)
13601{
13602    PyObject *format_spec;
13603    _PyUnicodeWriter writer;
13604    int ret;
13605
13606    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13607        return NULL;
13608
13609    if (PyUnicode_READY(self) == -1)
13610        return NULL;
13611    _PyUnicodeWriter_Init(&writer);
13612    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13613                                          self, format_spec, 0,
13614                                          PyUnicode_GET_LENGTH(format_spec));
13615    if (ret == -1) {
13616        _PyUnicodeWriter_Dealloc(&writer);
13617        return NULL;
13618    }
13619    return _PyUnicodeWriter_Finish(&writer);
13620}
13621
13622PyDoc_STRVAR(p_format__doc__,
13623             "S.__format__(format_spec) -> str\n\
13624\n\
13625Return a formatted version of S as described by format_spec.");
13626
13627static PyObject *
13628unicode__sizeof__(PyObject *v)
13629{
13630    Py_ssize_t size;
13631
13632    /* If it's a compact object, account for base structure +
13633       character data. */
13634    if (PyUnicode_IS_COMPACT_ASCII(v))
13635        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13636    else if (PyUnicode_IS_COMPACT(v))
13637        size = sizeof(PyCompactUnicodeObject) +
13638            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13639    else {
13640        /* If it is a two-block object, account for base object, and
13641           for character block if present. */
13642        size = sizeof(PyUnicodeObject);
13643        if (_PyUnicode_DATA_ANY(v))
13644            size += (PyUnicode_GET_LENGTH(v) + 1) *
13645                PyUnicode_KIND(v);
13646    }
13647    /* If the wstr pointer is present, account for it unless it is shared
13648       with the data pointer. Check if the data is not shared. */
13649    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13650        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13651    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13652        size += PyUnicode_UTF8_LENGTH(v) + 1;
13653
13654    return PyLong_FromSsize_t(size);
13655}
13656
13657PyDoc_STRVAR(sizeof__doc__,
13658             "S.__sizeof__() -> size of S in memory, in bytes");
13659
13660static PyObject *
13661unicode_getnewargs(PyObject *v)
13662{
13663    PyObject *copy = _PyUnicode_Copy(v);
13664    if (!copy)
13665        return NULL;
13666    return Py_BuildValue("(N)", copy);
13667}
13668
13669static PyMethodDef unicode_methods[] = {
13670    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13671    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13672    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13673    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13674    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13675    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13676    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13677    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13678    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13679    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13680    {"expandtabs", (PyCFunction) unicode_expandtabs,
13681     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13682    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13683    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13684    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13685    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13686    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13687    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13688    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13689    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13690    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13691    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13692    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13693    {"splitlines", (PyCFunction) unicode_splitlines,
13694     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13695    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13696    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13697    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13698    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13699    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13700    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13701    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13702    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13703    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13704    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13705    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13706    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13707    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13708    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13709    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13710    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13711    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13712    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13713    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13714    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13715    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13716    UNICODE_MAKETRANS_METHODDEF
13717    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13718#if 0
13719    /* These methods are just used for debugging the implementation. */
13720    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13721#endif
13722
13723    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13724    {NULL, NULL}
13725};
13726
13727static PyObject *
13728unicode_mod(PyObject *v, PyObject *w)
13729{
13730    if (!PyUnicode_Check(v))
13731        Py_RETURN_NOTIMPLEMENTED;
13732    return PyUnicode_Format(v, w);
13733}
13734
13735static PyNumberMethods unicode_as_number = {
13736    0,              /*nb_add*/
13737    0,              /*nb_subtract*/
13738    0,              /*nb_multiply*/
13739    unicode_mod,            /*nb_remainder*/
13740};
13741
13742static PySequenceMethods unicode_as_sequence = {
13743    (lenfunc) unicode_length,       /* sq_length */
13744    PyUnicode_Concat,           /* sq_concat */
13745    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13746    (ssizeargfunc) unicode_getitem,     /* sq_item */
13747    0,                  /* sq_slice */
13748    0,                  /* sq_ass_item */
13749    0,                  /* sq_ass_slice */
13750    PyUnicode_Contains,         /* sq_contains */
13751};
13752
13753static PyObject*
13754unicode_subscript(PyObject* self, PyObject* item)
13755{
13756    if (PyUnicode_READY(self) == -1)
13757        return NULL;
13758
13759    if (PyIndex_Check(item)) {
13760        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13761        if (i == -1 && PyErr_Occurred())
13762            return NULL;
13763        if (i < 0)
13764            i += PyUnicode_GET_LENGTH(self);
13765        return unicode_getitem(self, i);
13766    } else if (PySlice_Check(item)) {
13767        Py_ssize_t start, stop, step, slicelength, cur, i;
13768        PyObject *result;
13769        void *src_data, *dest_data;
13770        int src_kind, dest_kind;
13771        Py_UCS4 ch, max_char, kind_limit;
13772
13773        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13774                                 &start, &stop, &step, &slicelength) < 0) {
13775            return NULL;
13776        }
13777
13778        if (slicelength <= 0) {
13779            _Py_RETURN_UNICODE_EMPTY();
13780        } else if (start == 0 && step == 1 &&
13781                   slicelength == PyUnicode_GET_LENGTH(self)) {
13782            return unicode_result_unchanged(self);
13783        } else if (step == 1) {
13784            return PyUnicode_Substring(self,
13785                                       start, start + slicelength);
13786        }
13787        /* General case */
13788        src_kind = PyUnicode_KIND(self);
13789        src_data = PyUnicode_DATA(self);
13790        if (!PyUnicode_IS_ASCII(self)) {
13791            kind_limit = kind_maxchar_limit(src_kind);
13792            max_char = 0;
13793            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13794                ch = PyUnicode_READ(src_kind, src_data, cur);
13795                if (ch > max_char) {
13796                    max_char = ch;
13797                    if (max_char >= kind_limit)
13798                        break;
13799                }
13800            }
13801        }
13802        else
13803            max_char = 127;
13804        result = PyUnicode_New(slicelength, max_char);
13805        if (result == NULL)
13806            return NULL;
13807        dest_kind = PyUnicode_KIND(result);
13808        dest_data = PyUnicode_DATA(result);
13809
13810        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13811            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13812            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13813        }
13814        assert(_PyUnicode_CheckConsistency(result, 1));
13815        return result;
13816    } else {
13817        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13818        return NULL;
13819    }
13820}
13821
13822static PyMappingMethods unicode_as_mapping = {
13823    (lenfunc)unicode_length,        /* mp_length */
13824    (binaryfunc)unicode_subscript,  /* mp_subscript */
13825    (objobjargproc)0,           /* mp_ass_subscript */
13826};
13827
13828
13829/* Helpers for PyUnicode_Format() */
13830
13831struct unicode_formatter_t {
13832    PyObject *args;
13833    int args_owned;
13834    Py_ssize_t arglen, argidx;
13835    PyObject *dict;
13836
13837    enum PyUnicode_Kind fmtkind;
13838    Py_ssize_t fmtcnt, fmtpos;
13839    void *fmtdata;
13840    PyObject *fmtstr;
13841
13842    _PyUnicodeWriter writer;
13843};
13844
13845struct unicode_format_arg_t {
13846    Py_UCS4 ch;
13847    int flags;
13848    Py_ssize_t width;
13849    int prec;
13850    int sign;
13851};
13852
13853static PyObject *
13854unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13855{
13856    Py_ssize_t argidx = ctx->argidx;
13857
13858    if (argidx < ctx->arglen) {
13859        ctx->argidx++;
13860        if (ctx->arglen < 0)
13861            return ctx->args;
13862        else
13863            return PyTuple_GetItem(ctx->args, argidx);
13864    }
13865    PyErr_SetString(PyExc_TypeError,
13866                    "not enough arguments for format string");
13867    return NULL;
13868}
13869
13870/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13871
13872/* Format a float into the writer if the writer is not NULL, or into *p_output
13873   otherwise.
13874
13875   Return 0 on success, raise an exception and return -1 on error. */
13876static int
13877formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13878            PyObject **p_output,
13879            _PyUnicodeWriter *writer)
13880{
13881    char *p;
13882    double x;
13883    Py_ssize_t len;
13884    int prec;
13885    int dtoa_flags;
13886
13887    x = PyFloat_AsDouble(v);
13888    if (x == -1.0 && PyErr_Occurred())
13889        return -1;
13890
13891    prec = arg->prec;
13892    if (prec < 0)
13893        prec = 6;
13894
13895    if (arg->flags & F_ALT)
13896        dtoa_flags = Py_DTSF_ALT;
13897    else
13898        dtoa_flags = 0;
13899    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13900    if (p == NULL)
13901        return -1;
13902    len = strlen(p);
13903    if (writer) {
13904        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13905            PyMem_Free(p);
13906            return -1;
13907        }
13908    }
13909    else
13910        *p_output = _PyUnicode_FromASCII(p, len);
13911    PyMem_Free(p);
13912    return 0;
13913}
13914
13915/* formatlong() emulates the format codes d, u, o, x and X, and
13916 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13917 * Python's regular ints.
13918 * Return value:  a new PyUnicodeObject*, or NULL if error.
13919 *     The output string is of the form
13920 *         "-"? ("0x" | "0X")? digit+
13921 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13922 *         set in flags.  The case of hex digits will be correct,
13923 *     There will be at least prec digits, zero-filled on the left if
13924 *         necessary to get that many.
13925 * val          object to be converted
13926 * flags        bitmask of format flags; only F_ALT is looked at
13927 * prec         minimum number of digits; 0-fill on left if needed
13928 * type         a character in [duoxX]; u acts the same as d
13929 *
13930 * CAUTION:  o, x and X conversions on regular ints can never
13931 * produce a '-' sign, but can for Python's unbounded ints.
13932 */
13933static PyObject*
13934formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13935{
13936    PyObject *result = NULL;
13937    char *buf;
13938    Py_ssize_t i;
13939    int sign;           /* 1 if '-', else 0 */
13940    int len;            /* number of characters */
13941    Py_ssize_t llen;
13942    int numdigits;      /* len == numnondigits + numdigits */
13943    int numnondigits = 0;
13944    int prec = arg->prec;
13945    int type = arg->ch;
13946
13947    /* Avoid exceeding SSIZE_T_MAX */
13948    if (prec > INT_MAX-3) {
13949        PyErr_SetString(PyExc_OverflowError,
13950                        "precision too large");
13951        return NULL;
13952    }
13953
13954    assert(PyLong_Check(val));
13955
13956    switch (type) {
13957    default:
13958        assert(!"'type' not in [diuoxX]");
13959    case 'd':
13960    case 'i':
13961    case 'u':
13962        /* int and int subclasses should print numerically when a numeric */
13963        /* format code is used (see issue18780) */
13964        result = PyNumber_ToBase(val, 10);
13965        break;
13966    case 'o':
13967        numnondigits = 2;
13968        result = PyNumber_ToBase(val, 8);
13969        break;
13970    case 'x':
13971    case 'X':
13972        numnondigits = 2;
13973        result = PyNumber_ToBase(val, 16);
13974        break;
13975    }
13976    if (!result)
13977        return NULL;
13978
13979    assert(unicode_modifiable(result));
13980    assert(PyUnicode_IS_READY(result));
13981    assert(PyUnicode_IS_ASCII(result));
13982
13983    /* To modify the string in-place, there can only be one reference. */
13984    if (Py_REFCNT(result) != 1) {
13985        Py_DECREF(result);
13986        PyErr_BadInternalCall();
13987        return NULL;
13988    }
13989    buf = PyUnicode_DATA(result);
13990    llen = PyUnicode_GET_LENGTH(result);
13991    if (llen > INT_MAX) {
13992        Py_DECREF(result);
13993        PyErr_SetString(PyExc_ValueError,
13994                        "string too large in _PyBytes_FormatLong");
13995        return NULL;
13996    }
13997    len = (int)llen;
13998    sign = buf[0] == '-';
13999    numnondigits += sign;
14000    numdigits = len - numnondigits;
14001    assert(numdigits > 0);
14002
14003    /* Get rid of base marker unless F_ALT */
14004    if (((arg->flags & F_ALT) == 0 &&
14005        (type == 'o' || type == 'x' || type == 'X'))) {
14006        assert(buf[sign] == '0');
14007        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
14008               buf[sign+1] == 'o');
14009        numnondigits -= 2;
14010        buf += 2;
14011        len -= 2;
14012        if (sign)
14013            buf[0] = '-';
14014        assert(len == numnondigits + numdigits);
14015        assert(numdigits > 0);
14016    }
14017
14018    /* Fill with leading zeroes to meet minimum width. */
14019    if (prec > numdigits) {
14020        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
14021                                numnondigits + prec);
14022        char *b1;
14023        if (!r1) {
14024            Py_DECREF(result);
14025            return NULL;
14026        }
14027        b1 = PyBytes_AS_STRING(r1);
14028        for (i = 0; i < numnondigits; ++i)
14029            *b1++ = *buf++;
14030        for (i = 0; i < prec - numdigits; i++)
14031            *b1++ = '0';
14032        for (i = 0; i < numdigits; i++)
14033            *b1++ = *buf++;
14034        *b1 = '\0';
14035        Py_DECREF(result);
14036        result = r1;
14037        buf = PyBytes_AS_STRING(result);
14038        len = numnondigits + prec;
14039    }
14040
14041    /* Fix up case for hex conversions. */
14042    if (type == 'X') {
14043        /* Need to convert all lower case letters to upper case.
14044           and need to convert 0x to 0X (and -0x to -0X). */
14045        for (i = 0; i < len; i++)
14046            if (buf[i] >= 'a' && buf[i] <= 'x')
14047                buf[i] -= 'a'-'A';
14048    }
14049    if (!PyUnicode_Check(result)
14050        || buf != PyUnicode_DATA(result)) {
14051        PyObject *unicode;
14052        unicode = _PyUnicode_FromASCII(buf, len);
14053        Py_DECREF(result);
14054        result = unicode;
14055    }
14056    else if (len != PyUnicode_GET_LENGTH(result)) {
14057        if (PyUnicode_Resize(&result, len) < 0)
14058            Py_CLEAR(result);
14059    }
14060    return result;
14061}
14062
14063/* Format an integer or a float as an integer.
14064 * Return 1 if the number has been formatted into the writer,
14065 *        0 if the number has been formatted into *p_output
14066 *       -1 and raise an exception on error */
14067static int
14068mainformatlong(PyObject *v,
14069               struct unicode_format_arg_t *arg,
14070               PyObject **p_output,
14071               _PyUnicodeWriter *writer)
14072{
14073    PyObject *iobj, *res;
14074    char type = (char)arg->ch;
14075
14076    if (!PyNumber_Check(v))
14077        goto wrongtype;
14078
14079    /* make sure number is a type of integer for o, x, and X */
14080    if (!PyLong_Check(v)) {
14081        if (type == 'o' || type == 'x' || type == 'X') {
14082            iobj = PyNumber_Index(v);
14083            if (iobj == NULL) {
14084                if (PyErr_ExceptionMatches(PyExc_TypeError))
14085                    goto wrongtype;
14086                return -1;
14087            }
14088        }
14089        else {
14090            iobj = PyNumber_Long(v);
14091            if (iobj == NULL ) {
14092                if (PyErr_ExceptionMatches(PyExc_TypeError))
14093                    goto wrongtype;
14094                return -1;
14095            }
14096        }
14097        assert(PyLong_Check(iobj));
14098    }
14099    else {
14100        iobj = v;
14101        Py_INCREF(iobj);
14102    }
14103
14104    if (PyLong_CheckExact(v)
14105        && arg->width == -1 && arg->prec == -1
14106        && !(arg->flags & (F_SIGN | F_BLANK))
14107        && type != 'X')
14108    {
14109        /* Fast path */
14110        int alternate = arg->flags & F_ALT;
14111        int base;
14112
14113        switch(type)
14114        {
14115            default:
14116                assert(0 && "'type' not in [diuoxX]");
14117            case 'd':
14118            case 'i':
14119            case 'u':
14120                base = 10;
14121                break;
14122            case 'o':
14123                base = 8;
14124                break;
14125            case 'x':
14126            case 'X':
14127                base = 16;
14128                break;
14129        }
14130
14131        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14132            Py_DECREF(iobj);
14133            return -1;
14134        }
14135        Py_DECREF(iobj);
14136        return 1;
14137    }
14138
14139    res = formatlong(iobj, arg);
14140    Py_DECREF(iobj);
14141    if (res == NULL)
14142        return -1;
14143    *p_output = res;
14144    return 0;
14145
14146wrongtype:
14147    switch(type)
14148    {
14149        case 'o':
14150        case 'x':
14151        case 'X':
14152            PyErr_Format(PyExc_TypeError,
14153                    "%%%c format: an integer is required, "
14154                    "not %.200s",
14155                    type, Py_TYPE(v)->tp_name);
14156            break;
14157        default:
14158            PyErr_Format(PyExc_TypeError,
14159                    "%%%c format: a number is required, "
14160                    "not %.200s",
14161                    type, Py_TYPE(v)->tp_name);
14162            break;
14163    }
14164    return -1;
14165}
14166
14167static Py_UCS4
14168formatchar(PyObject *v)
14169{
14170    /* presume that the buffer is at least 3 characters long */
14171    if (PyUnicode_Check(v)) {
14172        if (PyUnicode_GET_LENGTH(v) == 1) {
14173            return PyUnicode_READ_CHAR(v, 0);
14174        }
14175        goto onError;
14176    }
14177    else {
14178        PyObject *iobj;
14179        long x;
14180        /* make sure number is a type of integer */
14181        if (!PyLong_Check(v)) {
14182            iobj = PyNumber_Index(v);
14183            if (iobj == NULL) {
14184                goto onError;
14185            }
14186            v = iobj;
14187            Py_DECREF(iobj);
14188        }
14189        /* Integer input truncated to a character */
14190        x = PyLong_AsLong(v);
14191        if (x == -1 && PyErr_Occurred())
14192            goto onError;
14193
14194        if (x < 0 || x > MAX_UNICODE) {
14195            PyErr_SetString(PyExc_OverflowError,
14196                            "%c arg not in range(0x110000)");
14197            return (Py_UCS4) -1;
14198        }
14199
14200        return (Py_UCS4) x;
14201    }
14202
14203  onError:
14204    PyErr_SetString(PyExc_TypeError,
14205                    "%c requires int or char");
14206    return (Py_UCS4) -1;
14207}
14208
14209/* Parse options of an argument: flags, width, precision.
14210   Handle also "%(name)" syntax.
14211
14212   Return 0 if the argument has been formatted into arg->str.
14213   Return 1 if the argument has been written into ctx->writer,
14214   Raise an exception and return -1 on error. */
14215static int
14216unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14217                         struct unicode_format_arg_t *arg)
14218{
14219#define FORMAT_READ(ctx) \
14220        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14221
14222    PyObject *v;
14223
14224    if (arg->ch == '(') {
14225        /* Get argument value from a dictionary. Example: "%(name)s". */
14226        Py_ssize_t keystart;
14227        Py_ssize_t keylen;
14228        PyObject *key;
14229        int pcount = 1;
14230
14231        if (ctx->dict == NULL) {
14232            PyErr_SetString(PyExc_TypeError,
14233                            "format requires a mapping");
14234            return -1;
14235        }
14236        ++ctx->fmtpos;
14237        --ctx->fmtcnt;
14238        keystart = ctx->fmtpos;
14239        /* Skip over balanced parentheses */
14240        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14241            arg->ch = FORMAT_READ(ctx);
14242            if (arg->ch == ')')
14243                --pcount;
14244            else if (arg->ch == '(')
14245                ++pcount;
14246            ctx->fmtpos++;
14247        }
14248        keylen = ctx->fmtpos - keystart - 1;
14249        if (ctx->fmtcnt < 0 || pcount > 0) {
14250            PyErr_SetString(PyExc_ValueError,
14251                            "incomplete format key");
14252            return -1;
14253        }
14254        key = PyUnicode_Substring(ctx->fmtstr,
14255                                  keystart, keystart + keylen);
14256        if (key == NULL)
14257            return -1;
14258        if (ctx->args_owned) {
14259            Py_DECREF(ctx->args);
14260            ctx->args_owned = 0;
14261        }
14262        ctx->args = PyObject_GetItem(ctx->dict, key);
14263        Py_DECREF(key);
14264        if (ctx->args == NULL)
14265            return -1;
14266        ctx->args_owned = 1;
14267        ctx->arglen = -1;
14268        ctx->argidx = -2;
14269    }
14270
14271    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14272    while (--ctx->fmtcnt >= 0) {
14273        arg->ch = FORMAT_READ(ctx);
14274        ctx->fmtpos++;
14275        switch (arg->ch) {
14276        case '-': arg->flags |= F_LJUST; continue;
14277        case '+': arg->flags |= F_SIGN; continue;
14278        case ' ': arg->flags |= F_BLANK; continue;
14279        case '#': arg->flags |= F_ALT; continue;
14280        case '0': arg->flags |= F_ZERO; continue;
14281        }
14282        break;
14283    }
14284
14285    /* Parse width. Example: "%10s" => width=10 */
14286    if (arg->ch == '*') {
14287        v = unicode_format_getnextarg(ctx);
14288        if (v == NULL)
14289            return -1;
14290        if (!PyLong_Check(v)) {
14291            PyErr_SetString(PyExc_TypeError,
14292                            "* wants int");
14293            return -1;
14294        }
14295        arg->width = PyLong_AsSsize_t(v);
14296        if (arg->width == -1 && PyErr_Occurred())
14297            return -1;
14298        if (arg->width < 0) {
14299            arg->flags |= F_LJUST;
14300            arg->width = -arg->width;
14301        }
14302        if (--ctx->fmtcnt >= 0) {
14303            arg->ch = FORMAT_READ(ctx);
14304            ctx->fmtpos++;
14305        }
14306    }
14307    else if (arg->ch >= '0' && arg->ch <= '9') {
14308        arg->width = arg->ch - '0';
14309        while (--ctx->fmtcnt >= 0) {
14310            arg->ch = FORMAT_READ(ctx);
14311            ctx->fmtpos++;
14312            if (arg->ch < '0' || arg->ch > '9')
14313                break;
14314            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14315               mixing signed and unsigned comparison. Since arg->ch is between
14316               '0' and '9', casting to int is safe. */
14317            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14318                PyErr_SetString(PyExc_ValueError,
14319                                "width too big");
14320                return -1;
14321            }
14322            arg->width = arg->width*10 + (arg->ch - '0');
14323        }
14324    }
14325
14326    /* Parse precision. Example: "%.3f" => prec=3 */
14327    if (arg->ch == '.') {
14328        arg->prec = 0;
14329        if (--ctx->fmtcnt >= 0) {
14330            arg->ch = FORMAT_READ(ctx);
14331            ctx->fmtpos++;
14332        }
14333        if (arg->ch == '*') {
14334            v = unicode_format_getnextarg(ctx);
14335            if (v == NULL)
14336                return -1;
14337            if (!PyLong_Check(v)) {
14338                PyErr_SetString(PyExc_TypeError,
14339                                "* wants int");
14340                return -1;
14341            }
14342            arg->prec = _PyLong_AsInt(v);
14343            if (arg->prec == -1 && PyErr_Occurred())
14344                return -1;
14345            if (arg->prec < 0)
14346                arg->prec = 0;
14347            if (--ctx->fmtcnt >= 0) {
14348                arg->ch = FORMAT_READ(ctx);
14349                ctx->fmtpos++;
14350            }
14351        }
14352        else if (arg->ch >= '0' && arg->ch <= '9') {
14353            arg->prec = arg->ch - '0';
14354            while (--ctx->fmtcnt >= 0) {
14355                arg->ch = FORMAT_READ(ctx);
14356                ctx->fmtpos++;
14357                if (arg->ch < '0' || arg->ch > '9')
14358                    break;
14359                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14360                    PyErr_SetString(PyExc_ValueError,
14361                                    "precision too big");
14362                    return -1;
14363                }
14364                arg->prec = arg->prec*10 + (arg->ch - '0');
14365            }
14366        }
14367    }
14368
14369    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14370    if (ctx->fmtcnt >= 0) {
14371        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14372            if (--ctx->fmtcnt >= 0) {
14373                arg->ch = FORMAT_READ(ctx);
14374                ctx->fmtpos++;
14375            }
14376        }
14377    }
14378    if (ctx->fmtcnt < 0) {
14379        PyErr_SetString(PyExc_ValueError,
14380                        "incomplete format");
14381        return -1;
14382    }
14383    return 0;
14384
14385#undef FORMAT_READ
14386}
14387
14388/* Format one argument. Supported conversion specifiers:
14389
14390   - "s", "r", "a": any type
14391   - "i", "d", "u": int or float
14392   - "o", "x", "X": int
14393   - "e", "E", "f", "F", "g", "G": float
14394   - "c": int or str (1 character)
14395
14396   When possible, the output is written directly into the Unicode writer
14397   (ctx->writer). A string is created when padding is required.
14398
14399   Return 0 if the argument has been formatted into *p_str,
14400          1 if the argument has been written into ctx->writer,
14401         -1 on error. */
14402static int
14403unicode_format_arg_format(struct unicode_formatter_t *ctx,
14404                          struct unicode_format_arg_t *arg,
14405                          PyObject **p_str)
14406{
14407    PyObject *v;
14408    _PyUnicodeWriter *writer = &ctx->writer;
14409
14410    if (ctx->fmtcnt == 0)
14411        ctx->writer.overallocate = 0;
14412
14413    if (arg->ch == '%') {
14414        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14415            return -1;
14416        return 1;
14417    }
14418
14419    v = unicode_format_getnextarg(ctx);
14420    if (v == NULL)
14421        return -1;
14422
14423
14424    switch (arg->ch) {
14425    case 's':
14426    case 'r':
14427    case 'a':
14428        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14429            /* Fast path */
14430            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14431                return -1;
14432            return 1;
14433        }
14434
14435        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14436            *p_str = v;
14437            Py_INCREF(*p_str);
14438        }
14439        else {
14440            if (arg->ch == 's')
14441                *p_str = PyObject_Str(v);
14442            else if (arg->ch == 'r')
14443                *p_str = PyObject_Repr(v);
14444            else
14445                *p_str = PyObject_ASCII(v);
14446        }
14447        break;
14448
14449    case 'i':
14450    case 'd':
14451    case 'u':
14452    case 'o':
14453    case 'x':
14454    case 'X':
14455    {
14456        int ret = mainformatlong(v, arg, p_str, writer);
14457        if (ret != 0)
14458            return ret;
14459        arg->sign = 1;
14460        break;
14461    }
14462
14463    case 'e':
14464    case 'E':
14465    case 'f':
14466    case 'F':
14467    case 'g':
14468    case 'G':
14469        if (arg->width == -1 && arg->prec == -1
14470            && !(arg->flags & (F_SIGN | F_BLANK)))
14471        {
14472            /* Fast path */
14473            if (formatfloat(v, arg, NULL, writer) == -1)
14474                return -1;
14475            return 1;
14476        }
14477
14478        arg->sign = 1;
14479        if (formatfloat(v, arg, p_str, NULL) == -1)
14480            return -1;
14481        break;
14482
14483    case 'c':
14484    {
14485        Py_UCS4 ch = formatchar(v);
14486        if (ch == (Py_UCS4) -1)
14487            return -1;
14488        if (arg->width == -1 && arg->prec == -1) {
14489            /* Fast path */
14490            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14491                return -1;
14492            return 1;
14493        }
14494        *p_str = PyUnicode_FromOrdinal(ch);
14495        break;
14496    }
14497
14498    default:
14499        PyErr_Format(PyExc_ValueError,
14500                     "unsupported format character '%c' (0x%x) "
14501                     "at index %zd",
14502                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14503                     (int)arg->ch,
14504                     ctx->fmtpos - 1);
14505        return -1;
14506    }
14507    if (*p_str == NULL)
14508        return -1;
14509    assert (PyUnicode_Check(*p_str));
14510    return 0;
14511}
14512
/* Write the already formatted string str into ctx->writer, applying the
   options stored in arg: precision truncation (for "s", "r" and "a"),
   sign character, alternate-form prefix and padding up to arg->width.

   arg->width and arg->sign are modified while the output is assembled.

   Return 0 on success, -1 on error. */
static int
unicode_format_arg_output(struct unicode_formatter_t *ctx,
                          struct unicode_format_arg_t *arg,
                          PyObject *str)
{
    Py_ssize_t len;
    enum PyUnicode_Kind kind;
    void *pbuf;
    Py_ssize_t pindex;
    Py_UCS4 signchar;
    Py_ssize_t buflen;
    Py_UCS4 maxchar;
    Py_ssize_t sublen;
    _PyUnicodeWriter *writer = &ctx->writer;
    Py_UCS4 fill;

    /* Pad with zeros only when the "0" flag is set and arg->sign is set */
    fill = ' ';
    if (arg->sign && arg->flags & F_ZERO)
        fill = '0';

    if (PyUnicode_READY(str) == -1)
        return -1;

    len = PyUnicode_GET_LENGTH(str);
    if ((arg->width == -1 || arg->width <= len)
        && (arg->prec == -1 || arg->prec >= len)
        && !(arg->flags & (F_SIGN | F_BLANK)))
    {
        /* Fast path: no padding, no truncation and no sign flag,
           the string is written unchanged */
        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
            return -1;
        return 0;
    }

    /* Truncate the string for "s", "r" and "a" formats
       if the precision is set */
    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
        if (arg->prec >= 0 && len > arg->prec)
            len = arg->prec;
    }

    /* Adjust sign and width.
       From here on, pindex is the read position in str and len the number
       of characters still to be copied from it. */
    kind = PyUnicode_KIND(str);
    pbuf = PyUnicode_DATA(str);
    pindex = 0;
    signchar = '\0';
    if (arg->sign) {
        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
        if (ch == '-' || ch == '+') {
            /* The string starts with a sign: take it out of the
               characters to copy, it is written separately below */
            signchar = ch;
            len--;
            pindex++;
        }
        else if (arg->flags & F_SIGN)
            signchar = '+';
        else if (arg->flags & F_BLANK)
            signchar = ' ';
        else
            /* No sign character to write */
            arg->sign = 0;
    }
    if (arg->width < len)
        arg->width = len;

    /* Prepare the writer: compute the maximum character that will be
       written (fill character and string content) */
    maxchar = writer->maxchar;
    if (!(arg->flags & F_LJUST)) {
        if (arg->sign) {
            if ((arg->width-1) > len)
                maxchar = Py_MAX(maxchar, fill);
        }
        else {
            if (arg->width > len)
                maxchar = Py_MAX(maxchar, fill);
        }
    }
    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
        maxchar = Py_MAX(maxchar, strmaxchar);
    }

    /* One extra slot is needed for the sign character when the characters
       to copy already fill the whole width */
    buflen = arg->width;
    if (arg->sign && len == arg->width)
        buflen++;
    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
        return -1;

    /* Write the sign if needed: with a non-space fill the sign comes
       before the padding; with a space fill it is written after the
       padding, further below */
    if (arg->sign) {
        if (fill != ' ') {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        /* The sign takes one column of the remaining width */
        if (arg->width > len)
            arg->width--;
    }

    /* Write the numeric prefix for "x", "X" and "o" formats
       if the alternate form is used.
       For example, write "0x" for the "%#x" format. */
    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
        if (fill != ' ') {
            /* Non-space fill: the prefix goes before the padding */
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
        /* The two prefix characters are accounted for separately from
           the characters copied from str */
        arg->width -= 2;
        if (arg->width < 0)
            arg->width = 0;
        len -= 2;
    }

    /* Pad left with the fill character if needed */
    if (arg->width > len && !(arg->flags & F_LJUST)) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
        writer->pos += sublen;
        /* Nothing is left to pad on the right */
        arg->width = len;
    }

    /* If padding with spaces: write sign if needed and/or numeric prefix if
       the alternate form is used */
    if (fill == ' ') {
        if (arg->sign) {
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
            writer->pos += 1;
        }
        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
            writer->pos += 2;
            pindex += 2;
        }
    }

    /* Write characters */
    if (len) {
        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
                                      str, pindex, len);
        writer->pos += len;
    }

    /* Pad right if needed; the right padding is always made of spaces,
       whatever the fill character is */
    if (arg->width > len) {
        sublen = arg->width - len;
        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
        writer->pos += sublen;
    }
    return 0;
}
14667
14668/* Helper of PyUnicode_Format(): format one arg.
14669   Return 0 on success, raise an exception and return -1 on error. */
14670static int
14671unicode_format_arg(struct unicode_formatter_t *ctx)
14672{
14673    struct unicode_format_arg_t arg;
14674    PyObject *str;
14675    int ret;
14676
14677    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14678    arg.flags = 0;
14679    arg.width = -1;
14680    arg.prec = -1;
14681    arg.sign = 0;
14682    str = NULL;
14683
14684    ret = unicode_format_arg_parse(ctx, &arg);
14685    if (ret == -1)
14686        return -1;
14687
14688    ret = unicode_format_arg_format(ctx, &arg, &str);
14689    if (ret == -1)
14690        return -1;
14691
14692    if (ret != 1) {
14693        ret = unicode_format_arg_output(ctx, &arg, str);
14694        Py_DECREF(str);
14695        if (ret == -1)
14696            return -1;
14697    }
14698
14699    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14700        PyErr_SetString(PyExc_TypeError,
14701                        "not all arguments converted during string formatting");
14702        return -1;
14703    }
14704    return 0;
14705}
14706
14707PyObject *
14708PyUnicode_Format(PyObject *format, PyObject *args)
14709{
14710    struct unicode_formatter_t ctx;
14711
14712    if (format == NULL || args == NULL) {
14713        PyErr_BadInternalCall();
14714        return NULL;
14715    }
14716
14717    ctx.fmtstr = PyUnicode_FromObject(format);
14718    if (ctx.fmtstr == NULL)
14719        return NULL;
14720    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14721        Py_DECREF(ctx.fmtstr);
14722        return NULL;
14723    }
14724    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14725    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14726    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14727    ctx.fmtpos = 0;
14728
14729    _PyUnicodeWriter_Init(&ctx.writer);
14730    ctx.writer.min_length = ctx.fmtcnt + 100;
14731    ctx.writer.overallocate = 1;
14732
14733    if (PyTuple_Check(args)) {
14734        ctx.arglen = PyTuple_Size(args);
14735        ctx.argidx = 0;
14736    }
14737    else {
14738        ctx.arglen = -1;
14739        ctx.argidx = -2;
14740    }
14741    ctx.args_owned = 0;
14742    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14743        ctx.dict = args;
14744    else
14745        ctx.dict = NULL;
14746    ctx.args = args;
14747
14748    while (--ctx.fmtcnt >= 0) {
14749        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14750            Py_ssize_t nonfmtpos;
14751
14752            nonfmtpos = ctx.fmtpos++;
14753            while (ctx.fmtcnt >= 0 &&
14754                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14755                ctx.fmtpos++;
14756                ctx.fmtcnt--;
14757            }
14758            if (ctx.fmtcnt < 0) {
14759                ctx.fmtpos--;
14760                ctx.writer.overallocate = 0;
14761            }
14762
14763            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14764                                                nonfmtpos, ctx.fmtpos) < 0)
14765                goto onError;
14766        }
14767        else {
14768            ctx.fmtpos++;
14769            if (unicode_format_arg(&ctx) == -1)
14770                goto onError;
14771        }
14772    }
14773
14774    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14775        PyErr_SetString(PyExc_TypeError,
14776                        "not all arguments converted during string formatting");
14777        goto onError;
14778    }
14779
14780    if (ctx.args_owned) {
14781        Py_DECREF(ctx.args);
14782    }
14783    Py_DECREF(ctx.fmtstr);
14784    return _PyUnicodeWriter_Finish(&ctx.writer);
14785
14786  onError:
14787    Py_DECREF(ctx.fmtstr);
14788    _PyUnicodeWriter_Dealloc(&ctx.writer);
14789    if (ctx.args_owned) {
14790        Py_DECREF(ctx.args);
14791    }
14792    return NULL;
14793}
14794
14795static PyObject *
14796unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14797
14798static PyObject *
14799unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14800{
14801    PyObject *x = NULL;
14802    static char *kwlist[] = {"object", "encoding", "errors", 0};
14803    char *encoding = NULL;
14804    char *errors = NULL;
14805
14806    if (type != &PyUnicode_Type)
14807        return unicode_subtype_new(type, args, kwds);
14808    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14809                                     kwlist, &x, &encoding, &errors))
14810        return NULL;
14811    if (x == NULL)
14812        _Py_RETURN_UNICODE_EMPTY();
14813    if (encoding == NULL && errors == NULL)
14814        return PyObject_Str(x);
14815    else
14816        return PyUnicode_FromEncodedObject(x, encoding, errors);
14817}
14818
14819static PyObject *
14820unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14821{
14822    PyObject *unicode, *self;
14823    Py_ssize_t length, char_size;
14824    int share_wstr, share_utf8;
14825    unsigned int kind;
14826    void *data;
14827
14828    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14829
14830    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14831    if (unicode == NULL)
14832        return NULL;
14833    assert(_PyUnicode_CHECK(unicode));
14834    if (PyUnicode_READY(unicode) == -1) {
14835        Py_DECREF(unicode);
14836        return NULL;
14837    }
14838
14839    self = type->tp_alloc(type, 0);
14840    if (self == NULL) {
14841        Py_DECREF(unicode);
14842        return NULL;
14843    }
14844    kind = PyUnicode_KIND(unicode);
14845    length = PyUnicode_GET_LENGTH(unicode);
14846
14847    _PyUnicode_LENGTH(self) = length;
14848#ifdef Py_DEBUG
14849    _PyUnicode_HASH(self) = -1;
14850#else
14851    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14852#endif
14853    _PyUnicode_STATE(self).interned = 0;
14854    _PyUnicode_STATE(self).kind = kind;
14855    _PyUnicode_STATE(self).compact = 0;
14856    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14857    _PyUnicode_STATE(self).ready = 1;
14858    _PyUnicode_WSTR(self) = NULL;
14859    _PyUnicode_UTF8_LENGTH(self) = 0;
14860    _PyUnicode_UTF8(self) = NULL;
14861    _PyUnicode_WSTR_LENGTH(self) = 0;
14862    _PyUnicode_DATA_ANY(self) = NULL;
14863
14864    share_utf8 = 0;
14865    share_wstr = 0;
14866    if (kind == PyUnicode_1BYTE_KIND) {
14867        char_size = 1;
14868        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14869            share_utf8 = 1;
14870    }
14871    else if (kind == PyUnicode_2BYTE_KIND) {
14872        char_size = 2;
14873        if (sizeof(wchar_t) == 2)
14874            share_wstr = 1;
14875    }
14876    else {
14877        assert(kind == PyUnicode_4BYTE_KIND);
14878        char_size = 4;
14879        if (sizeof(wchar_t) == 4)
14880            share_wstr = 1;
14881    }
14882
14883    /* Ensure we won't overflow the length. */
14884    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14885        PyErr_NoMemory();
14886        goto onError;
14887    }
14888    data = PyObject_MALLOC((length + 1) * char_size);
14889    if (data == NULL) {
14890        PyErr_NoMemory();
14891        goto onError;
14892    }
14893
14894    _PyUnicode_DATA_ANY(self) = data;
14895    if (share_utf8) {
14896        _PyUnicode_UTF8_LENGTH(self) = length;
14897        _PyUnicode_UTF8(self) = data;
14898    }
14899    if (share_wstr) {
14900        _PyUnicode_WSTR_LENGTH(self) = length;
14901        _PyUnicode_WSTR(self) = (wchar_t *)data;
14902    }
14903
14904    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14905              kind * (length + 1));
14906    assert(_PyUnicode_CheckConsistency(self, 1));
14907#ifdef Py_DEBUG
14908    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14909#endif
14910    Py_DECREF(unicode);
14911    return self;
14912
14913onError:
14914    Py_DECREF(unicode);
14915    Py_DECREF(self);
14916    return NULL;
14917}
14918
/* Docstring of the str type, installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
"str(object='') -> str\n\
str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given object. If encoding or\n\
errors is specified, then the object must expose a data buffer\n\
that will be decoded using the given encoding and error handler.\n\
Otherwise, returns the result of object.__str__() (if defined)\n\
or repr(object).\n\
encoding defaults to sys.getdefaultencoding().\n\
errors defaults to 'strict'.");
14930
14931static PyObject *unicode_iter(PyObject *seq);
14932
/* Type object of the built-in "str" type.  Slots set to 0 are left
   unset here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
14976
14977/* Initialize the Unicode implementation */
14978
14979int _PyUnicode_Init(void)
14980{
14981    /* XXX - move this array to unicodectype.c ? */
14982    Py_UCS2 linebreak[] = {
14983        0x000A, /* LINE FEED */
14984        0x000D, /* CARRIAGE RETURN */
14985        0x001C, /* FILE SEPARATOR */
14986        0x001D, /* GROUP SEPARATOR */
14987        0x001E, /* RECORD SEPARATOR */
14988        0x0085, /* NEXT LINE */
14989        0x2028, /* LINE SEPARATOR */
14990        0x2029, /* PARAGRAPH SEPARATOR */
14991    };
14992
14993    /* Init the implementation */
14994    _Py_INCREF_UNICODE_EMPTY();
14995    if (!unicode_empty)
14996        Py_FatalError("Can't create empty string");
14997    Py_DECREF(unicode_empty);
14998
14999    if (PyType_Ready(&PyUnicode_Type) < 0)
15000        Py_FatalError("Can't initialize 'unicode'");
15001
15002    /* initialize the linebreak bloom filter */
15003    bloom_linebreak = make_bloom_mask(
15004        PyUnicode_2BYTE_KIND, linebreak,
15005        Py_ARRAY_LENGTH(linebreak));
15006
15007    if (PyType_Ready(&EncodingMapType) < 0)
15008         Py_FatalError("Can't initialize encoding map type");
15009
15010    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
15011        Py_FatalError("Can't initialize field name iterator type");
15012
15013    if (PyType_Ready(&PyFormatterIter_Type) < 0)
15014        Py_FatalError("Can't initialize formatter iter type");
15015
15016#ifdef HAVE_MBCS
15017    winver.dwOSVersionInfoSize = sizeof(winver);
15018    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
15019        PyErr_SetFromWindowsErr(0);
15020        return -1;
15021    }
15022#endif
15023    return 0;
15024}
15025
/* Free list hook of the Unicode implementation.
   This implementation releases nothing and always returns 0. */

int
PyUnicode_ClearFreeList(void)
{
    const int freed_count = 0;

    return freed_count;
}
15033
15034void
15035_PyUnicode_Fini(void)
15036{
15037    int i;
15038
15039    Py_CLEAR(unicode_empty);
15040
15041    for (i = 0; i < 256; i++)
15042        Py_CLEAR(unicode_latin1[i]);
15043    _PyUnicode_ClearStaticStrings();
15044    (void)PyUnicode_ClearFreeList();
15045}
15046
/* Intern the string *p in place.

   If an equal string is already stored in the `interned` dict, *p is
   replaced by that string: a new reference to it is stored in *p and the
   reference to the original string is released.  Otherwise *p itself is
   added to the dict and marked SSTATE_INTERNED_MORTAL.

   Nothing is done, and no exception is left set, when *p is an instance
   of a str subclass, when it is already interned, or when the dict cannot
   be created or updated.  In a non-debug build, a NULL or non-str *p is
   ignored as well; a debug build asserts instead. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    PyObject *s = *p;
    PyObject *t;
#ifdef Py_DEBUG
    assert(s != NULL);
    assert(_PyUnicode_CHECK(s));
#else
    if (s == NULL || !PyUnicode_Check(s))
        return;
#endif
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
    t = PyDict_GetItem(interned, s);
    Py_END_ALLOW_RECURSION

    if (t) {
        /* An equal string is already interned: hand it out instead */
        Py_INCREF(t);
        Py_DECREF(*p);
        *p = t;
        return;
    }

    /* NOTE(review): recursion_critical is presumably set for the same
       stack overflow reason as the lookup above -- confirm against the
       recursion checking code. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, s, s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
}
15098
15099void
15100PyUnicode_InternImmortal(PyObject **p)
15101{
15102    PyUnicode_InternInPlace(p);
15103    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15104        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15105        Py_INCREF(*p);
15106    }
15107}
15108
15109PyObject *
15110PyUnicode_InternFromString(const char *cp)
15111{
15112    PyObject *s = PyUnicode_FromString(cp);
15113    if (s == NULL)
15114        return NULL;
15115    PyUnicode_InternInPlace(&s);
15116    return s;
15117}
15118
/* Un-intern every string stored in the `interned` dict and release the dict.

   For each key, the references that interning removed from its refcount
   are given back and its state is reset to SSTATE_NOT_INTERNED.  Counts and
   total lengths are reported on stderr.  Does nothing when `interned` is
   NULL or not a dict, or when the list of keys cannot be obtained (the
   exception is cleared in that case). */
void
_Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = PyList_GET_ITEM(keys, i);
        if (PyUnicode_READY(s) == -1) {
            /* Aborts in builds with assertions enabled; otherwise only a
               diagnostic is printed and processing continues. */
            assert(0 && "could not ready string");
            fprintf(stderr, "could not ready string\n");
        }
        switch (PyUnicode_CHECK_INTERNED(s)) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* Interning subtracted 2 and making the string immortal added
               1 back, so a single reference is still missing. */
            Py_REFCNT(s) += 1;
            immortal_size += PyUnicode_GET_LENGTH(s);
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Restore the two dict references that interning subtracted. */
            Py_REFCNT(s) += 2;
            mortal_size += PyUnicode_GET_LENGTH(s);
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_CLEAR(interned);
}
15173
15174
15175/********************* Unicode Iterator **************************/
15176
/* State of a str iterator: the string being iterated over and the index
   of the next character to hand out. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index; /* Index of the next character to return */
    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
15182
15183static void
15184unicodeiter_dealloc(unicodeiterobject *it)
15185{
15186    _PyObject_GC_UNTRACK(it);
15187    Py_XDECREF(it->it_seq);
15188    PyObject_GC_Del(it);
15189}
15190
15191static int
15192unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15193{
15194    Py_VISIT(it->it_seq);
15195    return 0;
15196}
15197
15198static PyObject *
15199unicodeiter_next(unicodeiterobject *it)
15200{
15201    PyObject *seq, *item;
15202
15203    assert(it != NULL);
15204    seq = it->it_seq;
15205    if (seq == NULL)
15206        return NULL;
15207    assert(_PyUnicode_CHECK(seq));
15208
15209    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15210        int kind = PyUnicode_KIND(seq);
15211        void *data = PyUnicode_DATA(seq);
15212        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15213        item = PyUnicode_FromOrdinal(chr);
15214        if (item != NULL)
15215            ++it->it_index;
15216        return item;
15217    }
15218
15219    Py_DECREF(seq);
15220    it->it_seq = NULL;
15221    return NULL;
15222}
15223
15224static PyObject *
15225unicodeiter_len(unicodeiterobject *it)
15226{
15227    Py_ssize_t len = 0;
15228    if (it->it_seq)
15229        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15230    return PyLong_FromSsize_t(len);
15231}
15232
15233PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15234
15235static PyObject *
15236unicodeiter_reduce(unicodeiterobject *it)
15237{
15238    if (it->it_seq != NULL) {
15239        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15240                             it->it_seq, it->it_index);
15241    } else {
15242        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15243        if (u == NULL)
15244            return NULL;
15245        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15246    }
15247}
15248
15249PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15250
15251static PyObject *
15252unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15253{
15254    Py_ssize_t index = PyLong_AsSsize_t(state);
15255    if (index == -1 && PyErr_Occurred())
15256        return NULL;
15257    if (it->it_seq != NULL) {
15258        if (index < 0)
15259            index = 0;
15260        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15261            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15262        it->it_index = index;
15263    }
15264    Py_RETURN_NONE;
15265}
15266
15267PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15268
15269static PyMethodDef unicodeiter_methods[] = {
15270    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15271     length_hint_doc},
15272    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15273     reduce_doc},
15274    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15275     setstate_doc},
15276    {NULL,      NULL}       /* sentinel */
15277};
15278
/* Type object of the iterator returned by unicode_iter().  The type is
   GC-enabled (Py_TPFLAGS_HAVE_GC) and provides dealloc, traverse, iter and
   iternext slots plus the method table above; all other slots are 0. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members (per the documented PyTypeObject slot order) */
};
15311
15312static PyObject *
15313unicode_iter(PyObject *seq)
15314{
15315    unicodeiterobject *it;
15316
15317    if (!PyUnicode_Check(seq)) {
15318        PyErr_BadInternalCall();
15319        return NULL;
15320    }
15321    if (PyUnicode_READY(seq) == -1)
15322        return NULL;
15323    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15324    if (it == NULL)
15325        return NULL;
15326    it->it_index = 0;
15327    Py_INCREF(seq);
15328    it->it_seq = seq;
15329    _PyObject_GC_TRACK(it);
15330    return (PyObject *)it;
15331}
15332
15333
15334size_t
15335Py_UNICODE_strlen(const Py_UNICODE *u)
15336{
15337    int res = 0;
15338    while(*u++)
15339        res++;
15340    return res;
15341}
15342
15343Py_UNICODE*
15344Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15345{
15346    Py_UNICODE *u = s1;
15347    while ((*u++ = *s2++));
15348    return s1;
15349}
15350
15351Py_UNICODE*
15352Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15353{
15354    Py_UNICODE *u = s1;
15355    while ((*u++ = *s2++))
15356        if (n-- == 0)
15357            break;
15358    return s1;
15359}
15360
15361Py_UNICODE*
15362Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15363{
15364    Py_UNICODE *u1 = s1;
15365    u1 += Py_UNICODE_strlen(u1);
15366    Py_UNICODE_strcpy(u1, s2);
15367    return s1;
15368}
15369
15370int
15371Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15372{
15373    while (*s1 && *s2 && *s1 == *s2)
15374        s1++, s2++;
15375    if (*s1 && *s2)
15376        return (*s1 < *s2) ? -1 : +1;
15377    if (*s1)
15378        return 1;
15379    if (*s2)
15380        return -1;
15381    return 0;
15382}
15383
15384int
15385Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15386{
15387    Py_UNICODE u1, u2;
15388    for (; n != 0; n--) {
15389        u1 = *s1;
15390        u2 = *s2;
15391        if (u1 != u2)
15392            return (u1 < u2) ? -1 : +1;
15393        if (u1 == '\0')
15394            return 0;
15395        s1++;
15396        s2++;
15397    }
15398    return 0;
15399}
15400
15401Py_UNICODE*
15402Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15403{
15404    const Py_UNICODE *p;
15405    for (p = s; *p; p++)
15406        if (*p == c)
15407            return (Py_UNICODE*)p;
15408    return NULL;
15409}
15410
15411Py_UNICODE*
15412Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15413{
15414    const Py_UNICODE *p;
15415    p = s + Py_UNICODE_strlen(s);
15416    while (p != s) {
15417        p--;
15418        if (*p == c)
15419            return (Py_UNICODE*)p;
15420    }
15421    return NULL;
15422}
15423
15424Py_UNICODE*
15425PyUnicode_AsUnicodeCopy(PyObject *unicode)
15426{
15427    Py_UNICODE *u, *copy;
15428    Py_ssize_t len, size;
15429
15430    if (!PyUnicode_Check(unicode)) {
15431        PyErr_BadArgument();
15432        return NULL;
15433    }
15434    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15435    if (u == NULL)
15436        return NULL;
15437    /* Ensure we won't overflow the size. */
15438    if (len > ((PY_SSIZE_T_MAX / (Py_ssize_t)sizeof(Py_UNICODE)) - 1)) {
15439        PyErr_NoMemory();
15440        return NULL;
15441    }
15442    size = len + 1; /* copy the null character */
15443    size *= sizeof(Py_UNICODE);
15444    copy = PyMem_Malloc(size);
15445    if (copy == NULL) {
15446        PyErr_NoMemory();
15447        return NULL;
15448    }
15449    memcpy(copy, u, size);
15450    return copy;
15451}
15452
15453/* A _string module, to export formatter_parser and formatter_field_name_split
15454   to the string.Formatter class implemented in Python. */
15455
15456static PyMethodDef _string_methods[] = {
15457    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15458     METH_O, PyDoc_STR("split the argument as a field name")},
15459    {"formatter_parser", (PyCFunction) formatter_parser,
15460     METH_O, PyDoc_STR("parse the argument as a format string")},
15461    {NULL, NULL}
15462};
15463
/* Module definition for the `_string` helper module.  Only the name, the
   docstring and the method table are populated; every remaining field is
   0 or NULL. */
static struct PyModuleDef _string_module = {
    PyModuleDef_HEAD_INIT,
    "_string",                          /* module name */
    PyDoc_STR("string helper module"),  /* module docstring */
    0,
    _string_methods,                    /* exported functions */
    NULL,
    NULL,
    NULL,
    NULL
};
15475
15476PyMODINIT_FUNC
15477PyInit__string(void)
15478{
15479    return PyModule_Create(&_string_module);
15480}
15481
15482
15483#ifdef __cplusplus
15484}
15485#endif
15486