unicodeobject.c revision ea71a525c34784d188252947f497ed251f9d4d5c
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* --- Globals ------------------------------------------------------------
51
52NOTE: In the interpreter's initialization phase, some globals are currently
53      initialized dynamically as needed. In the process Unicode objects may
54      be created before the Unicode type is ready.
55
56*/
57
58
59#ifdef __cplusplus
60extern "C" {
61#endif
62
/* Largest code point of Unicode 6.0: U+10FFFF (decimal 1,114,111). */
#define MAX_UNICODE 0x10ffff

/* Object check used inside assertions: the full consistency check (without
   content inspection) in debug builds, a plain type check otherwise. */
#ifndef Py_DEBUG
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#else
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#endif
71
/* Field accessors.  The underscore-prefixed forms that only cast and
   dereference perform no validation and can be used as lvalues; the forms
   containing assert() validate the object first. */

/* utf8 pointer stored in the PyCompactUnicodeObject header (unchecked). */
#define _PyUnicode_UTF8(op) \
    (((PyCompactUnicodeObject*)(op))->utf8)

/* UTF-8 buffer of a ready string: for compact ASCII objects this is the
   memory that immediately follows the PyASCIIObject header, for every
   other object it is the utf8 field. */
#define PyUnicode_UTF8(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((char*)((PyASCIIObject*)(op) + 1)) \
         : _PyUnicode_UTF8(op))

/* utf8_length field of the PyCompactUnicodeObject header (unchecked). */
#define _PyUnicode_UTF8_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->utf8_length)

/* UTF-8 length of a ready string: the character length for compact ASCII
   objects, the utf8_length field otherwise. */
#define PyUnicode_UTF8_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     assert(PyUnicode_IS_READY(op)), \
     PyUnicode_IS_COMPACT_ASCII(op) \
         ? ((PyASCIIObject*)(op))->length \
         : _PyUnicode_UTF8_LENGTH(op))

/* wstr pointer and its length (unchecked). */
#define _PyUnicode_WSTR(op) \
    (((PyASCIIObject*)(op))->wstr)
#define _PyUnicode_WSTR_LENGTH(op) \
    (((PyCompactUnicodeObject*)(op))->wstr_length)

/* length, state and hash fields of the PyASCIIObject header (unchecked). */
#define _PyUnicode_LENGTH(op) \
    (((PyASCIIObject *)(op))->length)
#define _PyUnicode_STATE(op) \
    (((PyASCIIObject *)(op))->state)
#define _PyUnicode_HASH(op) \
    (((PyASCIIObject *)(op))->hash)

/* Checked readers for state.kind and length. */
#define _PyUnicode_KIND(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->state.kind)
#define _PyUnicode_GET_LENGTH(op) \
    (assert(_PyUnicode_CHECK(op)), \
     ((PyASCIIObject *)(op))->length)

/* data.any pointer of a PyUnicodeObject (unchecked). */
#define _PyUnicode_DATA_ANY(op) \
    (((PyUnicodeObject*)(op))->data.any)
106
/* File-local replacement for PyUnicode_READY: asserts the object check,
   then evaluates to 0 when the string is already ready and to the result
   of _PyUnicode_Ready(op) otherwise. */
#undef PyUnicode_READY
#define PyUnicode_READY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_READY(op) ? _PyUnicode_Ready(op) : 0))
113
/* True if the utf8 pointer of op aliases its character data.  Must not be
   used on compact ASCII objects (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))

/* True if the wstr pointer of op aliases its character data.
   The comparison uses the macro parameter on both sides; it previously
   named a variable called "unicode" from the expansion site, so it only
   worked when the caller's variable happened to have that name. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
121
/* Non-zero when op has a UTF-8 block of its own: op is not compact ASCII,
   its utf8 pointer is set, and that pointer does not alias the character
   data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (!PyUnicode_IS_COMPACT_ASCII(op) \
      && _PyUnicode_UTF8(op) \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* Non-zero when op has a wstr block of its own: the wstr pointer is set
   and either the string is not ready yet or the pointer does not alias
   the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op) \
    (assert(_PyUnicode_CHECK(op)), \
     (_PyUnicode_WSTR(op) \
      && (!PyUnicode_IS_READY(op) \
          || _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
137
/* Generic helper macro to convert characters of different types.
   from_type and to_type have to be valid type names, begin and end
   are pointers to the source characters which should be of type
   "from_type *".  to is a pointer of type "to_type *" and points to the
   buffer where the result characters are written to.

   Each character is converted with a plain cast to to_type.  The copy is
   done four characters at a time, followed by a tail loop for the
   remaining 0-3 characters.  All three pointer arguments are evaluated
   exactly once, and each is parenthesized so that an expression argument
   (e.g. "buf + offset") is cast as a whole. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                 \
        const from_type *_iter = (begin);               \
        const from_type *_end = (end);                  \
        Py_ssize_t _n = (_end) - (_iter);               \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(_n, 4);         \
        while (_iter < (_unrolled_end)) {               \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
            _iter += 4; _to += 4;                       \
        }                                               \
        while (_iter < (_end))                          \
            *_to++ = (to_type) *_iter++;                \
    } while (0)
161
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
170static PyObject *interned = NULL;
171
172/* The empty Unicode object is shared to improve performance. */
173static PyObject *unicode_empty = NULL;
174
175#define _Py_INCREF_UNICODE_EMPTY()                      \
176    do {                                                \
177        if (unicode_empty != NULL)                      \
178            Py_INCREF(unicode_empty);                   \
179        else {                                          \
180            unicode_empty = PyUnicode_New(0, 0);        \
181            if (unicode_empty != NULL) {                \
182                Py_INCREF(unicode_empty);               \
183                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
184            }                                           \
185        }                                               \
186    } while (0)
187
188#define _Py_RETURN_UNICODE_EMPTY()                      \
189    do {                                                \
190        _Py_INCREF_UNICODE_EMPTY();                     \
191        return unicode_empty;                           \
192    } while (0)
193
194/* Forward declaration */
195Py_LOCAL_INLINE(int)
196_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);
197
198/* List of static strings. */
199static _Py_Identifier *static_strings = NULL;
200
201/* Single character Unicode strings in the Latin-1 range are being
202   shared as well. */
203static PyObject *unicode_latin1[256] = {NULL};
204
/* Fast detection of the most frequent whitespace characters.
   One entry per ASCII code point (128 entries); an entry is 1 for:
     0x09 CHARACTER TABULATION   0x0A LINE FEED
     0x0B LINE TABULATION        0x0C FORM FEED
     0x0D CARRIAGE RETURN        0x1C FILE SEPARATOR
     0x1D GROUP SEPARATOR        0x1E RECORD SEPARATOR
     0x1F UNIT SEPARATOR         0x20 SPACE */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 - 0x4F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 - 0x5F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 - 0x6F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 - 0x7F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
235
236/* forward */
237static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
238static PyObject* get_latin1_char(unsigned char ch);
239static int unicode_modifiable(PyObject *unicode);
240
241
242static PyObject *
243_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
244static PyObject *
245_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
246static PyObject *
247_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);
248
249static PyObject *
250unicode_encode_call_errorhandler(const char *errors,
251       PyObject **errorHandler,const char *encoding, const char *reason,
252       PyObject *unicode, PyObject **exceptionObject,
253       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
254
255static void
256raise_encode_exception(PyObject **exceptionObject,
257                       const char *encoding,
258                       PyObject *unicode,
259                       Py_ssize_t startpos, Py_ssize_t endpos,
260                       const char *reason);
261
/* Fast detection of line break characters in the ASCII range.
   One entry per ASCII code point (128 entries); an entry is 1 for the
   code points listed in the comments below.  The table is only ever read,
   so it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
289
290/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
291   This function is kept for backward compatibility with the old API. */
292Py_UNICODE
293PyUnicode_GetMax(void)
294{
295#ifdef Py_UNICODE_WIDE
296    return 0x10FFFF;
297#else
298    /* This is actually an illegal character, so it should
299       not be passed to unichr. */
300    return 0xFFFF;
301#endif
302}
303
304#ifdef Py_DEBUG
305int
306_PyUnicode_CheckConsistency(PyObject *op, int check_content)
307{
308    PyASCIIObject *ascii;
309    unsigned int kind;
310
311    assert(PyUnicode_Check(op));
312
313    ascii = (PyASCIIObject *)op;
314    kind = ascii->state.kind;
315
316    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
317        assert(kind == PyUnicode_1BYTE_KIND);
318        assert(ascii->state.ready == 1);
319    }
320    else {
321        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
322        void *data;
323
324        if (ascii->state.compact == 1) {
325            data = compact + 1;
326            assert(kind == PyUnicode_1BYTE_KIND
327                   || kind == PyUnicode_2BYTE_KIND
328                   || kind == PyUnicode_4BYTE_KIND);
329            assert(ascii->state.ascii == 0);
330            assert(ascii->state.ready == 1);
331            assert (compact->utf8 != data);
332        }
333        else {
334            PyUnicodeObject *unicode = (PyUnicodeObject *)op;
335
336            data = unicode->data.any;
337            if (kind == PyUnicode_WCHAR_KIND) {
338                assert(ascii->length == 0);
339                assert(ascii->hash == -1);
340                assert(ascii->state.compact == 0);
341                assert(ascii->state.ascii == 0);
342                assert(ascii->state.ready == 0);
343                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
344                assert(ascii->wstr != NULL);
345                assert(data == NULL);
346                assert(compact->utf8 == NULL);
347            }
348            else {
349                assert(kind == PyUnicode_1BYTE_KIND
350                       || kind == PyUnicode_2BYTE_KIND
351                       || kind == PyUnicode_4BYTE_KIND);
352                assert(ascii->state.compact == 0);
353                assert(ascii->state.ready == 1);
354                assert(data != NULL);
355                if (ascii->state.ascii) {
356                    assert (compact->utf8 == data);
357                    assert (compact->utf8_length == ascii->length);
358                }
359                else
360                    assert (compact->utf8 != data);
361            }
362        }
363        if (kind != PyUnicode_WCHAR_KIND) {
364            if (
365#if SIZEOF_WCHAR_T == 2
366                kind == PyUnicode_2BYTE_KIND
367#else
368                kind == PyUnicode_4BYTE_KIND
369#endif
370               )
371            {
372                assert(ascii->wstr == data);
373                assert(compact->wstr_length == ascii->length);
374            } else
375                assert(ascii->wstr != data);
376        }
377
378        if (compact->utf8 == NULL)
379            assert(compact->utf8_length == 0);
380        if (ascii->wstr == NULL)
381            assert(compact->wstr_length == 0);
382    }
383    /* check that the best kind is used */
384    if (check_content && kind != PyUnicode_WCHAR_KIND)
385    {
386        Py_ssize_t i;
387        Py_UCS4 maxchar = 0;
388        void *data;
389        Py_UCS4 ch;
390
391        data = PyUnicode_DATA(ascii);
392        for (i=0; i < ascii->length; i++)
393        {
394            ch = PyUnicode_READ(kind, data, i);
395            if (ch > maxchar)
396                maxchar = ch;
397        }
398        if (kind == PyUnicode_1BYTE_KIND) {
399            if (ascii->state.ascii == 0) {
400                assert(maxchar >= 128);
401                assert(maxchar <= 255);
402            }
403            else
404                assert(maxchar < 128);
405        }
406        else if (kind == PyUnicode_2BYTE_KIND) {
407            assert(maxchar >= 0x100);
408            assert(maxchar <= 0xFFFF);
409        }
410        else {
411            assert(maxchar >= 0x10000);
412            assert(maxchar <= MAX_UNICODE);
413        }
414        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
415    }
416    return 1;
417}
418#endif
419
420static PyObject*
421unicode_result_wchar(PyObject *unicode)
422{
423#ifndef Py_DEBUG
424    Py_ssize_t len;
425
426    len = _PyUnicode_WSTR_LENGTH(unicode);
427    if (len == 0) {
428        Py_DECREF(unicode);
429        _Py_RETURN_UNICODE_EMPTY();
430    }
431
432    if (len == 1) {
433        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
434        if ((Py_UCS4)ch < 256) {
435            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
436            Py_DECREF(unicode);
437            return latin1_char;
438        }
439    }
440
441    if (_PyUnicode_Ready(unicode) < 0) {
442        Py_DECREF(unicode);
443        return NULL;
444    }
445#else
446    assert(Py_REFCNT(unicode) == 1);
447
448    /* don't make the result ready in debug mode to ensure that the caller
449       makes the string ready before using it */
450    assert(_PyUnicode_CheckConsistency(unicode, 1));
451#endif
452    return unicode;
453}
454
455static PyObject*
456unicode_result_ready(PyObject *unicode)
457{
458    Py_ssize_t length;
459
460    length = PyUnicode_GET_LENGTH(unicode);
461    if (length == 0) {
462        if (unicode != unicode_empty) {
463            Py_DECREF(unicode);
464            _Py_RETURN_UNICODE_EMPTY();
465        }
466        return unicode_empty;
467    }
468
469    if (length == 1) {
470        void *data = PyUnicode_DATA(unicode);
471        int kind = PyUnicode_KIND(unicode);
472        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
473        if (ch < 256) {
474            PyObject *latin1_char = unicode_latin1[ch];
475            if (latin1_char != NULL) {
476                if (unicode != latin1_char) {
477                    Py_INCREF(latin1_char);
478                    Py_DECREF(unicode);
479                }
480                return latin1_char;
481            }
482            else {
483                assert(_PyUnicode_CheckConsistency(unicode, 1));
484                Py_INCREF(unicode);
485                unicode_latin1[ch] = unicode;
486                return unicode;
487            }
488        }
489    }
490
491    assert(_PyUnicode_CheckConsistency(unicode, 1));
492    return unicode;
493}
494
495static PyObject*
496unicode_result(PyObject *unicode)
497{
498    assert(_PyUnicode_CHECK(unicode));
499    if (PyUnicode_IS_READY(unicode))
500        return unicode_result_ready(unicode);
501    else
502        return unicode_result_wchar(unicode);
503}
504
505static PyObject*
506unicode_result_unchanged(PyObject *unicode)
507{
508    if (PyUnicode_CheckExact(unicode)) {
509        if (PyUnicode_READY(unicode) == -1)
510            return NULL;
511        Py_INCREF(unicode);
512        return unicode;
513    }
514    else
515        /* Subtype -- return genuine unicode string with the same value. */
516        return _PyUnicode_Copy(unicode);
517}
518
#if defined(HAVE_MBCS)
/* Windows version information; only declared when HAVE_MBCS is defined. */
static OSVERSIONINFOEX winver;
#endif
522
523/* --- Bloom Filters ----------------------------------------------------- */
524
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask, using the low bits of
   each Unicode character (ch & (BLOOM_WIDTH - 1)) as the bit index. */
528
529/* the linebreak mask is set up by Unicode_Init below */
530
531#if LONG_BIT >= 128
532#define BLOOM_WIDTH 128
533#elif LONG_BIT >= 64
534#define BLOOM_WIDTH 64
535#elif LONG_BIT >= 32
536#define BLOOM_WIDTH 32
537#else
538#error "LONG_BIT is smaller than 32"
539#endif
540
541#define BLOOM_MASK unsigned long
542
543static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
544
545#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
546
547#define BLOOM_LINEBREAK(ch)                                             \
548    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
549     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
550
551Py_LOCAL_INLINE(BLOOM_MASK)
552make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
553{
554#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
555    do {                                               \
556        TYPE *data = (TYPE *)PTR;                      \
557        TYPE *end = data + LEN;                        \
558        Py_UCS4 ch;                                    \
559        for (; data != end; data++) {                  \
560            ch = *data;                                \
561            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
562        }                                              \
563        break;                                         \
564    } while (0)
565
566    /* calculate simple bloom-style bitmask for a given unicode string */
567
568    BLOOM_MASK mask;
569
570    mask = 0;
571    switch (kind) {
572    case PyUnicode_1BYTE_KIND:
573        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
574        break;
575    case PyUnicode_2BYTE_KIND:
576        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
577        break;
578    case PyUnicode_4BYTE_KIND:
579        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
580        break;
581    default:
582        assert(0);
583    }
584    return mask;
585
586#undef BLOOM_UPDATE
587}
588
589/* Compilation of templated routines */
590
591#include "stringlib/asciilib.h"
592#include "stringlib/fastsearch.h"
593#include "stringlib/partition.h"
594#include "stringlib/split.h"
595#include "stringlib/count.h"
596#include "stringlib/find.h"
597#include "stringlib/find_max_char.h"
598#include "stringlib/localeutil.h"
599#include "stringlib/undef.h"
600
601#include "stringlib/ucs1lib.h"
602#include "stringlib/fastsearch.h"
603#include "stringlib/partition.h"
604#include "stringlib/split.h"
605#include "stringlib/count.h"
606#include "stringlib/find.h"
607#include "stringlib/replace.h"
608#include "stringlib/find_max_char.h"
609#include "stringlib/localeutil.h"
610#include "stringlib/undef.h"
611
612#include "stringlib/ucs2lib.h"
613#include "stringlib/fastsearch.h"
614#include "stringlib/partition.h"
615#include "stringlib/split.h"
616#include "stringlib/count.h"
617#include "stringlib/find.h"
618#include "stringlib/replace.h"
619#include "stringlib/find_max_char.h"
620#include "stringlib/localeutil.h"
621#include "stringlib/undef.h"
622
623#include "stringlib/ucs4lib.h"
624#include "stringlib/fastsearch.h"
625#include "stringlib/partition.h"
626#include "stringlib/split.h"
627#include "stringlib/count.h"
628#include "stringlib/find.h"
629#include "stringlib/replace.h"
630#include "stringlib/find_max_char.h"
631#include "stringlib/localeutil.h"
632#include "stringlib/undef.h"
633
634#include "stringlib/unicodedefs.h"
635#include "stringlib/fastsearch.h"
636#include "stringlib/count.h"
637#include "stringlib/find.h"
638#include "stringlib/undef.h"
639
640/* --- Unicode Object ----------------------------------------------------- */
641
642static PyObject *
643fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
644
645Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
646                                     Py_ssize_t size, Py_UCS4 ch,
647                                     int direction)
648{
649    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
650
651    switch (kind) {
652    case PyUnicode_1BYTE_KIND:
653        {
654            Py_UCS1 ch1 = (Py_UCS1) ch;
655            if (ch1 == ch)
656                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
657            else
658                return -1;
659        }
660    case PyUnicode_2BYTE_KIND:
661        {
662            Py_UCS2 ch2 = (Py_UCS2) ch;
663            if (ch2 == ch)
664                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
665            else
666                return -1;
667        }
668    case PyUnicode_4BYTE_KIND:
669        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
670    default:
671        assert(0);
672        return -1;
673    }
674}
675
676#ifdef Py_DEBUG
677/* Fill the data of an Unicode string with invalid characters to detect bugs
678   earlier.
679
680   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
681   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
682   invalid character in Unicode 6.0. */
683static void
684unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
685{
686    int kind = PyUnicode_KIND(unicode);
687    Py_UCS1 *data = PyUnicode_1BYTE_DATA(unicode);
688    Py_ssize_t length = _PyUnicode_LENGTH(unicode);
689    if (length <= old_length)
690        return;
691    memset(data + old_length * kind, 0xff, (length - old_length) * kind);
692}
693#endif
694
695static PyObject*
696resize_compact(PyObject *unicode, Py_ssize_t length)
697{
698    Py_ssize_t char_size;
699    Py_ssize_t struct_size;
700    Py_ssize_t new_size;
701    int share_wstr;
702    PyObject *new_unicode;
703#ifdef Py_DEBUG
704    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
705#endif
706
707    assert(unicode_modifiable(unicode));
708    assert(PyUnicode_IS_READY(unicode));
709    assert(PyUnicode_IS_COMPACT(unicode));
710
711    char_size = PyUnicode_KIND(unicode);
712    if (PyUnicode_IS_ASCII(unicode))
713        struct_size = sizeof(PyASCIIObject);
714    else
715        struct_size = sizeof(PyCompactUnicodeObject);
716    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
717
718    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
719        PyErr_NoMemory();
720        return NULL;
721    }
722    new_size = (struct_size + (length + 1) * char_size);
723
724    _Py_DEC_REFTOTAL;
725    _Py_ForgetReference(unicode);
726
727    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
728    if (new_unicode == NULL) {
729        _Py_NewReference(unicode);
730        PyErr_NoMemory();
731        return NULL;
732    }
733    unicode = new_unicode;
734    _Py_NewReference(unicode);
735
736    _PyUnicode_LENGTH(unicode) = length;
737    if (share_wstr) {
738        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
739        if (!PyUnicode_IS_ASCII(unicode))
740            _PyUnicode_WSTR_LENGTH(unicode) = length;
741    }
742    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
743        PyObject_DEL(_PyUnicode_WSTR(unicode));
744        _PyUnicode_WSTR(unicode) = NULL;
745    }
746#ifdef Py_DEBUG
747    unicode_fill_invalid(unicode, old_length);
748#endif
749    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
750                    length, 0);
751    assert(_PyUnicode_CheckConsistency(unicode, 0));
752    return unicode;
753}
754
755static int
756resize_inplace(PyObject *unicode, Py_ssize_t length)
757{
758    wchar_t *wstr;
759    Py_ssize_t new_size;
760    assert(!PyUnicode_IS_COMPACT(unicode));
761    assert(Py_REFCNT(unicode) == 1);
762
763    if (PyUnicode_IS_READY(unicode)) {
764        Py_ssize_t char_size;
765        int share_wstr, share_utf8;
766        void *data;
767#ifdef Py_DEBUG
768        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
769#endif
770
771        data = _PyUnicode_DATA_ANY(unicode);
772        char_size = PyUnicode_KIND(unicode);
773        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
774        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);
775
776        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
777            PyErr_NoMemory();
778            return -1;
779        }
780        new_size = (length + 1) * char_size;
781
782        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
783        {
784            PyObject_DEL(_PyUnicode_UTF8(unicode));
785            _PyUnicode_UTF8(unicode) = NULL;
786            _PyUnicode_UTF8_LENGTH(unicode) = 0;
787        }
788
789        data = (PyObject *)PyObject_REALLOC(data, new_size);
790        if (data == NULL) {
791            PyErr_NoMemory();
792            return -1;
793        }
794        _PyUnicode_DATA_ANY(unicode) = data;
795        if (share_wstr) {
796            _PyUnicode_WSTR(unicode) = data;
797            _PyUnicode_WSTR_LENGTH(unicode) = length;
798        }
799        if (share_utf8) {
800            _PyUnicode_UTF8(unicode) = data;
801            _PyUnicode_UTF8_LENGTH(unicode) = length;
802        }
803        _PyUnicode_LENGTH(unicode) = length;
804        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
805#ifdef Py_DEBUG
806        unicode_fill_invalid(unicode, old_length);
807#endif
808        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
809            assert(_PyUnicode_CheckConsistency(unicode, 0));
810            return 0;
811        }
812    }
813    assert(_PyUnicode_WSTR(unicode) != NULL);
814
815    /* check for integer overflow */
816    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
817        PyErr_NoMemory();
818        return -1;
819    }
820    new_size = sizeof(wchar_t) * (length + 1);
821    wstr =  _PyUnicode_WSTR(unicode);
822    wstr = PyObject_REALLOC(wstr, new_size);
823    if (!wstr) {
824        PyErr_NoMemory();
825        return -1;
826    }
827    _PyUnicode_WSTR(unicode) = wstr;
828    _PyUnicode_WSTR(unicode)[length] = 0;
829    _PyUnicode_WSTR_LENGTH(unicode) = length;
830    assert(_PyUnicode_CheckConsistency(unicode, 0));
831    return 0;
832}
833
834static PyObject*
835resize_copy(PyObject *unicode, Py_ssize_t length)
836{
837    Py_ssize_t copy_length;
838    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
839        PyObject *copy;
840
841        if (PyUnicode_READY(unicode) == -1)
842            return NULL;
843
844        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
845        if (copy == NULL)
846            return NULL;
847
848        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
849        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
850        return copy;
851    }
852    else {
853        PyObject *w;
854
855        w = (PyObject*)_PyUnicode_New(length);
856        if (w == NULL)
857            return NULL;
858        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
859        copy_length = Py_MIN(copy_length, length);
860        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
861                  copy_length * sizeof(wchar_t));
862        return w;
863    }
864}
865
866/* We allocate one more byte to make sure the string is
867   Ux0000 terminated; some code (e.g. new_identifier)
868   relies on that.
869
870   XXX This allocator could further be enhanced by assuring that the
871   free list never reduces its size below 1.
872
873*/
874
875static PyUnicodeObject *
876_PyUnicode_New(Py_ssize_t length)
877{
878    register PyUnicodeObject *unicode;
879    size_t new_size;
880
881    /* Optimization for empty strings */
882    if (length == 0 && unicode_empty != NULL) {
883        Py_INCREF(unicode_empty);
884        return (PyUnicodeObject*)unicode_empty;
885    }
886
887    /* Ensure we won't overflow the size. */
888    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
889        return (PyUnicodeObject *)PyErr_NoMemory();
890    }
891    if (length < 0) {
892        PyErr_SetString(PyExc_SystemError,
893                        "Negative size passed to _PyUnicode_New");
894        return NULL;
895    }
896
897    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
898    if (unicode == NULL)
899        return NULL;
900    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
901    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
902    if (!_PyUnicode_WSTR(unicode)) {
903        Py_DECREF(unicode);
904        PyErr_NoMemory();
905        return NULL;
906    }
907
908    /* Initialize the first element to guard against cases where
909     * the caller fails before initializing str -- unicode_resize()
910     * reads str[0], and the Keep-Alive optimization can keep memory
911     * allocated for str alive across a call to unicode_dealloc(unicode).
912     * We don't want unicode_resize to read uninitialized memory in
913     * that case.
914     */
915    _PyUnicode_WSTR(unicode)[0] = 0;
916    _PyUnicode_WSTR(unicode)[length] = 0;
917    _PyUnicode_WSTR_LENGTH(unicode) = length;
918    _PyUnicode_HASH(unicode) = -1;
919    _PyUnicode_STATE(unicode).interned = 0;
920    _PyUnicode_STATE(unicode).kind = 0;
921    _PyUnicode_STATE(unicode).compact = 0;
922    _PyUnicode_STATE(unicode).ready = 0;
923    _PyUnicode_STATE(unicode).ascii = 0;
924    _PyUnicode_DATA_ANY(unicode) = NULL;
925    _PyUnicode_LENGTH(unicode) = 0;
926    _PyUnicode_UTF8(unicode) = NULL;
927    _PyUnicode_UTF8_LENGTH(unicode) = 0;
928    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
929    return unicode;
930}
931
932static const char*
933unicode_kind_name(PyObject *unicode)
934{
935    /* don't check consistency: unicode_kind_name() is called from
936       _PyUnicode_Dump() */
937    if (!PyUnicode_IS_COMPACT(unicode))
938    {
939        if (!PyUnicode_IS_READY(unicode))
940            return "wstr";
941        switch (PyUnicode_KIND(unicode))
942        {
943        case PyUnicode_1BYTE_KIND:
944            if (PyUnicode_IS_ASCII(unicode))
945                return "legacy ascii";
946            else
947                return "legacy latin1";
948        case PyUnicode_2BYTE_KIND:
949            return "legacy UCS2";
950        case PyUnicode_4BYTE_KIND:
951            return "legacy UCS4";
952        default:
953            return "<legacy invalid kind>";
954        }
955    }
956    assert(PyUnicode_IS_READY(unicode));
957    switch (PyUnicode_KIND(unicode)) {
958    case PyUnicode_1BYTE_KIND:
959        if (PyUnicode_IS_ASCII(unicode))
960            return "ascii";
961        else
962            return "latin1";
963    case PyUnicode_2BYTE_KIND:
964        return "UCS2";
965    case PyUnicode_4BYTE_KIND:
966        return "UCS4";
967    default:
968        return "<invalid compact kind>";
969    }
970}
971
972#ifdef Py_DEBUG
973/* Functions wrapping macros for use in debugger */
/* Function form of the PyUnicode_UTF8() macro, callable from a debugger. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
977
/* Function form of the _PyUnicode_COMPACT_DATA() macro, callable from a
   debugger. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *compact_data = _PyUnicode_COMPACT_DATA(unicode);
    return compact_data;
}
981void *_PyUnicode_data(void *unicode){
982    printf("obj %p\n", unicode);
983    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
984    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
985    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
986    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
987    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
988    return PyUnicode_DATA(unicode);
989}
990
991void
992_PyUnicode_Dump(PyObject *op)
993{
994    PyASCIIObject *ascii = (PyASCIIObject *)op;
995    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
996    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
997    void *data;
998
999    if (ascii->state.compact)
1000    {
1001        if (ascii->state.ascii)
1002            data = (ascii + 1);
1003        else
1004            data = (compact + 1);
1005    }
1006    else
1007        data = unicode->data.any;
1008    printf("%s: len=%zu, ",unicode_kind_name(op), ascii->length);
1009
1010    if (ascii->wstr == data)
1011        printf("shared ");
1012    printf("wstr=%p", ascii->wstr);
1013
1014    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1015        printf(" (%zu), ", compact->wstr_length);
1016        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1017            printf("shared ");
1018        printf("utf8=%p (%zu)", compact->utf8, compact->utf8_length);
1019    }
1020    printf(", data=%p\n", data);
1021}
1022#endif
1023
1024PyObject *
1025PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1026{
1027    PyObject *obj;
1028    PyCompactUnicodeObject *unicode;
1029    void *data;
1030    enum PyUnicode_Kind kind;
1031    int is_sharing, is_ascii;
1032    Py_ssize_t char_size;
1033    Py_ssize_t struct_size;
1034
1035    /* Optimization for empty strings */
1036    if (size == 0 && unicode_empty != NULL) {
1037        Py_INCREF(unicode_empty);
1038        return unicode_empty;
1039    }
1040
1041    is_ascii = 0;
1042    is_sharing = 0;
1043    struct_size = sizeof(PyCompactUnicodeObject);
1044    if (maxchar < 128) {
1045        kind = PyUnicode_1BYTE_KIND;
1046        char_size = 1;
1047        is_ascii = 1;
1048        struct_size = sizeof(PyASCIIObject);
1049    }
1050    else if (maxchar < 256) {
1051        kind = PyUnicode_1BYTE_KIND;
1052        char_size = 1;
1053    }
1054    else if (maxchar < 65536) {
1055        kind = PyUnicode_2BYTE_KIND;
1056        char_size = 2;
1057        if (sizeof(wchar_t) == 2)
1058            is_sharing = 1;
1059    }
1060    else {
1061        if (maxchar > MAX_UNICODE) {
1062            PyErr_SetString(PyExc_SystemError,
1063                            "invalid maximum character passed to PyUnicode_New");
1064            return NULL;
1065        }
1066        kind = PyUnicode_4BYTE_KIND;
1067        char_size = 4;
1068        if (sizeof(wchar_t) == 4)
1069            is_sharing = 1;
1070    }
1071
1072    /* Ensure we won't overflow the size. */
1073    if (size < 0) {
1074        PyErr_SetString(PyExc_SystemError,
1075                        "Negative size passed to PyUnicode_New");
1076        return NULL;
1077    }
1078    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1079        return PyErr_NoMemory();
1080
1081    /* Duplicated allocation code from _PyObject_New() instead of a call to
1082     * PyObject_New() so we are able to allocate space for the object and
1083     * it's data buffer.
1084     */
1085    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1086    if (obj == NULL)
1087        return PyErr_NoMemory();
1088    obj = PyObject_INIT(obj, &PyUnicode_Type);
1089    if (obj == NULL)
1090        return NULL;
1091
1092    unicode = (PyCompactUnicodeObject *)obj;
1093    if (is_ascii)
1094        data = ((PyASCIIObject*)obj) + 1;
1095    else
1096        data = unicode + 1;
1097    _PyUnicode_LENGTH(unicode) = size;
1098    _PyUnicode_HASH(unicode) = -1;
1099    _PyUnicode_STATE(unicode).interned = 0;
1100    _PyUnicode_STATE(unicode).kind = kind;
1101    _PyUnicode_STATE(unicode).compact = 1;
1102    _PyUnicode_STATE(unicode).ready = 1;
1103    _PyUnicode_STATE(unicode).ascii = is_ascii;
1104    if (is_ascii) {
1105        ((char*)data)[size] = 0;
1106        _PyUnicode_WSTR(unicode) = NULL;
1107    }
1108    else if (kind == PyUnicode_1BYTE_KIND) {
1109        ((char*)data)[size] = 0;
1110        _PyUnicode_WSTR(unicode) = NULL;
1111        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1112        unicode->utf8 = NULL;
1113        unicode->utf8_length = 0;
1114    }
1115    else {
1116        unicode->utf8 = NULL;
1117        unicode->utf8_length = 0;
1118        if (kind == PyUnicode_2BYTE_KIND)
1119            ((Py_UCS2*)data)[size] = 0;
1120        else /* kind == PyUnicode_4BYTE_KIND */
1121            ((Py_UCS4*)data)[size] = 0;
1122        if (is_sharing) {
1123            _PyUnicode_WSTR_LENGTH(unicode) = size;
1124            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1125        }
1126        else {
1127            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1128            _PyUnicode_WSTR(unicode) = NULL;
1129        }
1130    }
1131#ifdef Py_DEBUG
1132    unicode_fill_invalid((PyObject*)unicode, 0);
1133#endif
1134    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1135    return obj;
1136}
1137
1138#if SIZEOF_WCHAR_T == 2
1139/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1140   will decode surrogate pairs, the other conversions are implemented as macros
1141   for efficiency.
1142
1143   This function assumes that unicode can hold one more code point than wstr
1144   characters for a terminating null character. */
1145static void
1146unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1147                              PyObject *unicode)
1148{
1149    const wchar_t *iter;
1150    Py_UCS4 *ucs4_out;
1151
1152    assert(unicode != NULL);
1153    assert(_PyUnicode_CHECK(unicode));
1154    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1155    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1156
1157    for (iter = begin; iter < end; ) {
1158        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1159                           _PyUnicode_GET_LENGTH(unicode)));
1160        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1161            && (iter+1) < end
1162            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1163        {
1164            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1165            iter += 2;
1166        }
1167        else {
1168            *ucs4_out++ = *iter;
1169            iter++;
1170        }
1171    }
1172    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1173                        _PyUnicode_GET_LENGTH(unicode)));
1174
1175}
1176#endif
1177
1178static int
1179unicode_check_modifiable(PyObject *unicode)
1180{
1181    if (!unicode_modifiable(unicode)) {
1182        PyErr_SetString(PyExc_SystemError,
1183                        "Cannot modify a string currently used");
1184        return -1;
1185    }
1186    return 0;
1187}
1188
1189static int
1190_copy_characters(PyObject *to, Py_ssize_t to_start,
1191                 PyObject *from, Py_ssize_t from_start,
1192                 Py_ssize_t how_many, int check_maxchar)
1193{
1194    unsigned int from_kind, to_kind;
1195    void *from_data, *to_data;
1196
1197    assert(0 <= how_many);
1198    assert(0 <= from_start);
1199    assert(0 <= to_start);
1200    assert(PyUnicode_Check(from));
1201    assert(PyUnicode_IS_READY(from));
1202    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1203
1204    assert(PyUnicode_Check(to));
1205    assert(PyUnicode_IS_READY(to));
1206    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1207
1208    if (how_many == 0)
1209        return 0;
1210
1211    from_kind = PyUnicode_KIND(from);
1212    from_data = PyUnicode_DATA(from);
1213    to_kind = PyUnicode_KIND(to);
1214    to_data = PyUnicode_DATA(to);
1215
1216#ifdef Py_DEBUG
1217    if (!check_maxchar
1218        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1219    {
1220        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1221        Py_UCS4 ch;
1222        Py_ssize_t i;
1223        for (i=0; i < how_many; i++) {
1224            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1225            assert(ch <= to_maxchar);
1226        }
1227    }
1228#endif
1229
1230    if (from_kind == to_kind) {
1231        if (check_maxchar
1232            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1233        {
1234            /* Writing Latin-1 characters into an ASCII string requires to
1235               check that all written characters are pure ASCII */
1236            Py_UCS4 max_char;
1237            max_char = ucs1lib_find_max_char(from_data,
1238                                             (Py_UCS1*)from_data + how_many);
1239            if (max_char >= 128)
1240                return -1;
1241        }
1242        Py_MEMCPY((char*)to_data + to_kind * to_start,
1243                  (char*)from_data + from_kind * from_start,
1244                  to_kind * how_many);
1245    }
1246    else if (from_kind == PyUnicode_1BYTE_KIND
1247             && to_kind == PyUnicode_2BYTE_KIND)
1248    {
1249        _PyUnicode_CONVERT_BYTES(
1250            Py_UCS1, Py_UCS2,
1251            PyUnicode_1BYTE_DATA(from) + from_start,
1252            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1253            PyUnicode_2BYTE_DATA(to) + to_start
1254            );
1255    }
1256    else if (from_kind == PyUnicode_1BYTE_KIND
1257             && to_kind == PyUnicode_4BYTE_KIND)
1258    {
1259        _PyUnicode_CONVERT_BYTES(
1260            Py_UCS1, Py_UCS4,
1261            PyUnicode_1BYTE_DATA(from) + from_start,
1262            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1263            PyUnicode_4BYTE_DATA(to) + to_start
1264            );
1265    }
1266    else if (from_kind == PyUnicode_2BYTE_KIND
1267             && to_kind == PyUnicode_4BYTE_KIND)
1268    {
1269        _PyUnicode_CONVERT_BYTES(
1270            Py_UCS2, Py_UCS4,
1271            PyUnicode_2BYTE_DATA(from) + from_start,
1272            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1273            PyUnicode_4BYTE_DATA(to) + to_start
1274            );
1275    }
1276    else {
1277        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1278
1279        if (!check_maxchar) {
1280            if (from_kind == PyUnicode_2BYTE_KIND
1281                && to_kind == PyUnicode_1BYTE_KIND)
1282            {
1283                _PyUnicode_CONVERT_BYTES(
1284                    Py_UCS2, Py_UCS1,
1285                    PyUnicode_2BYTE_DATA(from) + from_start,
1286                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1287                    PyUnicode_1BYTE_DATA(to) + to_start
1288                    );
1289            }
1290            else if (from_kind == PyUnicode_4BYTE_KIND
1291                     && to_kind == PyUnicode_1BYTE_KIND)
1292            {
1293                _PyUnicode_CONVERT_BYTES(
1294                    Py_UCS4, Py_UCS1,
1295                    PyUnicode_4BYTE_DATA(from) + from_start,
1296                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1297                    PyUnicode_1BYTE_DATA(to) + to_start
1298                    );
1299            }
1300            else if (from_kind == PyUnicode_4BYTE_KIND
1301                     && to_kind == PyUnicode_2BYTE_KIND)
1302            {
1303                _PyUnicode_CONVERT_BYTES(
1304                    Py_UCS4, Py_UCS2,
1305                    PyUnicode_4BYTE_DATA(from) + from_start,
1306                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1307                    PyUnicode_2BYTE_DATA(to) + to_start
1308                    );
1309            }
1310            else {
1311                assert(0);
1312                return -1;
1313            }
1314        }
1315        else {
1316            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1317            Py_UCS4 ch;
1318            Py_ssize_t i;
1319
1320            for (i=0; i < how_many; i++) {
1321                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1322                if (ch > to_maxchar)
1323                    return -1;
1324                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1325            }
1326        }
1327    }
1328    return 0;
1329}
1330
1331void
1332_PyUnicode_FastCopyCharacters(
1333    PyObject *to, Py_ssize_t to_start,
1334    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1335{
1336    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1337}
1338
1339Py_ssize_t
1340PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1341                         PyObject *from, Py_ssize_t from_start,
1342                         Py_ssize_t how_many)
1343{
1344    int err;
1345
1346    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1347        PyErr_BadInternalCall();
1348        return -1;
1349    }
1350
1351    if (PyUnicode_READY(from) == -1)
1352        return -1;
1353    if (PyUnicode_READY(to) == -1)
1354        return -1;
1355
1356    if (from_start < 0) {
1357        PyErr_SetString(PyExc_IndexError, "string index out of range");
1358        return -1;
1359    }
1360    if (to_start < 0) {
1361        PyErr_SetString(PyExc_IndexError, "string index out of range");
1362        return -1;
1363    }
1364    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1365    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1366        PyErr_Format(PyExc_SystemError,
1367                     "Cannot write %zi characters at %zi "
1368                     "in a string of %zi characters",
1369                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1370        return -1;
1371    }
1372
1373    if (how_many == 0)
1374        return 0;
1375
1376    if (unicode_check_modifiable(to))
1377        return -1;
1378
1379    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1380    if (err) {
1381        PyErr_Format(PyExc_SystemError,
1382                     "Cannot copy %s characters "
1383                     "into a string of %s characters",
1384                     unicode_kind_name(from),
1385                     unicode_kind_name(to));
1386        return -1;
1387    }
1388    return how_many;
1389}
1390
1391/* Find the maximum code point and count the number of surrogate pairs so a
1392   correct string length can be computed before converting a string to UCS4.
1393   This function counts single surrogates as a character and not as a pair.
1394
1395   Return 0 on success, or -1 on error. */
1396static int
1397find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1398                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1399{
1400    const wchar_t *iter;
1401    Py_UCS4 ch;
1402
1403    assert(num_surrogates != NULL && maxchar != NULL);
1404    *num_surrogates = 0;
1405    *maxchar = 0;
1406
1407    for (iter = begin; iter < end; ) {
1408#if SIZEOF_WCHAR_T == 2
1409        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1410            && (iter+1) < end
1411            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1412        {
1413            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1414            ++(*num_surrogates);
1415            iter += 2;
1416        }
1417        else
1418#endif
1419        {
1420            ch = *iter;
1421            iter++;
1422        }
1423        if (ch > *maxchar) {
1424            *maxchar = ch;
1425            if (*maxchar > MAX_UNICODE) {
1426                PyErr_Format(PyExc_ValueError,
1427                             "character U+%x is not in range [U+0000; U+10ffff]",
1428                             ch);
1429                return -1;
1430            }
1431        }
1432    }
1433    return 0;
1434}
1435
1436int
1437_PyUnicode_Ready(PyObject *unicode)
1438{
1439    wchar_t *end;
1440    Py_UCS4 maxchar = 0;
1441    Py_ssize_t num_surrogates;
1442#if SIZEOF_WCHAR_T == 2
1443    Py_ssize_t length_wo_surrogates;
1444#endif
1445
1446    /* _PyUnicode_Ready() is only intended for old-style API usage where
1447       strings were created using _PyObject_New() and where no canonical
1448       representation (the str field) has been set yet aka strings
1449       which are not yet ready. */
1450    assert(_PyUnicode_CHECK(unicode));
1451    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
1452    assert(_PyUnicode_WSTR(unicode) != NULL);
1453    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
1454    assert(_PyUnicode_UTF8(unicode) == NULL);
1455    /* Actually, it should neither be interned nor be anything else: */
1456    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);
1457
1458    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
1459    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
1460                                &maxchar, &num_surrogates) == -1)
1461        return -1;
1462
1463    if (maxchar < 256) {
1464        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
1465        if (!_PyUnicode_DATA_ANY(unicode)) {
1466            PyErr_NoMemory();
1467            return -1;
1468        }
1469        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
1470                                _PyUnicode_WSTR(unicode), end,
1471                                PyUnicode_1BYTE_DATA(unicode));
1472        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1473        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1474        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
1475        if (maxchar < 128) {
1476            _PyUnicode_STATE(unicode).ascii = 1;
1477            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
1478            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1479        }
1480        else {
1481            _PyUnicode_STATE(unicode).ascii = 0;
1482            _PyUnicode_UTF8(unicode) = NULL;
1483            _PyUnicode_UTF8_LENGTH(unicode) = 0;
1484        }
1485        PyObject_FREE(_PyUnicode_WSTR(unicode));
1486        _PyUnicode_WSTR(unicode) = NULL;
1487        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1488    }
1489    /* In this case we might have to convert down from 4-byte native
1490       wchar_t to 2-byte unicode. */
1491    else if (maxchar < 65536) {
1492        assert(num_surrogates == 0 &&
1493               "FindMaxCharAndNumSurrogatePairs() messed up");
1494
1495#if SIZEOF_WCHAR_T == 2
1496        /* We can share representations and are done. */
1497        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1498        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1499        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1500        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1501        _PyUnicode_UTF8(unicode) = NULL;
1502        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1503#else
1504        /* sizeof(wchar_t) == 4 */
1505        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
1506            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
1507        if (!_PyUnicode_DATA_ANY(unicode)) {
1508            PyErr_NoMemory();
1509            return -1;
1510        }
1511        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
1512                                _PyUnicode_WSTR(unicode), end,
1513                                PyUnicode_2BYTE_DATA(unicode));
1514        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
1515        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1516        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
1517        _PyUnicode_UTF8(unicode) = NULL;
1518        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1519        PyObject_FREE(_PyUnicode_WSTR(unicode));
1520        _PyUnicode_WSTR(unicode) = NULL;
1521        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1522#endif
1523    }
1524    /* maxchar exeeds 16 bit, wee need 4 bytes for unicode characters */
1525    else {
1526#if SIZEOF_WCHAR_T == 2
1527        /* in case the native representation is 2-bytes, we need to allocate a
1528           new normalized 4-byte version. */
1529        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
1530        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
1531        if (!_PyUnicode_DATA_ANY(unicode)) {
1532            PyErr_NoMemory();
1533            return -1;
1534        }
1535        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
1536        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1537        _PyUnicode_UTF8(unicode) = NULL;
1538        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1539        /* unicode_convert_wchar_to_ucs4() requires a ready string */
1540        _PyUnicode_STATE(unicode).ready = 1;
1541        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
1542        PyObject_FREE(_PyUnicode_WSTR(unicode));
1543        _PyUnicode_WSTR(unicode) = NULL;
1544        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1545#else
1546        assert(num_surrogates == 0);
1547
1548        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
1549        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
1550        _PyUnicode_UTF8(unicode) = NULL;
1551        _PyUnicode_UTF8_LENGTH(unicode) = 0;
1552        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
1553#endif
1554        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
1555    }
1556    _PyUnicode_STATE(unicode).ready = 1;
1557    assert(_PyUnicode_CheckConsistency(unicode, 1));
1558    return 0;
1559}
1560
1561static void
1562unicode_dealloc(register PyObject *unicode)
1563{
1564    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1565    case SSTATE_NOT_INTERNED:
1566        break;
1567
1568    case SSTATE_INTERNED_MORTAL:
1569        /* revive dead object temporarily for DelItem */
1570        Py_REFCNT(unicode) = 3;
1571        if (PyDict_DelItem(interned, unicode) != 0)
1572            Py_FatalError(
1573                "deletion of interned string failed");
1574        break;
1575
1576    case SSTATE_INTERNED_IMMORTAL:
1577        Py_FatalError("Immortal interned string died.");
1578
1579    default:
1580        Py_FatalError("Inconsistent interned string state.");
1581    }
1582
1583    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1584        PyObject_DEL(_PyUnicode_WSTR(unicode));
1585    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1586        PyObject_DEL(_PyUnicode_UTF8(unicode));
1587    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1588        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1589
1590    Py_TYPE(unicode)->tp_free(unicode);
1591}
1592
1593#ifdef Py_DEBUG
1594static int
1595unicode_is_singleton(PyObject *unicode)
1596{
1597    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1598    if (unicode == unicode_empty)
1599        return 1;
1600    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1601    {
1602        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1603        if (ch < 256 && unicode_latin1[ch] == unicode)
1604            return 1;
1605    }
1606    return 0;
1607}
1608#endif
1609
1610static int
1611unicode_modifiable(PyObject *unicode)
1612{
1613    assert(_PyUnicode_CHECK(unicode));
1614    if (Py_REFCNT(unicode) != 1)
1615        return 0;
1616    if (_PyUnicode_HASH(unicode) != -1)
1617        return 0;
1618    if (PyUnicode_CHECK_INTERNED(unicode))
1619        return 0;
1620    if (!PyUnicode_CheckExact(unicode))
1621        return 0;
1622#ifdef Py_DEBUG
1623    /* singleton refcount is greater than 1 */
1624    assert(!unicode_is_singleton(unicode));
1625#endif
1626    return 1;
1627}
1628
1629static int
1630unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1631{
1632    PyObject *unicode;
1633    Py_ssize_t old_length;
1634
1635    assert(p_unicode != NULL);
1636    unicode = *p_unicode;
1637
1638    assert(unicode != NULL);
1639    assert(PyUnicode_Check(unicode));
1640    assert(0 <= length);
1641
1642    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1643        old_length = PyUnicode_WSTR_LENGTH(unicode);
1644    else
1645        old_length = PyUnicode_GET_LENGTH(unicode);
1646    if (old_length == length)
1647        return 0;
1648
1649    if (length == 0) {
1650        _Py_INCREF_UNICODE_EMPTY();
1651        if (!unicode_empty)
1652            return -1;
1653        Py_DECREF(*p_unicode);
1654        *p_unicode = unicode_empty;
1655        return 0;
1656    }
1657
1658    if (!unicode_modifiable(unicode)) {
1659        PyObject *copy = resize_copy(unicode, length);
1660        if (copy == NULL)
1661            return -1;
1662        Py_DECREF(*p_unicode);
1663        *p_unicode = copy;
1664        return 0;
1665    }
1666
1667    if (PyUnicode_IS_COMPACT(unicode)) {
1668        PyObject *new_unicode = resize_compact(unicode, length);
1669        if (new_unicode == NULL)
1670            return -1;
1671        *p_unicode = new_unicode;
1672        return 0;
1673    }
1674    return resize_inplace(unicode, length);
1675}
1676
1677int
1678PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1679{
1680    PyObject *unicode;
1681    if (p_unicode == NULL) {
1682        PyErr_BadInternalCall();
1683        return -1;
1684    }
1685    unicode = *p_unicode;
1686    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1687    {
1688        PyErr_BadInternalCall();
1689        return -1;
1690    }
1691    return unicode_resize(p_unicode, length);
1692}
1693
1694/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1695
1696   WARNING: The function doesn't copy the terminating null character and
1697   doesn't check the maximum character (may write a latin1 character in an
1698   ASCII string). */
1699static void
1700unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1701                   const char *str, Py_ssize_t len)
1702{
1703    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1704    void *data = PyUnicode_DATA(unicode);
1705    const char *end = str + len;
1706
1707    switch (kind) {
1708    case PyUnicode_1BYTE_KIND: {
1709        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1710#ifdef Py_DEBUG
1711        if (PyUnicode_IS_ASCII(unicode)) {
1712            Py_UCS4 maxchar = ucs1lib_find_max_char(
1713                (const Py_UCS1*)str,
1714                (const Py_UCS1*)str + len);
1715            assert(maxchar < 128);
1716        }
1717#endif
1718        memcpy((char *) data + index, str, len);
1719        break;
1720    }
1721    case PyUnicode_2BYTE_KIND: {
1722        Py_UCS2 *start = (Py_UCS2 *)data + index;
1723        Py_UCS2 *ucs2 = start;
1724        assert(index <= PyUnicode_GET_LENGTH(unicode));
1725
1726        for (; str < end; ++ucs2, ++str)
1727            *ucs2 = (Py_UCS2)*str;
1728
1729        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1730        break;
1731    }
1732    default: {
1733        Py_UCS4 *start = (Py_UCS4 *)data + index;
1734        Py_UCS4 *ucs4 = start;
1735        assert(kind == PyUnicode_4BYTE_KIND);
1736        assert(index <= PyUnicode_GET_LENGTH(unicode));
1737
1738        for (; str < end; ++ucs4, ++str)
1739            *ucs4 = (Py_UCS4)*str;
1740
1741        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1742    }
1743    }
1744}
1745
1746
1747static PyObject*
1748get_latin1_char(unsigned char ch)
1749{
1750    PyObject *unicode = unicode_latin1[ch];
1751    if (!unicode) {
1752        unicode = PyUnicode_New(1, ch);
1753        if (!unicode)
1754            return NULL;
1755        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1756        assert(_PyUnicode_CheckConsistency(unicode, 1));
1757        unicode_latin1[ch] = unicode;
1758    }
1759    Py_INCREF(unicode);
1760    return unicode;
1761}
1762
1763PyObject *
1764PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1765{
1766    PyObject *unicode;
1767    Py_UCS4 maxchar = 0;
1768    Py_ssize_t num_surrogates;
1769
1770    if (u == NULL)
1771        return (PyObject*)_PyUnicode_New(size);
1772
1773    /* If the Unicode data is known at construction time, we can apply
1774       some optimizations which share commonly used objects. */
1775
1776    /* Optimization for empty strings */
1777    if (size == 0)
1778        _Py_RETURN_UNICODE_EMPTY();
1779
1780    /* Single character Unicode objects in the Latin-1 range are
1781       shared when using this constructor */
1782    if (size == 1 && (Py_UCS4)*u < 256)
1783        return get_latin1_char((unsigned char)*u);
1784
1785    /* If not empty and not single character, copy the Unicode data
1786       into the new object */
1787    if (find_maxchar_surrogates(u, u + size,
1788                                &maxchar, &num_surrogates) == -1)
1789        return NULL;
1790
1791    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1792    if (!unicode)
1793        return NULL;
1794
1795    switch (PyUnicode_KIND(unicode)) {
1796    case PyUnicode_1BYTE_KIND:
1797        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1798                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1799        break;
1800    case PyUnicode_2BYTE_KIND:
1801#if Py_UNICODE_SIZE == 2
1802        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1803#else
1804        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1805                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1806#endif
1807        break;
1808    case PyUnicode_4BYTE_KIND:
1809#if SIZEOF_WCHAR_T == 2
1810        /* This is the only case which has to process surrogates, thus
1811           a simple copy loop is not enough and we need a function. */
1812        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1813#else
1814        assert(num_surrogates == 0);
1815        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1816#endif
1817        break;
1818    default:
1819        assert(0 && "Impossible state");
1820    }
1821
1822    return unicode_result(unicode);
1823}
1824
1825PyObject *
1826PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1827{
1828    if (size < 0) {
1829        PyErr_SetString(PyExc_SystemError,
1830                        "Negative size passed to PyUnicode_FromStringAndSize");
1831        return NULL;
1832    }
1833    if (u != NULL)
1834        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1835    else
1836        return (PyObject *)_PyUnicode_New(size);
1837}
1838
1839PyObject *
1840PyUnicode_FromString(const char *u)
1841{
1842    size_t size = strlen(u);
1843    if (size > PY_SSIZE_T_MAX) {
1844        PyErr_SetString(PyExc_OverflowError, "input too long");
1845        return NULL;
1846    }
1847    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1848}
1849
1850PyObject *
1851_PyUnicode_FromId(_Py_Identifier *id)
1852{
1853    if (!id->object) {
1854        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1855                                                  strlen(id->string),
1856                                                  NULL, NULL);
1857        if (!id->object)
1858            return NULL;
1859        PyUnicode_InternInPlace(&id->object);
1860        assert(!id->next);
1861        id->next = static_strings;
1862        static_strings = id;
1863    }
1864    return id->object;
1865}
1866
1867void
1868_PyUnicode_ClearStaticStrings()
1869{
1870    _Py_Identifier *tmp, *s = static_strings;
1871    while (s) {
1872        Py_DECREF(s->object);
1873        s->object = NULL;
1874        tmp = s->next;
1875        s->next = NULL;
1876        s = tmp;
1877    }
1878    static_strings = NULL;
1879}
1880
1881/* Internal function, doesn't check maximum character */
1882
1883PyObject*
1884_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1885{
1886    const unsigned char *s = (const unsigned char *)buffer;
1887    PyObject *unicode;
1888    if (size == 1) {
1889#ifdef Py_DEBUG
1890        assert((unsigned char)s[0] < 128);
1891#endif
1892        return get_latin1_char(s[0]);
1893    }
1894    unicode = PyUnicode_New(size, 127);
1895    if (!unicode)
1896        return NULL;
1897    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1898    assert(_PyUnicode_CheckConsistency(unicode, 1));
1899    return unicode;
1900}
1901
1902static Py_UCS4
1903kind_maxchar_limit(unsigned int kind)
1904{
1905    switch (kind) {
1906    case PyUnicode_1BYTE_KIND:
1907        return 0x80;
1908    case PyUnicode_2BYTE_KIND:
1909        return 0x100;
1910    case PyUnicode_4BYTE_KIND:
1911        return 0x10000;
1912    default:
1913        assert(0 && "invalid kind");
1914        return MAX_UNICODE;
1915    }
1916}
1917
1918Py_LOCAL_INLINE(Py_UCS4)
1919align_maxchar(Py_UCS4 maxchar)
1920{
1921    if (maxchar <= 127)
1922        return 127;
1923    else if (maxchar <= 255)
1924        return 255;
1925    else if (maxchar <= 65535)
1926        return 65535;
1927    else
1928        return MAX_UNICODE;
1929}
1930
1931static PyObject*
1932_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1933{
1934    PyObject *res;
1935    unsigned char max_char;
1936
1937    if (size == 0)
1938        _Py_RETURN_UNICODE_EMPTY();
1939    assert(size > 0);
1940    if (size == 1)
1941        return get_latin1_char(u[0]);
1942
1943    max_char = ucs1lib_find_max_char(u, u + size);
1944    res = PyUnicode_New(size, max_char);
1945    if (!res)
1946        return NULL;
1947    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1948    assert(_PyUnicode_CheckConsistency(res, 1));
1949    return res;
1950}
1951
1952static PyObject*
1953_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1954{
1955    PyObject *res;
1956    Py_UCS2 max_char;
1957
1958    if (size == 0)
1959        _Py_RETURN_UNICODE_EMPTY();
1960    assert(size > 0);
1961    if (size == 1) {
1962        Py_UCS4 ch = u[0];
1963        int kind;
1964        void *data;
1965        if (ch < 256)
1966            return get_latin1_char((unsigned char)ch);
1967
1968        res = PyUnicode_New(1, ch);
1969        if (res == NULL)
1970            return NULL;
1971        kind = PyUnicode_KIND(res);
1972        data = PyUnicode_DATA(res);
1973        PyUnicode_WRITE(kind, data, 0, ch);
1974        assert(_PyUnicode_CheckConsistency(res, 1));
1975        return res;
1976    }
1977
1978    max_char = ucs2lib_find_max_char(u, u + size);
1979    res = PyUnicode_New(size, max_char);
1980    if (!res)
1981        return NULL;
1982    if (max_char >= 256)
1983        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
1984    else {
1985        _PyUnicode_CONVERT_BYTES(
1986            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
1987    }
1988    assert(_PyUnicode_CheckConsistency(res, 1));
1989    return res;
1990}
1991
1992static PyObject*
1993_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
1994{
1995    PyObject *res;
1996    Py_UCS4 max_char;
1997
1998    if (size == 0)
1999        _Py_RETURN_UNICODE_EMPTY();
2000    assert(size > 0);
2001    if (size == 1) {
2002        Py_UCS4 ch = u[0];
2003        int kind;
2004        void *data;
2005        if (ch < 256)
2006            return get_latin1_char((unsigned char)ch);
2007
2008        res = PyUnicode_New(1, ch);
2009        if (res == NULL)
2010            return NULL;
2011        kind = PyUnicode_KIND(res);
2012        data = PyUnicode_DATA(res);
2013        PyUnicode_WRITE(kind, data, 0, ch);
2014        assert(_PyUnicode_CheckConsistency(res, 1));
2015        return res;
2016    }
2017
2018    max_char = ucs4lib_find_max_char(u, u + size);
2019    res = PyUnicode_New(size, max_char);
2020    if (!res)
2021        return NULL;
2022    if (max_char < 256)
2023        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2024                                 PyUnicode_1BYTE_DATA(res));
2025    else if (max_char < 0x10000)
2026        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2027                                 PyUnicode_2BYTE_DATA(res));
2028    else
2029        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2030    assert(_PyUnicode_CheckConsistency(res, 1));
2031    return res;
2032}
2033
2034PyObject*
2035PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2036{
2037    if (size < 0) {
2038        PyErr_SetString(PyExc_ValueError, "size must be positive");
2039        return NULL;
2040    }
2041    switch (kind) {
2042    case PyUnicode_1BYTE_KIND:
2043        return _PyUnicode_FromUCS1(buffer, size);
2044    case PyUnicode_2BYTE_KIND:
2045        return _PyUnicode_FromUCS2(buffer, size);
2046    case PyUnicode_4BYTE_KIND:
2047        return _PyUnicode_FromUCS4(buffer, size);
2048    default:
2049        PyErr_SetString(PyExc_SystemError, "invalid kind");
2050        return NULL;
2051    }
2052}
2053
2054Py_UCS4
2055_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2056{
2057    enum PyUnicode_Kind kind;
2058    void *startptr, *endptr;
2059
2060    assert(PyUnicode_IS_READY(unicode));
2061    assert(0 <= start);
2062    assert(end <= PyUnicode_GET_LENGTH(unicode));
2063    assert(start <= end);
2064
2065    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2066        return PyUnicode_MAX_CHAR_VALUE(unicode);
2067
2068    if (start == end)
2069        return 127;
2070
2071    if (PyUnicode_IS_ASCII(unicode))
2072        return 127;
2073
2074    kind = PyUnicode_KIND(unicode);
2075    startptr = PyUnicode_DATA(unicode);
2076    endptr = (char *)startptr + end * kind;
2077    startptr = (char *)startptr + start * kind;
2078    switch(kind) {
2079    case PyUnicode_1BYTE_KIND:
2080        return ucs1lib_find_max_char(startptr, endptr);
2081    case PyUnicode_2BYTE_KIND:
2082        return ucs2lib_find_max_char(startptr, endptr);
2083    case PyUnicode_4BYTE_KIND:
2084        return ucs4lib_find_max_char(startptr, endptr);
2085    default:
2086        assert(0);
2087        return 0;
2088    }
2089}
2090
2091/* Ensure that a string uses the most efficient storage, if it is not the
2092   case: create a new string with of the right kind. Write NULL into *p_unicode
2093   on error. */
2094static void
2095unicode_adjust_maxchar(PyObject **p_unicode)
2096{
2097    PyObject *unicode, *copy;
2098    Py_UCS4 max_char;
2099    Py_ssize_t len;
2100    unsigned int kind;
2101
2102    assert(p_unicode != NULL);
2103    unicode = *p_unicode;
2104    assert(PyUnicode_IS_READY(unicode));
2105    if (PyUnicode_IS_ASCII(unicode))
2106        return;
2107
2108    len = PyUnicode_GET_LENGTH(unicode);
2109    kind = PyUnicode_KIND(unicode);
2110    if (kind == PyUnicode_1BYTE_KIND) {
2111        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2112        max_char = ucs1lib_find_max_char(u, u + len);
2113        if (max_char >= 128)
2114            return;
2115    }
2116    else if (kind == PyUnicode_2BYTE_KIND) {
2117        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2118        max_char = ucs2lib_find_max_char(u, u + len);
2119        if (max_char >= 256)
2120            return;
2121    }
2122    else {
2123        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2124        assert(kind == PyUnicode_4BYTE_KIND);
2125        max_char = ucs4lib_find_max_char(u, u + len);
2126        if (max_char >= 0x10000)
2127            return;
2128    }
2129    copy = PyUnicode_New(len, max_char);
2130    if (copy != NULL)
2131        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2132    Py_DECREF(unicode);
2133    *p_unicode = copy;
2134}
2135
2136PyObject*
2137_PyUnicode_Copy(PyObject *unicode)
2138{
2139    Py_ssize_t length;
2140    PyObject *copy;
2141
2142    if (!PyUnicode_Check(unicode)) {
2143        PyErr_BadInternalCall();
2144        return NULL;
2145    }
2146    if (PyUnicode_READY(unicode) == -1)
2147        return NULL;
2148
2149    length = PyUnicode_GET_LENGTH(unicode);
2150    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2151    if (!copy)
2152        return NULL;
2153    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2154
2155    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2156              length * PyUnicode_KIND(unicode));
2157    assert(_PyUnicode_CheckConsistency(copy, 1));
2158    return copy;
2159}
2160
2161
2162/* Widen Unicode objects to larger buffers. Don't write terminating null
2163   character. Return NULL on error. */
2164
2165void*
2166_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2167{
2168    Py_ssize_t len;
2169    void *result;
2170    unsigned int skind;
2171
2172    if (PyUnicode_READY(s) == -1)
2173        return NULL;
2174
2175    len = PyUnicode_GET_LENGTH(s);
2176    skind = PyUnicode_KIND(s);
2177    if (skind >= kind) {
2178        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2179        return NULL;
2180    }
2181    switch (kind) {
2182    case PyUnicode_2BYTE_KIND:
2183        result = PyMem_Malloc(len * sizeof(Py_UCS2));
2184        if (!result)
2185            return PyErr_NoMemory();
2186        assert(skind == PyUnicode_1BYTE_KIND);
2187        _PyUnicode_CONVERT_BYTES(
2188            Py_UCS1, Py_UCS2,
2189            PyUnicode_1BYTE_DATA(s),
2190            PyUnicode_1BYTE_DATA(s) + len,
2191            result);
2192        return result;
2193    case PyUnicode_4BYTE_KIND:
2194        result = PyMem_Malloc(len * sizeof(Py_UCS4));
2195        if (!result)
2196            return PyErr_NoMemory();
2197        if (skind == PyUnicode_2BYTE_KIND) {
2198            _PyUnicode_CONVERT_BYTES(
2199                Py_UCS2, Py_UCS4,
2200                PyUnicode_2BYTE_DATA(s),
2201                PyUnicode_2BYTE_DATA(s) + len,
2202                result);
2203        }
2204        else {
2205            assert(skind == PyUnicode_1BYTE_KIND);
2206            _PyUnicode_CONVERT_BYTES(
2207                Py_UCS1, Py_UCS4,
2208                PyUnicode_1BYTE_DATA(s),
2209                PyUnicode_1BYTE_DATA(s) + len,
2210                result);
2211        }
2212        return result;
2213    default:
2214        break;
2215    }
2216    PyErr_SetString(PyExc_SystemError, "invalid kind");
2217    return NULL;
2218}
2219
2220static Py_UCS4*
2221as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2222        int copy_null)
2223{
2224    int kind;
2225    void *data;
2226    Py_ssize_t len, targetlen;
2227    if (PyUnicode_READY(string) == -1)
2228        return NULL;
2229    kind = PyUnicode_KIND(string);
2230    data = PyUnicode_DATA(string);
2231    len = PyUnicode_GET_LENGTH(string);
2232    targetlen = len;
2233    if (copy_null)
2234        targetlen++;
2235    if (!target) {
2236        if (PY_SSIZE_T_MAX / sizeof(Py_UCS4) < targetlen) {
2237            PyErr_NoMemory();
2238            return NULL;
2239        }
2240        target = PyMem_Malloc(targetlen * sizeof(Py_UCS4));
2241        if (!target) {
2242            PyErr_NoMemory();
2243            return NULL;
2244        }
2245    }
2246    else {
2247        if (targetsize < targetlen) {
2248            PyErr_Format(PyExc_SystemError,
2249                         "string is longer than the buffer");
2250            if (copy_null && 0 < targetsize)
2251                target[0] = 0;
2252            return NULL;
2253        }
2254    }
2255    if (kind == PyUnicode_1BYTE_KIND) {
2256        Py_UCS1 *start = (Py_UCS1 *) data;
2257        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2258    }
2259    else if (kind == PyUnicode_2BYTE_KIND) {
2260        Py_UCS2 *start = (Py_UCS2 *) data;
2261        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2262    }
2263    else {
2264        assert(kind == PyUnicode_4BYTE_KIND);
2265        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2266    }
2267    if (copy_null)
2268        target[len] = 0;
2269    return target;
2270}
2271
2272Py_UCS4*
2273PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2274                 int copy_null)
2275{
2276    if (target == NULL || targetsize < 0) {
2277        PyErr_BadInternalCall();
2278        return NULL;
2279    }
2280    return as_ucs4(string, target, targetsize, copy_null);
2281}
2282
2283Py_UCS4*
2284PyUnicode_AsUCS4Copy(PyObject *string)
2285{
2286    return as_ucs4(string, NULL, 0, 1);
2287}
2288
2289#ifdef HAVE_WCHAR_H
2290
2291PyObject *
2292PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
2293{
2294    if (w == NULL) {
2295        if (size == 0)
2296            _Py_RETURN_UNICODE_EMPTY();
2297        PyErr_BadInternalCall();
2298        return NULL;
2299    }
2300
2301    if (size == -1) {
2302        size = wcslen(w);
2303    }
2304
2305    return PyUnicode_FromUnicode(w, size);
2306}
2307
2308#endif /* HAVE_WCHAR_H */
2309
2310static void
2311makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2312        char c)
2313{
2314    *fmt++ = '%';
2315    if (longflag)
2316        *fmt++ = 'l';
2317    else if (longlongflag) {
2318        /* longlongflag should only ever be nonzero on machines with
2319           HAVE_LONG_LONG defined */
2320#ifdef HAVE_LONG_LONG
2321        char *f = PY_FORMAT_LONG_LONG;
2322        while (*f)
2323            *fmt++ = *f++;
2324#else
2325        /* we shouldn't ever get here */
2326        assert(0);
2327        *fmt++ = 'l';
2328#endif
2329    }
2330    else if (size_tflag) {
2331        char *f = PY_FORMAT_SIZE_T;
2332        while (*f)
2333            *fmt++ = *f++;
2334    }
2335    *fmt++ = c;
2336    *fmt = '\0';
2337}
2338
/* Maximum number of characters required for the output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
2342#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2343
2344static int
2345unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2346                             Py_ssize_t width, Py_ssize_t precision)
2347{
2348    Py_ssize_t length, fill, arglen;
2349    Py_UCS4 maxchar;
2350
2351    if (PyUnicode_READY(str) == -1)
2352        return -1;
2353
2354    length = PyUnicode_GET_LENGTH(str);
2355    if ((precision == -1 || precision >= length)
2356        && width <= length)
2357        return _PyUnicodeWriter_WriteStr(writer, str);
2358
2359    if (precision != -1)
2360        length = Py_MIN(precision, length);
2361
2362    arglen = Py_MAX(length, width);
2363    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2364        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2365    else
2366        maxchar = writer->maxchar;
2367
2368    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2369        return -1;
2370
2371    if (width > length) {
2372        fill = width - length;
2373        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2374            return -1;
2375        writer->pos += fill;
2376    }
2377
2378    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2379                                  str, 0, length);
2380    writer->pos += length;
2381    return 0;
2382}
2383
2384static int
2385unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2386                              Py_ssize_t width, Py_ssize_t precision)
2387{
2388    /* UTF-8 */
2389    Py_ssize_t length;
2390    PyObject *unicode;
2391    int res;
2392
2393    length = strlen(str);
2394    if (precision != -1)
2395        length = Py_MIN(length, precision);
2396    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2397    if (unicode == NULL)
2398        return -1;
2399
2400    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2401    Py_DECREF(unicode);
2402    return res;
2403}
2404
2405static const char*
2406unicode_fromformat_arg(_PyUnicodeWriter *writer,
2407                       const char *f, va_list *vargs)
2408{
2409    const char *p;
2410    Py_ssize_t len;
2411    int zeropad;
2412    Py_ssize_t width;
2413    Py_ssize_t precision;
2414    int longflag;
2415    int longlongflag;
2416    int size_tflag;
2417    Py_ssize_t fill;
2418
2419    p = f;
2420    f++;
2421    zeropad = 0;
2422    if (*f == '0') {
2423        zeropad = 1;
2424        f++;
2425    }
2426
2427    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2428    width = -1;
2429    if (Py_ISDIGIT((unsigned)*f)) {
2430        width = *f - '0';
2431        f++;
2432        while (Py_ISDIGIT((unsigned)*f)) {
2433            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2434                PyErr_SetString(PyExc_ValueError,
2435                                "width too big");
2436                return NULL;
2437            }
2438            width = (width * 10) + (*f - '0');
2439            f++;
2440        }
2441    }
2442    precision = -1;
2443    if (*f == '.') {
2444        f++;
2445        if (Py_ISDIGIT((unsigned)*f)) {
2446            precision = (*f - '0');
2447            f++;
2448            while (Py_ISDIGIT((unsigned)*f)) {
2449                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2450                    PyErr_SetString(PyExc_ValueError,
2451                                    "precision too big");
2452                    return NULL;
2453                }
2454                precision = (precision * 10) + (*f - '0');
2455                f++;
2456            }
2457        }
2458        if (*f == '%') {
2459            /* "%.3%s" => f points to "3" */
2460            f--;
2461        }
2462    }
2463    if (*f == '\0') {
2464        /* bogus format "%.123" => go backward, f points to "3" */
2465        f--;
2466    }
2467
2468    /* Handle %ld, %lu, %lld and %llu. */
2469    longflag = 0;
2470    longlongflag = 0;
2471    size_tflag = 0;
2472    if (*f == 'l') {
2473        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2474            longflag = 1;
2475            ++f;
2476        }
2477#ifdef HAVE_LONG_LONG
2478        else if (f[1] == 'l' &&
2479                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2480            longlongflag = 1;
2481            f += 2;
2482        }
2483#endif
2484    }
2485    /* handle the size_t flag. */
2486    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2487        size_tflag = 1;
2488        ++f;
2489    }
2490
2491    if (f[1] == '\0')
2492        writer->overallocate = 0;
2493
2494    switch (*f) {
2495    case 'c':
2496    {
2497        int ordinal = va_arg(*vargs, int);
2498        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2499            PyErr_SetString(PyExc_OverflowError,
2500                            "character argument not in range(0x110000)");
2501            return NULL;
2502        }
2503        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2504            return NULL;
2505        break;
2506    }
2507
2508    case 'i':
2509    case 'd':
2510    case 'u':
2511    case 'x':
2512    {
2513        /* used by sprintf */
2514        char fmt[10]; /* should be enough for "%0lld\0" */
2515        char buffer[MAX_LONG_LONG_CHARS];
2516        Py_ssize_t arglen;
2517
2518        if (*f == 'u') {
2519            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2520
2521            if (longflag)
2522                len = sprintf(buffer, fmt,
2523                        va_arg(*vargs, unsigned long));
2524#ifdef HAVE_LONG_LONG
2525            else if (longlongflag)
2526                len = sprintf(buffer, fmt,
2527                        va_arg(*vargs, unsigned PY_LONG_LONG));
2528#endif
2529            else if (size_tflag)
2530                len = sprintf(buffer, fmt,
2531                        va_arg(*vargs, size_t));
2532            else
2533                len = sprintf(buffer, fmt,
2534                        va_arg(*vargs, unsigned int));
2535        }
2536        else if (*f == 'x') {
2537            makefmt(fmt, 0, 0, 0, 'x');
2538            len = sprintf(buffer, fmt, va_arg(*vargs, int));
2539        }
2540        else {
2541            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2542
2543            if (longflag)
2544                len = sprintf(buffer, fmt,
2545                        va_arg(*vargs, long));
2546#ifdef HAVE_LONG_LONG
2547            else if (longlongflag)
2548                len = sprintf(buffer, fmt,
2549                        va_arg(*vargs, PY_LONG_LONG));
2550#endif
2551            else if (size_tflag)
2552                len = sprintf(buffer, fmt,
2553                        va_arg(*vargs, Py_ssize_t));
2554            else
2555                len = sprintf(buffer, fmt,
2556                        va_arg(*vargs, int));
2557        }
2558        assert(len >= 0);
2559
2560        if (precision < len)
2561            precision = len;
2562
2563        arglen = Py_MAX(precision, width);
2564        assert(ucs1lib_find_max_char((Py_UCS1*)buffer, (Py_UCS1*)buffer + len) <= 127);
2565        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2566            return NULL;
2567
2568        if (width > precision) {
2569            Py_UCS4 fillchar;
2570            fill = width - precision;
2571            fillchar = zeropad?'0':' ';
2572            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2573                return NULL;
2574            writer->pos += fill;
2575        }
2576        if (precision > len) {
2577            fill = precision - len;
2578            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2579                return NULL;
2580            writer->pos += fill;
2581        }
2582
2583        unicode_write_cstr(writer->buffer, writer->pos, buffer, len);
2584        writer->pos += len;
2585        break;
2586    }
2587
2588    case 'p':
2589    {
2590        char number[MAX_LONG_LONG_CHARS];
2591
2592        len = sprintf(number, "%p", va_arg(*vargs, void*));
2593        assert(len >= 0);
2594
2595        /* %p is ill-defined:  ensure leading 0x. */
2596        if (number[1] == 'X')
2597            number[1] = 'x';
2598        else if (number[1] != 'x') {
2599            memmove(number + 2, number,
2600                    strlen(number) + 1);
2601            number[0] = '0';
2602            number[1] = 'x';
2603            len += 2;
2604        }
2605
2606        assert(ucs1lib_find_max_char((Py_UCS1*)number, (Py_UCS1*)number + len) <= 127);
2607        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
2608            return NULL;
2609        unicode_write_cstr(writer->buffer, writer->pos, number, len);
2610        writer->pos += len;
2611        break;
2612    }
2613
2614    case 's':
2615    {
2616        /* UTF-8 */
2617        const char *s = va_arg(*vargs, const char*);
2618        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2619            return NULL;
2620        break;
2621    }
2622
2623    case 'U':
2624    {
2625        PyObject *obj = va_arg(*vargs, PyObject *);
2626        assert(obj && _PyUnicode_CHECK(obj));
2627
2628        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2629            return NULL;
2630        break;
2631    }
2632
2633    case 'V':
2634    {
2635        PyObject *obj = va_arg(*vargs, PyObject *);
2636        const char *str = va_arg(*vargs, const char *);
2637        if (obj) {
2638            assert(_PyUnicode_CHECK(obj));
2639            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2640                return NULL;
2641        }
2642        else {
2643            assert(str != NULL);
2644            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2645                return NULL;
2646        }
2647        break;
2648    }
2649
2650    case 'S':
2651    {
2652        PyObject *obj = va_arg(*vargs, PyObject *);
2653        PyObject *str;
2654        assert(obj);
2655        str = PyObject_Str(obj);
2656        if (!str)
2657            return NULL;
2658        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2659            Py_DECREF(str);
2660            return NULL;
2661        }
2662        Py_DECREF(str);
2663        break;
2664    }
2665
2666    case 'R':
2667    {
2668        PyObject *obj = va_arg(*vargs, PyObject *);
2669        PyObject *repr;
2670        assert(obj);
2671        repr = PyObject_Repr(obj);
2672        if (!repr)
2673            return NULL;
2674        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2675            Py_DECREF(repr);
2676            return NULL;
2677        }
2678        Py_DECREF(repr);
2679        break;
2680    }
2681
2682    case 'A':
2683    {
2684        PyObject *obj = va_arg(*vargs, PyObject *);
2685        PyObject *ascii;
2686        assert(obj);
2687        ascii = PyObject_ASCII(obj);
2688        if (!ascii)
2689            return NULL;
2690        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2691            Py_DECREF(ascii);
2692            return NULL;
2693        }
2694        Py_DECREF(ascii);
2695        break;
2696    }
2697
2698    case '%':
2699        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2700            return NULL;
2701        break;
2702
2703    default:
2704        /* if we stumble upon an unknown formatting code, copy the rest
2705           of the format string to the output string. (we cannot just
2706           skip the code, since there's no way to know what's in the
2707           argument list) */
2708        len = strlen(p);
2709        if (_PyUnicodeWriter_WriteCstr(writer, p, len) == -1)
2710            return NULL;
2711        f = p+len;
2712        return f;
2713    }
2714
2715    f++;
2716    return f;
2717}
2718
2719PyObject *
2720PyUnicode_FromFormatV(const char *format, va_list vargs)
2721{
2722    va_list vargs2;
2723    const char *f;
2724    _PyUnicodeWriter writer;
2725
2726    _PyUnicodeWriter_Init(&writer);
2727    writer.min_length = strlen(format) + 100;
2728    writer.overallocate = 1;
2729
2730    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2731       Copy it to be able to pass a reference to a subfunction. */
2732    Py_VA_COPY(vargs2, vargs);
2733
2734    for (f = format; *f; ) {
2735        if (*f == '%') {
2736            f = unicode_fromformat_arg(&writer, f, &vargs2);
2737            if (f == NULL)
2738                goto fail;
2739        }
2740        else {
2741            const char *p;
2742            Py_ssize_t len;
2743
2744            p = f;
2745            do
2746            {
2747                if ((unsigned char)*p > 127) {
2748                    PyErr_Format(PyExc_ValueError,
2749                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2750                        "string, got a non-ASCII byte: 0x%02x",
2751                        (unsigned char)*p);
2752                    return NULL;
2753                }
2754                p++;
2755            }
2756            while (*p != '\0' && *p != '%');
2757            len = p - f;
2758
2759            if (*p == '\0')
2760                writer.overallocate = 0;
2761            if (_PyUnicodeWriter_Prepare(&writer, len, 127) == -1)
2762                goto fail;
2763            unicode_write_cstr(writer.buffer, writer.pos, f, len);
2764            writer.pos += len;
2765
2766            f = p;
2767        }
2768    }
2769    return _PyUnicodeWriter_Finish(&writer);
2770
2771  fail:
2772    _PyUnicodeWriter_Dealloc(&writer);
2773    return NULL;
2774}
2775
2776PyObject *
2777PyUnicode_FromFormat(const char *format, ...)
2778{
2779    PyObject* ret;
2780    va_list vargs;
2781
2782#ifdef HAVE_STDARG_PROTOTYPES
2783    va_start(vargs, format);
2784#else
2785    va_start(vargs);
2786#endif
2787    ret = PyUnicode_FromFormatV(format, vargs);
2788    va_end(vargs);
2789    return ret;
2790}
2791
2792#ifdef HAVE_WCHAR_H
2793
2794/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2795   convert a Unicode object to a wide character string.
2796
2797   - If w is NULL: return the number of wide characters (including the null
2798     character) required to convert the unicode object. Ignore size argument.
2799
2800   - Otherwise: return the number of wide characters (excluding the null
2801     character) written into w. Write at most size wide characters (including
2802     the null character). */
2803static Py_ssize_t
2804unicode_aswidechar(PyObject *unicode,
2805                   wchar_t *w,
2806                   Py_ssize_t size)
2807{
2808    Py_ssize_t res;
2809    const wchar_t *wstr;
2810
2811    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2812    if (wstr == NULL)
2813        return -1;
2814
2815    if (w != NULL) {
2816        if (size > res)
2817            size = res + 1;
2818        else
2819            res = size;
2820        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2821        return res;
2822    }
2823    else
2824        return res + 1;
2825}
2826
2827Py_ssize_t
2828PyUnicode_AsWideChar(PyObject *unicode,
2829                     wchar_t *w,
2830                     Py_ssize_t size)
2831{
2832    if (unicode == NULL) {
2833        PyErr_BadInternalCall();
2834        return -1;
2835    }
2836    return unicode_aswidechar(unicode, w, size);
2837}
2838
2839wchar_t*
2840PyUnicode_AsWideCharString(PyObject *unicode,
2841                           Py_ssize_t *size)
2842{
2843    wchar_t* buffer;
2844    Py_ssize_t buflen;
2845
2846    if (unicode == NULL) {
2847        PyErr_BadInternalCall();
2848        return NULL;
2849    }
2850
2851    buflen = unicode_aswidechar(unicode, NULL, 0);
2852    if (buflen == -1)
2853        return NULL;
2854    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
2855        PyErr_NoMemory();
2856        return NULL;
2857    }
2858
2859    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
2860    if (buffer == NULL) {
2861        PyErr_NoMemory();
2862        return NULL;
2863    }
2864    buflen = unicode_aswidechar(unicode, buffer, buflen);
2865    if (buflen == -1) {
2866        PyMem_FREE(buffer);
2867        return NULL;
2868    }
2869    if (size != NULL)
2870        *size = buflen;
2871    return buffer;
2872}
2873
2874#endif /* HAVE_WCHAR_H */
2875
2876PyObject *
2877PyUnicode_FromOrdinal(int ordinal)
2878{
2879    PyObject *v;
2880    void *data;
2881    int kind;
2882
2883    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2884        PyErr_SetString(PyExc_ValueError,
2885                        "chr() arg not in range(0x110000)");
2886        return NULL;
2887    }
2888
2889    if ((Py_UCS4)ordinal < 256)
2890        return get_latin1_char((unsigned char)ordinal);
2891
2892    v = PyUnicode_New(1, ordinal);
2893    if (v == NULL)
2894        return NULL;
2895    kind = PyUnicode_KIND(v);
2896    data = PyUnicode_DATA(v);
2897    PyUnicode_WRITE(kind, data, 0, ordinal);
2898    assert(_PyUnicode_CheckConsistency(v, 1));
2899    return v;
2900}
2901
2902PyObject *
2903PyUnicode_FromObject(register PyObject *obj)
2904{
2905    /* XXX Perhaps we should make this API an alias of
2906       PyObject_Str() instead ?! */
2907    if (PyUnicode_CheckExact(obj)) {
2908        if (PyUnicode_READY(obj) == -1)
2909            return NULL;
2910        Py_INCREF(obj);
2911        return obj;
2912    }
2913    if (PyUnicode_Check(obj)) {
2914        /* For a Unicode subtype that's not a Unicode object,
2915           return a true Unicode object with the same data. */
2916        return _PyUnicode_Copy(obj);
2917    }
2918    PyErr_Format(PyExc_TypeError,
2919                 "Can't convert '%.100s' object to str implicitly",
2920                 Py_TYPE(obj)->tp_name);
2921    return NULL;
2922}
2923
2924PyObject *
2925PyUnicode_FromEncodedObject(register PyObject *obj,
2926                            const char *encoding,
2927                            const char *errors)
2928{
2929    Py_buffer buffer;
2930    PyObject *v;
2931
2932    if (obj == NULL) {
2933        PyErr_BadInternalCall();
2934        return NULL;
2935    }
2936
2937    /* Decoding bytes objects is the most common case and should be fast */
2938    if (PyBytes_Check(obj)) {
2939        if (PyBytes_GET_SIZE(obj) == 0)
2940            _Py_RETURN_UNICODE_EMPTY();
2941        v = PyUnicode_Decode(
2942                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2943                encoding, errors);
2944        return v;
2945    }
2946
2947    if (PyUnicode_Check(obj)) {
2948        PyErr_SetString(PyExc_TypeError,
2949                        "decoding str is not supported");
2950        return NULL;
2951    }
2952
2953    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2954    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2955        PyErr_Format(PyExc_TypeError,
2956                     "coercing to str: need bytes, bytearray "
2957                     "or buffer-like object, %.80s found",
2958                     Py_TYPE(obj)->tp_name);
2959        return NULL;
2960    }
2961
2962    if (buffer.len == 0) {
2963        PyBuffer_Release(&buffer);
2964        _Py_RETURN_UNICODE_EMPTY();
2965    }
2966
2967    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2968    PyBuffer_Release(&buffer);
2969    return v;
2970}
2971
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.  A NULL encoding is normalized to "utf-8".

   encoding:  null-terminated encoding name, or NULL.
   lower:     output buffer receiving the null-terminated normalized name.
   lower_len: size of the output buffer in bytes, terminator included.

   Return 1 on success, 0 on error (the normalized name, including its
   terminating null byte, does not fit in lower_len bytes).  On error the
   contents of lower are unspecified. */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1: never write past a short buffer */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    /* Without this guard lower_len - 1 would wrap around and the final
       terminator would be stored into a zero-sized buffer. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* Last usable slot: reserved for the terminating null byte. */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
3008
3009PyObject *
3010PyUnicode_Decode(const char *s,
3011                 Py_ssize_t size,
3012                 const char *encoding,
3013                 const char *errors)
3014{
3015    PyObject *buffer = NULL, *unicode;
3016    Py_buffer info;
3017    char lower[11];  /* Enough for any encoding shortcut */
3018
3019    /* Shortcuts for common default encodings */
3020    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3021        if ((strcmp(lower, "utf-8") == 0) ||
3022            (strcmp(lower, "utf8") == 0))
3023            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3024        else if ((strcmp(lower, "latin-1") == 0) ||
3025                 (strcmp(lower, "latin1") == 0) ||
3026                 (strcmp(lower, "iso-8859-1") == 0))
3027            return PyUnicode_DecodeLatin1(s, size, errors);
3028#ifdef HAVE_MBCS
3029        else if (strcmp(lower, "mbcs") == 0)
3030            return PyUnicode_DecodeMBCS(s, size, errors);
3031#endif
3032        else if (strcmp(lower, "ascii") == 0)
3033            return PyUnicode_DecodeASCII(s, size, errors);
3034        else if (strcmp(lower, "utf-16") == 0)
3035            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3036        else if (strcmp(lower, "utf-32") == 0)
3037            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3038    }
3039
3040    /* Decode via the codec registry */
3041    buffer = NULL;
3042    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3043        goto onError;
3044    buffer = PyMemoryView_FromBuffer(&info);
3045    if (buffer == NULL)
3046        goto onError;
3047    unicode = PyCodec_Decode(buffer, encoding, errors);
3048    if (unicode == NULL)
3049        goto onError;
3050    if (!PyUnicode_Check(unicode)) {
3051        PyErr_Format(PyExc_TypeError,
3052                     "decoder did not return a str object (type=%.400s)",
3053                     Py_TYPE(unicode)->tp_name);
3054        Py_DECREF(unicode);
3055        goto onError;
3056    }
3057    Py_DECREF(buffer);
3058    return unicode_result(unicode);
3059
3060  onError:
3061    Py_XDECREF(buffer);
3062    return NULL;
3063}
3064
3065PyObject *
3066PyUnicode_AsDecodedObject(PyObject *unicode,
3067                          const char *encoding,
3068                          const char *errors)
3069{
3070    PyObject *v;
3071
3072    if (!PyUnicode_Check(unicode)) {
3073        PyErr_BadArgument();
3074        goto onError;
3075    }
3076
3077    if (encoding == NULL)
3078        encoding = PyUnicode_GetDefaultEncoding();
3079
3080    /* Decode via the codec registry */
3081    v = PyCodec_Decode(unicode, encoding, errors);
3082    if (v == NULL)
3083        goto onError;
3084    return unicode_result(v);
3085
3086  onError:
3087    return NULL;
3088}
3089
3090PyObject *
3091PyUnicode_AsDecodedUnicode(PyObject *unicode,
3092                           const char *encoding,
3093                           const char *errors)
3094{
3095    PyObject *v;
3096
3097    if (!PyUnicode_Check(unicode)) {
3098        PyErr_BadArgument();
3099        goto onError;
3100    }
3101
3102    if (encoding == NULL)
3103        encoding = PyUnicode_GetDefaultEncoding();
3104
3105    /* Decode via the codec registry */
3106    v = PyCodec_Decode(unicode, encoding, errors);
3107    if (v == NULL)
3108        goto onError;
3109    if (!PyUnicode_Check(v)) {
3110        PyErr_Format(PyExc_TypeError,
3111                     "decoder did not return a str object (type=%.400s)",
3112                     Py_TYPE(v)->tp_name);
3113        Py_DECREF(v);
3114        goto onError;
3115    }
3116    return unicode_result(v);
3117
3118  onError:
3119    return NULL;
3120}
3121
3122PyObject *
3123PyUnicode_Encode(const Py_UNICODE *s,
3124                 Py_ssize_t size,
3125                 const char *encoding,
3126                 const char *errors)
3127{
3128    PyObject *v, *unicode;
3129
3130    unicode = PyUnicode_FromUnicode(s, size);
3131    if (unicode == NULL)
3132        return NULL;
3133    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3134    Py_DECREF(unicode);
3135    return v;
3136}
3137
3138PyObject *
3139PyUnicode_AsEncodedObject(PyObject *unicode,
3140                          const char *encoding,
3141                          const char *errors)
3142{
3143    PyObject *v;
3144
3145    if (!PyUnicode_Check(unicode)) {
3146        PyErr_BadArgument();
3147        goto onError;
3148    }
3149
3150    if (encoding == NULL)
3151        encoding = PyUnicode_GetDefaultEncoding();
3152
3153    /* Encode via the codec registry */
3154    v = PyCodec_Encode(unicode, encoding, errors);
3155    if (v == NULL)
3156        goto onError;
3157    return v;
3158
3159  onError:
3160    return NULL;
3161}
3162
/* Locate the first character of wstr that wcstombs() refuses to convert.
   The string is fed to wcstombs() one character at a time (a surrogate pair
   counts as one character when wchar_t is 16 bits wide).  Return the index,
   in wchar_t units, of the first failing character, or 0 if every character
   converts. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character plus its terminator. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3] = {0, 0, 0};
#else
    wchar_t unit[2] = {0, 0};
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *begin = wstr;
    const wchar_t *cur;

    for (cur = wstr; *cur != L'\0'; ) {
        const wchar_t *here = cur;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cur[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cur[1]))
        {
            unit[0] = cur[0];
            unit[1] = cur[1];
            cur += 2;
        }
        else {
            unit[0] = *cur;
            unit[1] = 0;
            cur++;
        }
#else
        unit[0] = *cur;
        cur++;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return here - begin;
    }

    /* failed to find the unencodable character */
    return 0;
}
3209
3210static int
3211locale_error_handler(const char *errors, int *surrogateescape)
3212{
3213    if (errors == NULL) {
3214        *surrogateescape = 0;
3215        return 0;
3216    }
3217
3218    if (strcmp(errors, "strict") == 0) {
3219        *surrogateescape = 0;
3220        return 0;
3221    }
3222    if (strcmp(errors, "surrogateescape") == 0) {
3223        *surrogateescape = 1;
3224        return 0;
3225    }
3226    PyErr_Format(PyExc_ValueError,
3227                 "only 'strict' and 'surrogateescape' error handlers "
3228                 "are supported, not '%s'",
3229                 errors);
3230    return -1;
3231}
3232
3233PyObject *
3234PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
3235{
3236    Py_ssize_t wlen, wlen2;
3237    wchar_t *wstr;
3238    PyObject *bytes = NULL;
3239    char *errmsg;
3240    PyObject *reason;
3241    PyObject *exc;
3242    size_t error_pos;
3243    int surrogateescape;
3244
3245    if (locale_error_handler(errors, &surrogateescape) < 0)
3246        return NULL;
3247
3248    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
3249    if (wstr == NULL)
3250        return NULL;
3251
3252    wlen2 = wcslen(wstr);
3253    if (wlen2 != wlen) {
3254        PyMem_Free(wstr);
3255        PyErr_SetString(PyExc_TypeError, "embedded null character");
3256        return NULL;
3257    }
3258
3259    if (surrogateescape) {
3260        /* "surrogateescape" error handler */
3261        char *str;
3262
3263        str = _Py_wchar2char(wstr, &error_pos);
3264        if (str == NULL) {
3265            if (error_pos == (size_t)-1) {
3266                PyErr_NoMemory();
3267                PyMem_Free(wstr);
3268                return NULL;
3269            }
3270            else {
3271                goto encode_error;
3272            }
3273        }
3274        PyMem_Free(wstr);
3275
3276        bytes = PyBytes_FromString(str);
3277        PyMem_Free(str);
3278    }
3279    else {
3280        /* strict mode */
3281        size_t len, len2;
3282
3283        len = wcstombs(NULL, wstr, 0);
3284        if (len == (size_t)-1) {
3285            error_pos = (size_t)-1;
3286            goto encode_error;
3287        }
3288
3289        bytes = PyBytes_FromStringAndSize(NULL, len);
3290        if (bytes == NULL) {
3291            PyMem_Free(wstr);
3292            return NULL;
3293        }
3294
3295        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
3296        if (len2 == (size_t)-1 || len2 > len) {
3297            error_pos = (size_t)-1;
3298            goto encode_error;
3299        }
3300        PyMem_Free(wstr);
3301    }
3302    return bytes;
3303
3304encode_error:
3305    errmsg = strerror(errno);
3306    assert(errmsg != NULL);
3307
3308    if (error_pos == (size_t)-1)
3309        error_pos = wcstombs_errorpos(wstr);
3310
3311    PyMem_Free(wstr);
3312    Py_XDECREF(bytes);
3313
3314    if (errmsg != NULL) {
3315        size_t errlen;
3316        wstr = _Py_char2wchar(errmsg, &errlen);
3317        if (wstr != NULL) {
3318            reason = PyUnicode_FromWideChar(wstr, errlen);
3319            PyMem_Free(wstr);
3320        } else
3321            errmsg = NULL;
3322    }
3323    if (errmsg == NULL)
3324        reason = PyUnicode_FromString(
3325            "wcstombs() encountered an unencodable "
3326            "wide character");
3327    if (reason == NULL)
3328        return NULL;
3329
3330    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
3331                                "locale", unicode,
3332                                (Py_ssize_t)error_pos,
3333                                (Py_ssize_t)(error_pos+1),
3334                                reason);
3335    Py_DECREF(reason);
3336    if (exc != NULL) {
3337        PyCodec_StrictErrors(exc);
3338        Py_XDECREF(exc);
3339    }
3340    return NULL;
3341}
3342
3343PyObject *
3344PyUnicode_EncodeFSDefault(PyObject *unicode)
3345{
3346#ifdef HAVE_MBCS
3347    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3348#elif defined(__APPLE__)
3349    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3350#else
3351    PyInterpreterState *interp = PyThreadState_GET()->interp;
3352    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3353       cannot use it to encode and decode filenames before it is loaded. Load
3354       the Python codec requires to encode at least its own filename. Use the C
3355       version of the locale codec until the codec registry is initialized and
3356       the Python codec is loaded.
3357
3358       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3359       cannot only rely on it: check also interp->fscodec_initialized for
3360       subinterpreters. */
3361    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3362        return PyUnicode_AsEncodedString(unicode,
3363                                         Py_FileSystemDefaultEncoding,
3364                                         "surrogateescape");
3365    }
3366    else {
3367        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3368    }
3369#endif
3370}
3371
3372PyObject *
3373PyUnicode_AsEncodedString(PyObject *unicode,
3374                          const char *encoding,
3375                          const char *errors)
3376{
3377    PyObject *v;
3378    char lower[11];  /* Enough for any encoding shortcut */
3379
3380    if (!PyUnicode_Check(unicode)) {
3381        PyErr_BadArgument();
3382        return NULL;
3383    }
3384
3385    /* Shortcuts for common default encodings */
3386    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3387        if ((strcmp(lower, "utf-8") == 0) ||
3388            (strcmp(lower, "utf8") == 0))
3389        {
3390            if (errors == NULL || strcmp(errors, "strict") == 0)
3391                return _PyUnicode_AsUTF8String(unicode, NULL);
3392            else
3393                return _PyUnicode_AsUTF8String(unicode, errors);
3394        }
3395        else if ((strcmp(lower, "latin-1") == 0) ||
3396                 (strcmp(lower, "latin1") == 0) ||
3397                 (strcmp(lower, "iso-8859-1") == 0))
3398            return _PyUnicode_AsLatin1String(unicode, errors);
3399#ifdef HAVE_MBCS
3400        else if (strcmp(lower, "mbcs") == 0)
3401            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3402#endif
3403        else if (strcmp(lower, "ascii") == 0)
3404            return _PyUnicode_AsASCIIString(unicode, errors);
3405    }
3406
3407    /* Encode via the codec registry */
3408    v = PyCodec_Encode(unicode, encoding, errors);
3409    if (v == NULL)
3410        return NULL;
3411
3412    /* The normal path */
3413    if (PyBytes_Check(v))
3414        return v;
3415
3416    /* If the codec returns a buffer, raise a warning and convert to bytes */
3417    if (PyByteArray_Check(v)) {
3418        int error;
3419        PyObject *b;
3420
3421        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3422            "encoder %s returned bytearray instead of bytes",
3423            encoding);
3424        if (error) {
3425            Py_DECREF(v);
3426            return NULL;
3427        }
3428
3429        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3430        Py_DECREF(v);
3431        return b;
3432    }
3433
3434    PyErr_Format(PyExc_TypeError,
3435                 "encoder did not return a bytes object (type=%.400s)",
3436                 Py_TYPE(v)->tp_name);
3437    Py_DECREF(v);
3438    return NULL;
3439}
3440
3441PyObject *
3442PyUnicode_AsEncodedUnicode(PyObject *unicode,
3443                           const char *encoding,
3444                           const char *errors)
3445{
3446    PyObject *v;
3447
3448    if (!PyUnicode_Check(unicode)) {
3449        PyErr_BadArgument();
3450        goto onError;
3451    }
3452
3453    if (encoding == NULL)
3454        encoding = PyUnicode_GetDefaultEncoding();
3455
3456    /* Encode via the codec registry */
3457    v = PyCodec_Encode(unicode, encoding, errors);
3458    if (v == NULL)
3459        goto onError;
3460    if (!PyUnicode_Check(v)) {
3461        PyErr_Format(PyExc_TypeError,
3462                     "encoder did not return an str object (type=%.400s)",
3463                     Py_TYPE(v)->tp_name);
3464        Py_DECREF(v);
3465        goto onError;
3466    }
3467    return v;
3468
3469  onError:
3470    return NULL;
3471}
3472
/* Locate the first byte sequence in str (len bytes) that mbrtowc() rejects
   as invalid or incomplete, and return its byte offset.  Return 0 when no
   such sequence is found, and always when mbrtowc() is not available. */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cur = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t wc;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&wc, (char*)cur, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cur - str;
        }
        cur += consumed;
        remaining -= consumed;
    }
    /* failed to find the undecodable byte sequence */
    return 0;
#else
    return 0;
#endif
}
3503
3504PyObject*
3505PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3506                              const char *errors)
3507{
3508    wchar_t smallbuf[256];
3509    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3510    wchar_t *wstr;
3511    size_t wlen, wlen2;
3512    PyObject *unicode;
3513    int surrogateescape;
3514    size_t error_pos;
3515    char *errmsg;
3516    PyObject *reason, *exc;
3517
3518    if (locale_error_handler(errors, &surrogateescape) < 0)
3519        return NULL;
3520
3521    if (str[len] != '\0' || len != strlen(str)) {
3522        PyErr_SetString(PyExc_TypeError, "embedded null character");
3523        return NULL;
3524    }
3525
3526    if (surrogateescape) {
3527        /* "surrogateescape" error handler */
3528        wstr = _Py_char2wchar(str, &wlen);
3529        if (wstr == NULL) {
3530            if (wlen == (size_t)-1)
3531                PyErr_NoMemory();
3532            else
3533                PyErr_SetFromErrno(PyExc_OSError);
3534            return NULL;
3535        }
3536
3537        unicode = PyUnicode_FromWideChar(wstr, wlen);
3538        PyMem_Free(wstr);
3539    }
3540    else {
3541        /* strict mode */
3542#ifndef HAVE_BROKEN_MBSTOWCS
3543        wlen = mbstowcs(NULL, str, 0);
3544#else
3545        wlen = len;
3546#endif
3547        if (wlen == (size_t)-1)
3548            goto decode_error;
3549        if (wlen+1 <= smallbuf_len) {
3550            wstr = smallbuf;
3551        }
3552        else {
3553            if (wlen > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1)
3554                return PyErr_NoMemory();
3555
3556            wstr = PyMem_Malloc((wlen+1) * sizeof(wchar_t));
3557            if (!wstr)
3558                return PyErr_NoMemory();
3559        }
3560
3561        wlen2 = mbstowcs(wstr, str, wlen+1);
3562        if (wlen2 == (size_t)-1) {
3563            if (wstr != smallbuf)
3564                PyMem_Free(wstr);
3565            goto decode_error;
3566        }
3567#ifdef HAVE_BROKEN_MBSTOWCS
3568        assert(wlen2 == wlen);
3569#endif
3570        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3571        if (wstr != smallbuf)
3572            PyMem_Free(wstr);
3573    }
3574    return unicode;
3575
3576decode_error:
3577    errmsg = strerror(errno);
3578    assert(errmsg != NULL);
3579
3580    error_pos = mbstowcs_errorpos(str, len);
3581    if (errmsg != NULL) {
3582        size_t errlen;
3583        wstr = _Py_char2wchar(errmsg, &errlen);
3584        if (wstr != NULL) {
3585            reason = PyUnicode_FromWideChar(wstr, errlen);
3586            PyMem_Free(wstr);
3587        } else
3588            errmsg = NULL;
3589    }
3590    if (errmsg == NULL)
3591        reason = PyUnicode_FromString(
3592            "mbstowcs() encountered an invalid multibyte sequence");
3593    if (reason == NULL)
3594        return NULL;
3595
3596    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3597                                "locale", str, len,
3598                                (Py_ssize_t)error_pos,
3599                                (Py_ssize_t)(error_pos+1),
3600                                reason);
3601    Py_DECREF(reason);
3602    if (exc != NULL) {
3603        PyCodec_StrictErrors(exc);
3604        Py_XDECREF(exc);
3605    }
3606    return NULL;
3607}
3608
3609PyObject*
3610PyUnicode_DecodeLocale(const char *str, const char *errors)
3611{
3612    Py_ssize_t size = (Py_ssize_t)strlen(str);
3613    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3614}
3615
3616
3617PyObject*
3618PyUnicode_DecodeFSDefault(const char *s) {
3619    Py_ssize_t size = (Py_ssize_t)strlen(s);
3620    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3621}
3622
3623PyObject*
3624PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3625{
3626#ifdef HAVE_MBCS
3627    return PyUnicode_DecodeMBCS(s, size, NULL);
3628#elif defined(__APPLE__)
3629    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3630#else
3631    PyInterpreterState *interp = PyThreadState_GET()->interp;
3632    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3633       cannot use it to encode and decode filenames before it is loaded. Load
3634       the Python codec requires to encode at least its own filename. Use the C
3635       version of the locale codec until the codec registry is initialized and
3636       the Python codec is loaded.
3637
3638       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3639       cannot only rely on it: check also interp->fscodec_initialized for
3640       subinterpreters. */
3641    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3642        return PyUnicode_Decode(s, size,
3643                                Py_FileSystemDefaultEncoding,
3644                                "surrogateescape");
3645    }
3646    else {
3647        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3648    }
3649#endif
3650}
3651
3652
3653int
3654_PyUnicode_HasNULChars(PyObject* str)
3655{
3656    Py_ssize_t pos;
3657
3658    if (PyUnicode_READY(str) == -1)
3659        return -1;
3660    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3661                   PyUnicode_GET_LENGTH(str), '\0', 1);
3662    if (pos == -1)
3663        return 0;
3664    else
3665        return 1;
3666}
3667
3668int
3669PyUnicode_FSConverter(PyObject* arg, void* addr)
3670{
3671    PyObject *output = NULL;
3672    Py_ssize_t size;
3673    void *data;
3674    if (arg == NULL) {
3675        Py_DECREF(*(PyObject**)addr);
3676        return 1;
3677    }
3678    if (PyBytes_Check(arg)) {
3679        output = arg;
3680        Py_INCREF(output);
3681    }
3682    else {
3683        arg = PyUnicode_FromObject(arg);
3684        if (!arg)
3685            return 0;
3686        output = PyUnicode_EncodeFSDefault(arg);
3687        Py_DECREF(arg);
3688        if (!output)
3689            return 0;
3690        if (!PyBytes_Check(output)) {
3691            Py_DECREF(output);
3692            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3693            return 0;
3694        }
3695    }
3696    size = PyBytes_GET_SIZE(output);
3697    data = PyBytes_AS_STRING(output);
3698    if (size != strlen(data)) {
3699        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3700        Py_DECREF(output);
3701        return 0;
3702    }
3703    *(PyObject**)addr = output;
3704    return Py_CLEANUP_SUPPORTED;
3705}
3706
3707
3708int
3709PyUnicode_FSDecoder(PyObject* arg, void* addr)
3710{
3711    PyObject *output = NULL;
3712    if (arg == NULL) {
3713        Py_DECREF(*(PyObject**)addr);
3714        return 1;
3715    }
3716    if (PyUnicode_Check(arg)) {
3717        if (PyUnicode_READY(arg) == -1)
3718            return 0;
3719        output = arg;
3720        Py_INCREF(output);
3721    }
3722    else {
3723        arg = PyBytes_FromObject(arg);
3724        if (!arg)
3725            return 0;
3726        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3727                                                  PyBytes_GET_SIZE(arg));
3728        Py_DECREF(arg);
3729        if (!output)
3730            return 0;
3731        if (!PyUnicode_Check(output)) {
3732            Py_DECREF(output);
3733            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3734            return 0;
3735        }
3736    }
3737    if (PyUnicode_READY(output) == -1) {
3738        Py_DECREF(output);
3739        return 0;
3740    }
3741    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3742                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3743        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3744        Py_DECREF(output);
3745        return 0;
3746    }
3747    *(PyObject**)addr = output;
3748    return Py_CLEANUP_SUPPORTED;
3749}
3750
3751
3752char*
3753PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3754{
3755    PyObject *bytes;
3756
3757    if (!PyUnicode_Check(unicode)) {
3758        PyErr_BadArgument();
3759        return NULL;
3760    }
3761    if (PyUnicode_READY(unicode) == -1)
3762        return NULL;
3763
3764    if (PyUnicode_UTF8(unicode) == NULL) {
3765        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3766        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3767        if (bytes == NULL)
3768            return NULL;
3769        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3770        if (_PyUnicode_UTF8(unicode) == NULL) {
3771            Py_DECREF(bytes);
3772            return NULL;
3773        }
3774        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3775        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3776                  PyBytes_AS_STRING(bytes),
3777                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3778        Py_DECREF(bytes);
3779    }
3780
3781    if (psize)
3782        *psize = PyUnicode_UTF8_LENGTH(unicode);
3783    return PyUnicode_UTF8(unicode);
3784}
3785
3786char*
3787PyUnicode_AsUTF8(PyObject *unicode)
3788{
3789    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3790}
3791
3792Py_UNICODE *
3793PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
3794{
3795    const unsigned char *one_byte;
3796#if SIZEOF_WCHAR_T == 4
3797    const Py_UCS2 *two_bytes;
3798#else
3799    const Py_UCS4 *four_bytes;
3800    const Py_UCS4 *ucs4_end;
3801    Py_ssize_t num_surrogates;
3802#endif
3803    wchar_t *w;
3804    wchar_t *wchar_end;
3805
3806    if (!PyUnicode_Check(unicode)) {
3807        PyErr_BadArgument();
3808        return NULL;
3809    }
3810    if (_PyUnicode_WSTR(unicode) == NULL) {
3811        /* Non-ASCII compact unicode object */
3812        assert(_PyUnicode_KIND(unicode) != 0);
3813        assert(PyUnicode_IS_READY(unicode));
3814
3815        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
3816#if SIZEOF_WCHAR_T == 2
3817            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3818            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
3819            num_surrogates = 0;
3820
3821            for (; four_bytes < ucs4_end; ++four_bytes) {
3822                if (*four_bytes > 0xFFFF)
3823                    ++num_surrogates;
3824            }
3825
3826            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
3827                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
3828            if (!_PyUnicode_WSTR(unicode)) {
3829                PyErr_NoMemory();
3830                return NULL;
3831            }
3832            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;
3833
3834            w = _PyUnicode_WSTR(unicode);
3835            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
3836            four_bytes = PyUnicode_4BYTE_DATA(unicode);
3837            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
3838                if (*four_bytes > 0xFFFF) {
3839                    assert(*four_bytes <= MAX_UNICODE);
3840                    /* encode surrogate pair in this case */
3841                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
3842                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
3843                }
3844                else
3845                    *w = *four_bytes;
3846
3847                if (w > wchar_end) {
3848                    assert(0 && "Miscalculated string end");
3849                }
3850            }
3851            *w = 0;
3852#else
3853            /* sizeof(wchar_t) == 4 */
3854            Py_FatalError("Impossible unicode object state, wstr and str "
3855                          "should share memory already.");
3856            return NULL;
3857#endif
3858        }
3859        else {
3860            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
3861                                                  (_PyUnicode_LENGTH(unicode) + 1));
3862            if (!_PyUnicode_WSTR(unicode)) {
3863                PyErr_NoMemory();
3864                return NULL;
3865            }
3866            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
3867                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
3868            w = _PyUnicode_WSTR(unicode);
3869            wchar_end = w + _PyUnicode_LENGTH(unicode);
3870
3871            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
3872                one_byte = PyUnicode_1BYTE_DATA(unicode);
3873                for (; w < wchar_end; ++one_byte, ++w)
3874                    *w = *one_byte;
3875                /* null-terminate the wstr */
3876                *w = 0;
3877            }
3878            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
3879#if SIZEOF_WCHAR_T == 4
3880                two_bytes = PyUnicode_2BYTE_DATA(unicode);
3881                for (; w < wchar_end; ++two_bytes, ++w)
3882                    *w = *two_bytes;
3883                /* null-terminate the wstr */
3884                *w = 0;
3885#else
3886                /* sizeof(wchar_t) == 2 */
3887                PyObject_FREE(_PyUnicode_WSTR(unicode));
3888                _PyUnicode_WSTR(unicode) = NULL;
3889                Py_FatalError("Impossible unicode object state, wstr "
3890                              "and str should share memory already.");
3891                return NULL;
3892#endif
3893            }
3894            else {
3895                assert(0 && "This should never happen.");
3896            }
3897        }
3898    }
3899    if (size != NULL)
3900        *size = PyUnicode_WSTR_LENGTH(unicode);
3901    return _PyUnicode_WSTR(unicode);
3902}
3903
3904Py_UNICODE *
3905PyUnicode_AsUnicode(PyObject *unicode)
3906{
3907    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3908}
3909
3910
3911Py_ssize_t
3912PyUnicode_GetSize(PyObject *unicode)
3913{
3914    if (!PyUnicode_Check(unicode)) {
3915        PyErr_BadArgument();
3916        goto onError;
3917    }
3918    return PyUnicode_GET_SIZE(unicode);
3919
3920  onError:
3921    return -1;
3922}
3923
3924Py_ssize_t
3925PyUnicode_GetLength(PyObject *unicode)
3926{
3927    if (!PyUnicode_Check(unicode)) {
3928        PyErr_BadArgument();
3929        return -1;
3930    }
3931    if (PyUnicode_READY(unicode) == -1)
3932        return -1;
3933    return PyUnicode_GET_LENGTH(unicode);
3934}
3935
3936Py_UCS4
3937PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3938{
3939    void *data;
3940    int kind;
3941
3942    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3943        PyErr_BadArgument();
3944        return (Py_UCS4)-1;
3945    }
3946    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3947        PyErr_SetString(PyExc_IndexError, "string index out of range");
3948        return (Py_UCS4)-1;
3949    }
3950    data = PyUnicode_DATA(unicode);
3951    kind = PyUnicode_KIND(unicode);
3952    return PyUnicode_READ(kind, data, index);
3953}
3954
3955int
3956PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3957{
3958    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3959        PyErr_BadArgument();
3960        return -1;
3961    }
3962    assert(PyUnicode_IS_READY(unicode));
3963    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3964        PyErr_SetString(PyExc_IndexError, "string index out of range");
3965        return -1;
3966    }
3967    if (unicode_check_modifiable(unicode))
3968        return -1;
3969    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3970        PyErr_SetString(PyExc_ValueError, "character out of range");
3971        return -1;
3972    }
3973    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3974                    index, ch);
3975    return 0;
3976}
3977
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be freed. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3983
3984/* create or adjust a UnicodeDecodeError */
3985static void
3986make_decode_exception(PyObject **exceptionObject,
3987                      const char *encoding,
3988                      const char *input, Py_ssize_t length,
3989                      Py_ssize_t startpos, Py_ssize_t endpos,
3990                      const char *reason)
3991{
3992    if (*exceptionObject == NULL) {
3993        *exceptionObject = PyUnicodeDecodeError_Create(
3994            encoding, input, length, startpos, endpos, reason);
3995    }
3996    else {
3997        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3998            goto onError;
3999        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
4000            goto onError;
4001        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4002            goto onError;
4003    }
4004    return;
4005
4006onError:
4007    Py_DECREF(*exceptionObject);
4008    *exceptionObject = NULL;
4009}
4010
4011#ifdef HAVE_MBCS
4012/* error handling callback helper:
4013   build arguments, call the callback and check the arguments,
4014   if no exception occurred, copy the replacement to the output
4015   and adjust various state variables.
4016   return 0 on success, -1 on error
4017*/
4018
4019static int
4020unicode_decode_call_errorhandler_wchar(
4021    const char *errors, PyObject **errorHandler,
4022    const char *encoding, const char *reason,
4023    const char **input, const char **inend, Py_ssize_t *startinpos,
4024    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4025    PyObject **output, Py_ssize_t *outpos)
4026{
4027    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4028
4029    PyObject *restuple = NULL;
4030    PyObject *repunicode = NULL;
4031    Py_ssize_t outsize;
4032    Py_ssize_t insize;
4033    Py_ssize_t requiredsize;
4034    Py_ssize_t newpos;
4035    PyObject *inputobj = NULL;
4036    wchar_t *repwstr;
4037    Py_ssize_t repwlen;
4038
4039    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4040    outsize = _PyUnicode_WSTR_LENGTH(*output);
4041
4042    if (*errorHandler == NULL) {
4043        *errorHandler = PyCodec_LookupError(errors);
4044        if (*errorHandler == NULL)
4045            goto onError;
4046    }
4047
4048    make_decode_exception(exceptionObject,
4049        encoding,
4050        *input, *inend - *input,
4051        *startinpos, *endinpos,
4052        reason);
4053    if (*exceptionObject == NULL)
4054        goto onError;
4055
4056    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4057    if (restuple == NULL)
4058        goto onError;
4059    if (!PyTuple_Check(restuple)) {
4060        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4061        goto onError;
4062    }
4063    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4064        goto onError;
4065
4066    /* Copy back the bytes variables, which might have been modified by the
4067       callback */
4068    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4069    if (!inputobj)
4070        goto onError;
4071    if (!PyBytes_Check(inputobj)) {
4072        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4073    }
4074    *input = PyBytes_AS_STRING(inputobj);
4075    insize = PyBytes_GET_SIZE(inputobj);
4076    *inend = *input + insize;
4077    /* we can DECREF safely, as the exception has another reference,
4078       so the object won't go away. */
4079    Py_DECREF(inputobj);
4080
4081    if (newpos<0)
4082        newpos = insize+newpos;
4083    if (newpos<0 || newpos>insize) {
4084        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4085        goto onError;
4086    }
4087
4088    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4089    if (repwstr == NULL)
4090        goto onError;
4091    /* need more space? (at least enough for what we
4092       have+the replacement+the rest of the string (starting
4093       at the new input position), so we won't have to check space
4094       when there are no errors in the rest of the string) */
4095    requiredsize = *outpos + repwlen + insize-newpos;
4096    if (requiredsize > outsize) {
4097        if (requiredsize < 2*outsize)
4098            requiredsize = 2*outsize;
4099        if (unicode_resize(output, requiredsize) < 0)
4100            goto onError;
4101    }
4102    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4103    *outpos += repwlen;
4104
4105    *endinpos = newpos;
4106    *inptr = *input + newpos;
4107
4108    /* we made it! */
4109    Py_XDECREF(restuple);
4110    return 0;
4111
4112  onError:
4113    Py_XDECREF(restuple);
4114    return -1;
4115}
4116#endif   /* HAVE_MBCS */
4117
4118static int
4119unicode_decode_call_errorhandler_writer(
4120    const char *errors, PyObject **errorHandler,
4121    const char *encoding, const char *reason,
4122    const char **input, const char **inend, Py_ssize_t *startinpos,
4123    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4124    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4125{
4126    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4127
4128    PyObject *restuple = NULL;
4129    PyObject *repunicode = NULL;
4130    Py_ssize_t insize;
4131    Py_ssize_t newpos;
4132    Py_ssize_t replen;
4133    PyObject *inputobj = NULL;
4134
4135    if (*errorHandler == NULL) {
4136        *errorHandler = PyCodec_LookupError(errors);
4137        if (*errorHandler == NULL)
4138            goto onError;
4139    }
4140
4141    make_decode_exception(exceptionObject,
4142        encoding,
4143        *input, *inend - *input,
4144        *startinpos, *endinpos,
4145        reason);
4146    if (*exceptionObject == NULL)
4147        goto onError;
4148
4149    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4150    if (restuple == NULL)
4151        goto onError;
4152    if (!PyTuple_Check(restuple)) {
4153        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4154        goto onError;
4155    }
4156    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4157        goto onError;
4158
4159    /* Copy back the bytes variables, which might have been modified by the
4160       callback */
4161    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4162    if (!inputobj)
4163        goto onError;
4164    if (!PyBytes_Check(inputobj)) {
4165        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4166    }
4167    *input = PyBytes_AS_STRING(inputobj);
4168    insize = PyBytes_GET_SIZE(inputobj);
4169    *inend = *input + insize;
4170    /* we can DECREF safely, as the exception has another reference,
4171       so the object won't go away. */
4172    Py_DECREF(inputobj);
4173
4174    if (newpos<0)
4175        newpos = insize+newpos;
4176    if (newpos<0 || newpos>insize) {
4177        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4178        goto onError;
4179    }
4180
4181    if (PyUnicode_READY(repunicode) < 0)
4182        goto onError;
4183    replen = PyUnicode_GET_LENGTH(repunicode);
4184    writer->min_length += replen;
4185    if (replen > 1)
4186        writer->overallocate = 1;
4187    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4188        goto onError;
4189
4190    *endinpos = newpos;
4191    *inptr = *input + newpos;
4192
4193    /* we made it! */
4194    Py_XDECREF(restuple);
4195    return 0;
4196
4197  onError:
4198    Py_XDECREF(restuple);
4199    return -1;
4200}
4201
4202/* --- UTF-7 Codec -------------------------------------------------------- */
4203
4204/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4205
/* Three simple macros defining base-64.  IS_BASE64 and FROM_BASE64 mention
   their argument several times, so only pass expressions without side
   effects. */

/* Is c a base-64 character?  True for 'A'-'Z', 'a'-'z', '0'-'9', '+'
   and '/'. */

#define IS_BASE64(c) \
    (((c) >= 'A' && (c) <= 'Z') ||     \
     ((c) >= 'a' && (c) <= 'z') ||     \
     ((c) >= '0' && (c) <= '9') ||     \
     (c) == '+' || (c) == '/')

/* given that c is a base-64 character, what is its base-64 value?
   'A'-'Z' -> 0-25, 'a'-'z' -> 26-51, '0'-'9' -> 52-61, '+' -> 62 and
   anything else (i.e. '/') -> 63.  The argument is not validated. */

#define FROM_BASE64(c)                                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :                           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :                      \
     (c) == '+' ? 62 : 63)

/* What is the base-64 character of the bottom 6 bits of n?  Higher bits
   of n are masked off before indexing the alphabet. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
 * decoded as itself.  We are permissive on decoding; the only ASCII
 * byte not decoding to itself is the + which begins a base64
 * string.  Bytes above 127 are never decoded directly. */

#define DECODE_DIRECT(c)                                \
    ((c) <= 127 && (c) != '+')
4236
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by the ASCII code (0-127) and is read-only, hence
 * const.
 */

static const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
4270
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c        - the code point to classify; only values 1..127 can be direct
 *            (NUL and everything >= 128 never are).
 * directO  - non-zero if Set O characters (category 1) may be written
 *            directly.
 * directWS - non-zero if whitespace (category 2) may be written directly.
 *
 * Set D characters (category 0) are always direct.  The arguments are
 * expanded several times and directO/directWS are pasted unparenthesized,
 * so pass simple, side-effect-free expressions. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      (directWS && (utf7_category[(c)] == 2)) ||        \
      (directO && (utf7_category[(c)] == 1))))
4282
4283PyObject *
4284PyUnicode_DecodeUTF7(const char *s,
4285                     Py_ssize_t size,
4286                     const char *errors)
4287{
4288    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4289}
4290
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size  - the UTF-7 encoded input bytes.
 * errors   - error handler name, forwarded to
 *            unicode_decode_call_errorhandler_writer().
 * consumed - if not NULL, receives the number of input bytes consumed;
 *            when the input ends inside a shift sequence this is the
 *            offset of the '+' that opened it and the output produced by
 *            that sequence is dropped.  If NULL, an input ending in an
 *            inconsistent shift state is reported as an
 *            "unterminated shift sequence" error.
 *
 * Returns the result of _PyUnicodeWriter_Finish(), or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;              /* input offset of the current '+' or bad byte */
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;                    /* non-zero while inside a base-64 section */
    Py_ssize_t shiftOutStart;           /* writer position when the section began */
    unsigned int base64bits = 0;        /* number of pending bits in base64buffer */
    unsigned long base64buffer = 0;     /* pending bits, most recent in the low end */
    Py_UCS4 surrogate = 0;              /* pending high surrogate, 0 if none */
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* Also entered from the end-of-string check below when the error
           handler moved the read position back into the input. */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* unpaired high surrogate: emit it as is and
                               process outCh normally below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a high surrogate that never got its partner */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    /* NOTE(review): ch is written without consulting
                       DECODE_DIRECT, so a terminating byte >= 128 is
                       emitted as that code point -- confirm intended. */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                /* remember where to back off to if the input ends inside
                   this section (see "return state" below) */
                shiftOutStart = writer.pos;
                base64bits = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* The offending range is [startinpos, current position); the
           handler may move s and replace the input buffer. */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* the handler may have asked to resume before the end */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            writer.pos = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4477
4478
4479PyObject *
4480_PyUnicode_EncodeUTF7(PyObject *str,
4481                      int base64SetO,
4482                      int base64WhiteSpace,
4483                      const char *errors)
4484{
4485    int kind;
4486    void *data;
4487    Py_ssize_t len;
4488    PyObject *v;
4489    int inShift = 0;
4490    Py_ssize_t i;
4491    unsigned int base64bits = 0;
4492    unsigned long base64buffer = 0;
4493    char * out;
4494    char * start;
4495
4496    if (PyUnicode_READY(str) == -1)
4497        return NULL;
4498    kind = PyUnicode_KIND(str);
4499    data = PyUnicode_DATA(str);
4500    len = PyUnicode_GET_LENGTH(str);
4501
4502    if (len == 0)
4503        return PyBytes_FromStringAndSize(NULL, 0);
4504
4505    /* It might be possible to tighten this worst case */
4506    if (len > PY_SSIZE_T_MAX / 8)
4507        return PyErr_NoMemory();
4508    v = PyBytes_FromStringAndSize(NULL, len * 8);
4509    if (v == NULL)
4510        return NULL;
4511
4512    start = out = PyBytes_AS_STRING(v);
4513    for (i = 0; i < len; ++i) {
4514        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4515
4516        if (inShift) {
4517            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4518                /* shifting out */
4519                if (base64bits) { /* output remaining bits */
4520                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4521                    base64buffer = 0;
4522                    base64bits = 0;
4523                }
4524                inShift = 0;
4525                /* Characters not in the BASE64 set implicitly unshift the sequence
4526                   so no '-' is required, except if the character is itself a '-' */
4527                if (IS_BASE64(ch) || ch == '-') {
4528                    *out++ = '-';
4529                }
4530                *out++ = (char) ch;
4531            }
4532            else {
4533                goto encode_char;
4534            }
4535        }
4536        else { /* not in a shift sequence */
4537            if (ch == '+') {
4538                *out++ = '+';
4539                        *out++ = '-';
4540            }
4541            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4542                *out++ = (char) ch;
4543            }
4544            else {
4545                *out++ = '+';
4546                inShift = 1;
4547                goto encode_char;
4548            }
4549        }
4550        continue;
4551encode_char:
4552        if (ch >= 0x10000) {
4553            assert(ch <= MAX_UNICODE);
4554
4555            /* code first surrogate */
4556            base64bits += 16;
4557            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4558            while (base64bits >= 6) {
4559                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4560                base64bits -= 6;
4561            }
4562            /* prepare second surrogate */
4563            ch = Py_UNICODE_LOW_SURROGATE(ch);
4564        }
4565        base64bits += 16;
4566        base64buffer = (base64buffer << 16) | ch;
4567        while (base64bits >= 6) {
4568            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4569            base64bits -= 6;
4570        }
4571    }
4572    if (base64bits)
4573        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4574    if (inShift)
4575        *out++ = '-';
4576    if (_PyBytes_Resize(&v, out - start) < 0)
4577        return NULL;
4578    return v;
4579}
4580PyObject *
4581PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4582                     Py_ssize_t size,
4583                     int base64SetO,
4584                     int base64WhiteSpace,
4585                     const char *errors)
4586{
4587    PyObject *result;
4588    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4589    if (tmp == NULL)
4590        return NULL;
4591    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4592                                   base64WhiteSpace, errors);
4593    Py_DECREF(tmp);
4594    return result;
4595}
4596
4597#undef IS_BASE64
4598#undef FROM_BASE64
4599#undef TO_BASE64
4600#undef DECODE_DIRECT
4601#undef ENCODE_DIRECT
4602
4603/* --- UTF-8 Codec -------------------------------------------------------- */
4604
4605PyObject *
4606PyUnicode_DecodeUTF8(const char *s,
4607                     Py_ssize_t size,
4608                     const char *errors)
4609{
4610    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4611}
4612
4613#include "stringlib/asciilib.h"
4614#include "stringlib/codecs.h"
4615#include "stringlib/undef.h"
4616
4617#include "stringlib/ucs1lib.h"
4618#include "stringlib/codecs.h"
4619#include "stringlib/undef.h"
4620
4621#include "stringlib/ucs2lib.h"
4622#include "stringlib/codecs.h"
4623#include "stringlib/undef.h"
4624
4625#include "stringlib/ucs4lib.h"
4626#include "stringlib/codecs.h"
4627#include "stringlib/undef.h"
4628
/* Mask to quickly check whether a C 'long' contains a
   non-ASCII, UTF8-encoded char.  The mask has the top bit (0x80) of every
   byte set, so `word & ASCII_CHAR_MASK` is non-zero exactly when at least
   one byte of the word is >= 0x80.  Only 4- and 8-byte longs are
   supported; any other size is a compile-time error. */
#if (SIZEOF_LONG == 8)
# define ASCII_CHAR_MASK 0x8080808080808080UL
#elif (SIZEOF_LONG == 4)
# define ASCII_CHAR_MASK 0x80808080UL
#else
# error C 'long' size should be either 4 or 8!
#endif
4638
4639static Py_ssize_t
4640ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4641{
4642    const char *p = start;
4643    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4644
4645    /*
4646     * Issue #17237: m68k is a bit different from most architectures in
4647     * that objects do not use "natural alignment" - for example, int and
4648     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4649     * won't work; also, tests have shown that skipping the "optimised
4650     * version" will even speed up m68k.
4651     */
4652#if !defined(__m68k__)
4653#if SIZEOF_LONG <= SIZEOF_VOID_P
4654    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4655    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4656        /* Fast path, see in STRINGLIB(utf8_decode) for
4657           an explanation. */
4658        /* Help register allocation */
4659        register const char *_p = p;
4660        register Py_UCS1 * q = dest;
4661        while (_p < aligned_end) {
4662            unsigned long value = *(const unsigned long *) _p;
4663            if (value & ASCII_CHAR_MASK)
4664                break;
4665            *((unsigned long *)q) = value;
4666            _p += SIZEOF_LONG;
4667            q += SIZEOF_LONG;
4668        }
4669        p = _p;
4670        while (p < end) {
4671            if ((unsigned char)*p & 0x80)
4672                break;
4673            *q++ = *p++;
4674        }
4675        return p - start;
4676    }
4677#endif
4678#endif
4679    while (p < end) {
4680        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4681           for an explanation. */
4682        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4683            /* Help register allocation */
4684            register const char *_p = p;
4685            while (_p < aligned_end) {
4686                unsigned long value = *(unsigned long *) _p;
4687                if (value & ASCII_CHAR_MASK)
4688                    break;
4689                _p += SIZEOF_LONG;
4690            }
4691            p = _p;
4692            if (_p == end)
4693                break;
4694        }
4695        if ((unsigned char)*p & 0x80)
4696            break;
4697        ++p;
4698    }
4699    memcpy(dest, start, p - start);
4700    return p - start;
4701}
4702
4703PyObject *
4704PyUnicode_DecodeUTF8Stateful(const char *s,
4705                             Py_ssize_t size,
4706                             const char *errors,
4707                             Py_ssize_t *consumed)
4708{
4709    _PyUnicodeWriter writer;
4710    const char *starts = s;
4711    const char *end = s + size;
4712
4713    Py_ssize_t startinpos;
4714    Py_ssize_t endinpos;
4715    const char *errmsg = "";
4716    PyObject *errorHandler = NULL;
4717    PyObject *exc = NULL;
4718
4719    if (size == 0) {
4720        if (consumed)
4721            *consumed = 0;
4722        _Py_RETURN_UNICODE_EMPTY();
4723    }
4724
4725    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4726    if (size == 1 && (unsigned char)s[0] < 128) {
4727        if (consumed)
4728            *consumed = 1;
4729        return get_latin1_char((unsigned char)s[0]);
4730    }
4731
4732    _PyUnicodeWriter_Init(&writer);
4733    writer.min_length = size;
4734    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4735        goto onError;
4736
4737    writer.pos = ascii_decode(s, end, writer.data);
4738    s += writer.pos;
4739    while (s < end) {
4740        Py_UCS4 ch;
4741        int kind = writer.kind;
4742        if (kind == PyUnicode_1BYTE_KIND) {
4743            if (PyUnicode_IS_ASCII(writer.buffer))
4744                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4745            else
4746                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4747        } else if (kind == PyUnicode_2BYTE_KIND) {
4748            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4749        } else {
4750            assert(kind == PyUnicode_4BYTE_KIND);
4751            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4752        }
4753
4754        switch (ch) {
4755        case 0:
4756            if (s == end || consumed)
4757                goto End;
4758            errmsg = "unexpected end of data";
4759            startinpos = s - starts;
4760            endinpos = end - starts;
4761            break;
4762        case 1:
4763            errmsg = "invalid start byte";
4764            startinpos = s - starts;
4765            endinpos = startinpos + 1;
4766            break;
4767        case 2:
4768        case 3:
4769        case 4:
4770            errmsg = "invalid continuation byte";
4771            startinpos = s - starts;
4772            endinpos = startinpos + ch - 1;
4773            break;
4774        default:
4775            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4776                goto onError;
4777            continue;
4778        }
4779
4780        if (unicode_decode_call_errorhandler_writer(
4781                errors, &errorHandler,
4782                "utf-8", errmsg,
4783                &starts, &end, &startinpos, &endinpos, &exc, &s,
4784                &writer))
4785            goto onError;
4786    }
4787
4788End:
4789    if (consumed)
4790        *consumed = s - starts;
4791
4792    Py_XDECREF(errorHandler);
4793    Py_XDECREF(exc);
4794    return _PyUnicodeWriter_Finish(&writer);
4795
4796onError:
4797    Py_XDECREF(errorHandler);
4798    Py_XDECREF(exc);
4799    _PyUnicodeWriter_Dealloc(&writer);
4800    return NULL;
4801}
4802
4803#ifdef __APPLE__
4804
4805/* Simplified UTF-8 decoder using surrogateescape error handler,
4806   used to decode the command line arguments on Mac OS X.
4807
4808   Return a pointer to a newly allocated wide character string (use
4809   PyMem_Free() to free the memory), or NULL on memory allocation error. */
4810
4811wchar_t*
4812_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4813{
4814    const char *e;
4815    wchar_t *unicode;
4816    Py_ssize_t outpos;
4817
4818    /* Note: size will always be longer than the resulting Unicode
4819       character count */
4820    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4821        return NULL;
4822    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
4823    if (!unicode)
4824        return NULL;
4825
4826    /* Unpack UTF-8 encoded data */
4827    e = s + size;
4828    outpos = 0;
4829    while (s < e) {
4830        Py_UCS4 ch;
4831#if SIZEOF_WCHAR_T == 4
4832        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4833#else
4834        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4835#endif
4836        if (ch > 0xFF) {
4837#if SIZEOF_WCHAR_T == 4
4838            assert(0);
4839#else
4840            assert(Py_UNICODE_IS_SURROGATE(ch));
4841            /*  compute and append the two surrogates: */
4842            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4843            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4844#endif
4845        }
4846        else {
4847            if (!ch && s == e)
4848                break;
4849            /* surrogateescape */
4850            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4851        }
4852    }
4853    unicode[outpos] = L'\0';
4854    return unicode;
4855}
4856
4857#endif /* __APPLE__ */
4858
4859/* Primary internal function which creates utf8 encoded bytes objects.
4860
4861   Allocation strategy:  if the string is short, convert into a stack buffer
4862   and allocate exactly as much space needed at the end.  Else allocate the
4863   maximum possible needed (4 result bytes per Unicode character), and return
4864   the excess memory at the end.
4865*/
4866PyObject *
4867_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4868{
4869    enum PyUnicode_Kind kind;
4870    void *data;
4871    Py_ssize_t size;
4872
4873    if (!PyUnicode_Check(unicode)) {
4874        PyErr_BadArgument();
4875        return NULL;
4876    }
4877
4878    if (PyUnicode_READY(unicode) == -1)
4879        return NULL;
4880
4881    if (PyUnicode_UTF8(unicode))
4882        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4883                                         PyUnicode_UTF8_LENGTH(unicode));
4884
4885    kind = PyUnicode_KIND(unicode);
4886    data = PyUnicode_DATA(unicode);
4887    size = PyUnicode_GET_LENGTH(unicode);
4888
4889    switch (kind) {
4890    default:
4891        assert(0);
4892    case PyUnicode_1BYTE_KIND:
4893        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4894        assert(!PyUnicode_IS_ASCII(unicode));
4895        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4896    case PyUnicode_2BYTE_KIND:
4897        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4898    case PyUnicode_4BYTE_KIND:
4899        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4900    }
4901}
4902
4903PyObject *
4904PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4905                     Py_ssize_t size,
4906                     const char *errors)
4907{
4908    PyObject *v, *unicode;
4909
4910    unicode = PyUnicode_FromUnicode(s, size);
4911    if (unicode == NULL)
4912        return NULL;
4913    v = _PyUnicode_AsUTF8String(unicode, errors);
4914    Py_DECREF(unicode);
4915    return v;
4916}
4917
4918PyObject *
4919PyUnicode_AsUTF8String(PyObject *unicode)
4920{
4921    return _PyUnicode_AsUTF8String(unicode, NULL);
4922}
4923
4924/* --- UTF-32 Codec ------------------------------------------------------- */
4925
4926PyObject *
4927PyUnicode_DecodeUTF32(const char *s,
4928                      Py_ssize_t size,
4929                      const char *errors,
4930                      int *byteorder)
4931{
4932    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4933}
4934
4935PyObject *
4936PyUnicode_DecodeUTF32Stateful(const char *s,
4937                              Py_ssize_t size,
4938                              const char *errors,
4939                              int *byteorder,
4940                              Py_ssize_t *consumed)
4941{
4942    const char *starts = s;
4943    Py_ssize_t startinpos;
4944    Py_ssize_t endinpos;
4945    _PyUnicodeWriter writer;
4946    const unsigned char *q, *e;
4947    int le, bo = 0;       /* assume native ordering by default */
4948    const char *errmsg = "";
4949    PyObject *errorHandler = NULL;
4950    PyObject *exc = NULL;
4951
4952    q = (unsigned char *)s;
4953    e = q + size;
4954
4955    if (byteorder)
4956        bo = *byteorder;
4957
4958    /* Check for BOM marks (U+FEFF) in the input and adjust current
4959       byte order setting accordingly. In native mode, the leading BOM
4960       mark is skipped, in all other modes, it is copied to the output
4961       stream as-is (giving a ZWNBSP character). */
4962    if (bo == 0 && size >= 4) {
4963        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4964        if (bom == 0x0000FEFF) {
4965            bo = -1;
4966            q += 4;
4967        }
4968        else if (bom == 0xFFFE0000) {
4969            bo = 1;
4970            q += 4;
4971        }
4972        if (byteorder)
4973            *byteorder = bo;
4974    }
4975
4976    if (q == e) {
4977        if (consumed)
4978            *consumed = size;
4979        _Py_RETURN_UNICODE_EMPTY();
4980    }
4981
4982#ifdef WORDS_BIGENDIAN
4983    le = bo < 0;
4984#else
4985    le = bo <= 0;
4986#endif
4987
4988    _PyUnicodeWriter_Init(&writer);
4989    writer.min_length = (e - q + 3) / 4;
4990    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4991        goto onError;
4992
4993    while (1) {
4994        Py_UCS4 ch = 0;
4995        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
4996
4997        if (e - q >= 4) {
4998            enum PyUnicode_Kind kind = writer.kind;
4999            void *data = writer.data;
5000            const unsigned char *last = e - 4;
5001            Py_ssize_t pos = writer.pos;
5002            if (le) {
5003                do {
5004                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5005                    if (ch > maxch)
5006                        break;
5007                    PyUnicode_WRITE(kind, data, pos++, ch);
5008                    q += 4;
5009                } while (q <= last);
5010            }
5011            else {
5012                do {
5013                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5014                    if (ch > maxch)
5015                        break;
5016                    PyUnicode_WRITE(kind, data, pos++, ch);
5017                    q += 4;
5018                } while (q <= last);
5019            }
5020            writer.pos = pos;
5021        }
5022
5023        if (ch <= maxch) {
5024            if (q == e || consumed)
5025                break;
5026            /* remaining bytes at the end? (size should be divisible by 4) */
5027            errmsg = "truncated data";
5028            startinpos = ((const char *)q) - starts;
5029            endinpos = ((const char *)e) - starts;
5030        }
5031        else {
5032            if (ch < 0x110000) {
5033                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5034                    goto onError;
5035                q += 4;
5036                continue;
5037            }
5038            errmsg = "codepoint not in range(0x110000)";
5039            startinpos = ((const char *)q) - starts;
5040            endinpos = startinpos + 4;
5041        }
5042
5043        /* The remaining input chars are ignored if the callback
5044           chooses to skip the input */
5045        if (unicode_decode_call_errorhandler_writer(
5046                errors, &errorHandler,
5047                "utf32", errmsg,
5048                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5049                &writer))
5050            goto onError;
5051    }
5052
5053    if (consumed)
5054        *consumed = (const char *)q-starts;
5055
5056    Py_XDECREF(errorHandler);
5057    Py_XDECREF(exc);
5058    return _PyUnicodeWriter_Finish(&writer);
5059
5060  onError:
5061    _PyUnicodeWriter_Dealloc(&writer);
5062    Py_XDECREF(errorHandler);
5063    Py_XDECREF(exc);
5064    return NULL;
5065}
5066
5067PyObject *
5068_PyUnicode_EncodeUTF32(PyObject *str,
5069                       const char *errors,
5070                       int byteorder)
5071{
5072    int kind;
5073    void *data;
5074    Py_ssize_t len;
5075    PyObject *v;
5076    unsigned char *p;
5077    Py_ssize_t nsize, i;
5078    /* Offsets from p for storing byte pairs in the right order. */
5079#if PY_LITTLE_ENDIAN
5080    int iorder[] = {0, 1, 2, 3};
5081#else
5082    int iorder[] = {3, 2, 1, 0};
5083#endif
5084
5085#define STORECHAR(CH)                           \
5086    do {                                        \
5087        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5088        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5089        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5090        p[iorder[0]] = (CH) & 0xff;             \
5091        p += 4;                                 \
5092    } while(0)
5093
5094    if (!PyUnicode_Check(str)) {
5095        PyErr_BadArgument();
5096        return NULL;
5097    }
5098    if (PyUnicode_READY(str) == -1)
5099        return NULL;
5100    kind = PyUnicode_KIND(str);
5101    data = PyUnicode_DATA(str);
5102    len = PyUnicode_GET_LENGTH(str);
5103
5104    nsize = len + (byteorder == 0);
5105    if (nsize > PY_SSIZE_T_MAX / 4)
5106        return PyErr_NoMemory();
5107    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5108    if (v == NULL)
5109        return NULL;
5110
5111    p = (unsigned char *)PyBytes_AS_STRING(v);
5112    if (byteorder == 0)
5113        STORECHAR(0xFEFF);
5114    if (len == 0)
5115        goto done;
5116
5117    if (byteorder == -1) {
5118        /* force LE */
5119        iorder[0] = 0;
5120        iorder[1] = 1;
5121        iorder[2] = 2;
5122        iorder[3] = 3;
5123    }
5124    else if (byteorder == 1) {
5125        /* force BE */
5126        iorder[0] = 3;
5127        iorder[1] = 2;
5128        iorder[2] = 1;
5129        iorder[3] = 0;
5130    }
5131
5132    for (i = 0; i < len; i++)
5133        STORECHAR(PyUnicode_READ(kind, data, i));
5134
5135  done:
5136    return v;
5137#undef STORECHAR
5138}
5139
5140PyObject *
5141PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5142                      Py_ssize_t size,
5143                      const char *errors,
5144                      int byteorder)
5145{
5146    PyObject *result;
5147    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5148    if (tmp == NULL)
5149        return NULL;
5150    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5151    Py_DECREF(tmp);
5152    return result;
5153}
5154
5155PyObject *
5156PyUnicode_AsUTF32String(PyObject *unicode)
5157{
5158    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5159}
5160
5161/* --- UTF-16 Codec ------------------------------------------------------- */
5162
5163PyObject *
5164PyUnicode_DecodeUTF16(const char *s,
5165                      Py_ssize_t size,
5166                      const char *errors,
5167                      int *byteorder)
5168{
5169    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5170}
5171
5172PyObject *
5173PyUnicode_DecodeUTF16Stateful(const char *s,
5174                              Py_ssize_t size,
5175                              const char *errors,
5176                              int *byteorder,
5177                              Py_ssize_t *consumed)
5178{
5179    const char *starts = s;
5180    Py_ssize_t startinpos;
5181    Py_ssize_t endinpos;
5182    _PyUnicodeWriter writer;
5183    const unsigned char *q, *e;
5184    int bo = 0;       /* assume native ordering by default */
5185    int native_ordering;
5186    const char *errmsg = "";
5187    PyObject *errorHandler = NULL;
5188    PyObject *exc = NULL;
5189
5190    q = (unsigned char *)s;
5191    e = q + size;
5192
5193    if (byteorder)
5194        bo = *byteorder;
5195
5196    /* Check for BOM marks (U+FEFF) in the input and adjust current
5197       byte order setting accordingly. In native mode, the leading BOM
5198       mark is skipped, in all other modes, it is copied to the output
5199       stream as-is (giving a ZWNBSP character). */
5200    if (bo == 0 && size >= 2) {
5201        const Py_UCS4 bom = (q[1] << 8) | q[0];
5202        if (bom == 0xFEFF) {
5203            q += 2;
5204            bo = -1;
5205        }
5206        else if (bom == 0xFFFE) {
5207            q += 2;
5208            bo = 1;
5209        }
5210        if (byteorder)
5211            *byteorder = bo;
5212    }
5213
5214    if (q == e) {
5215        if (consumed)
5216            *consumed = size;
5217        _Py_RETURN_UNICODE_EMPTY();
5218    }
5219
5220#if PY_LITTLE_ENDIAN
5221    native_ordering = bo <= 0;
5222#else
5223    native_ordering = bo >= 0;
5224#endif
5225
5226    /* Note: size will always be longer than the resulting Unicode
5227       character count */
5228    _PyUnicodeWriter_Init(&writer);
5229    writer.min_length = (e - q + 1) / 2;
5230    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5231        goto onError;
5232
5233    while (1) {
5234        Py_UCS4 ch = 0;
5235        if (e - q >= 2) {
5236            int kind = writer.kind;
5237            if (kind == PyUnicode_1BYTE_KIND) {
5238                if (PyUnicode_IS_ASCII(writer.buffer))
5239                    ch = asciilib_utf16_decode(&q, e,
5240                            (Py_UCS1*)writer.data, &writer.pos,
5241                            native_ordering);
5242                else
5243                    ch = ucs1lib_utf16_decode(&q, e,
5244                            (Py_UCS1*)writer.data, &writer.pos,
5245                            native_ordering);
5246            } else if (kind == PyUnicode_2BYTE_KIND) {
5247                ch = ucs2lib_utf16_decode(&q, e,
5248                        (Py_UCS2*)writer.data, &writer.pos,
5249                        native_ordering);
5250            } else {
5251                assert(kind == PyUnicode_4BYTE_KIND);
5252                ch = ucs4lib_utf16_decode(&q, e,
5253                        (Py_UCS4*)writer.data, &writer.pos,
5254                        native_ordering);
5255            }
5256        }
5257
5258        switch (ch)
5259        {
5260        case 0:
5261            /* remaining byte at the end? (size should be even) */
5262            if (q == e || consumed)
5263                goto End;
5264            errmsg = "truncated data";
5265            startinpos = ((const char *)q) - starts;
5266            endinpos = ((const char *)e) - starts;
5267            break;
5268            /* The remaining input chars are ignored if the callback
5269               chooses to skip the input */
5270        case 1:
5271            q -= 2;
5272            if (consumed)
5273                goto End;
5274            errmsg = "unexpected end of data";
5275            startinpos = ((const char *)q) - starts;
5276            endinpos = ((const char *)e) - starts;
5277            break;
5278        case 2:
5279            errmsg = "illegal encoding";
5280            startinpos = ((const char *)q) - 2 - starts;
5281            endinpos = startinpos + 2;
5282            break;
5283        case 3:
5284            errmsg = "illegal UTF-16 surrogate";
5285            startinpos = ((const char *)q) - 4 - starts;
5286            endinpos = startinpos + 2;
5287            break;
5288        default:
5289            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5290                goto onError;
5291            continue;
5292        }
5293
5294        if (unicode_decode_call_errorhandler_writer(
5295                errors,
5296                &errorHandler,
5297                "utf16", errmsg,
5298                &starts,
5299                (const char **)&e,
5300                &startinpos,
5301                &endinpos,
5302                &exc,
5303                (const char **)&q,
5304                &writer))
5305            goto onError;
5306    }
5307
5308End:
5309    if (consumed)
5310        *consumed = (const char *)q-starts;
5311
5312    Py_XDECREF(errorHandler);
5313    Py_XDECREF(exc);
5314    return _PyUnicodeWriter_Finish(&writer);
5315
5316  onError:
5317    _PyUnicodeWriter_Dealloc(&writer);
5318    Py_XDECREF(errorHandler);
5319    Py_XDECREF(exc);
5320    return NULL;
5321}
5322
5323PyObject *
5324_PyUnicode_EncodeUTF16(PyObject *str,
5325                       const char *errors,
5326                       int byteorder)
5327{
5328    enum PyUnicode_Kind kind;
5329    const void *data;
5330    Py_ssize_t len;
5331    PyObject *v;
5332    unsigned short *out;
5333    Py_ssize_t bytesize;
5334    Py_ssize_t pairs;
5335#if PY_BIG_ENDIAN
5336    int native_ordering = byteorder >= 0;
5337#else
5338    int native_ordering = byteorder <= 0;
5339#endif
5340
5341    if (!PyUnicode_Check(str)) {
5342        PyErr_BadArgument();
5343        return NULL;
5344    }
5345    if (PyUnicode_READY(str) == -1)
5346        return NULL;
5347    kind = PyUnicode_KIND(str);
5348    data = PyUnicode_DATA(str);
5349    len = PyUnicode_GET_LENGTH(str);
5350
5351    pairs = 0;
5352    if (kind == PyUnicode_4BYTE_KIND) {
5353        const Py_UCS4 *in = (const Py_UCS4 *)data;
5354        const Py_UCS4 *end = in + len;
5355        while (in < end)
5356            if (*in++ >= 0x10000)
5357                pairs++;
5358    }
5359    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5360        return PyErr_NoMemory();
5361    bytesize = (len + pairs + (byteorder == 0)) * 2;
5362    v = PyBytes_FromStringAndSize(NULL, bytesize);
5363    if (v == NULL)
5364        return NULL;
5365
5366    /* output buffer is 2-bytes aligned */
5367    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5368    out = (unsigned short *)PyBytes_AS_STRING(v);
5369    if (byteorder == 0)
5370        *out++ = 0xFEFF;
5371    if (len == 0)
5372        goto done;
5373
5374    switch (kind) {
5375    case PyUnicode_1BYTE_KIND: {
5376        ucs1lib_utf16_encode(out, (const Py_UCS1 *)data, len, native_ordering);
5377        break;
5378    }
5379    case PyUnicode_2BYTE_KIND: {
5380        ucs2lib_utf16_encode(out, (const Py_UCS2 *)data, len, native_ordering);
5381        break;
5382    }
5383    case PyUnicode_4BYTE_KIND: {
5384        ucs4lib_utf16_encode(out, (const Py_UCS4 *)data, len, native_ordering);
5385        break;
5386    }
5387    default:
5388        assert(0);
5389    }
5390
5391  done:
5392    return v;
5393}
5394
5395PyObject *
5396PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5397                      Py_ssize_t size,
5398                      const char *errors,
5399                      int byteorder)
5400{
5401    PyObject *result;
5402    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5403    if (tmp == NULL)
5404        return NULL;
5405    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5406    Py_DECREF(tmp);
5407    return result;
5408}
5409
5410PyObject *
5411PyUnicode_AsUTF16String(PyObject *unicode)
5412{
5413    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5414}
5415
5416/* --- Unicode Escape Codec ----------------------------------------------- */
5417
/* Helper function for PyUnicode_DecodeUnicodeEscape: pre-scan the input
   to find out whether decoding it yields a pure ASCII string.

   Returns the number of characters the decoded string will have, or -1
   if that cannot be guaranteed to be ASCII: a non-ASCII byte, a numeric,
   \x, \u, \U or \N escape, a backslash at the very end of the input, or
   a negative size.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *bytes = (const unsigned char *)s;
    Py_ssize_t i = 0;
    Py_ssize_t count = 0;

    if (size < 0)
        return -1;

    while (i < size) {
        unsigned char c = bytes[i++];

        if (c > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (c != '\\') {
            /* Normal character */
            count++;
            continue;
        }

        /* Backslash-escape: the escape must not run off the end of the
           input, and the follow-up byte must be ASCII as well. */
        if (i >= size)
            return -1;
        c = bytes[i++];
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'a': case 'b': case 'f': case 'n':
        case 'r': case 't': case 'v':
            /* two input bytes collapse into one character */
            count += 1;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* unknown escape: count the backslash + the other character */
            count += 2;
            break;
        }
    }
    return count;
}
5472
5473static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5474
5475PyObject *
5476PyUnicode_DecodeUnicodeEscape(const char *s,
5477                              Py_ssize_t size,
5478                              const char *errors)
5479{
5480    const char *starts = s;
5481    Py_ssize_t startinpos;
5482    Py_ssize_t endinpos;
5483    _PyUnicodeWriter writer;
5484    const char *end;
5485    char* message;
5486    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5487    PyObject *errorHandler = NULL;
5488    PyObject *exc = NULL;
5489    Py_ssize_t len;
5490
5491    len = length_of_escaped_ascii_string(s, size);
5492    if (len == 0)
5493        _Py_RETURN_UNICODE_EMPTY();
5494
5495    /* After length_of_escaped_ascii_string() there are two alternatives,
5496       either the string is pure ASCII with named escapes like \n, etc.
5497       and we determined it's exact size (common case)
5498       or it contains \x, \u, ... escape sequences.  then we create a
5499       legacy wchar string and resize it at the end of this function. */
5500    _PyUnicodeWriter_Init(&writer);
5501    if (len > 0) {
5502        writer.min_length = len;
5503    }
5504    else {
5505        /* Escaped strings will always be longer than the resulting
5506           Unicode string, so we start with size here and then reduce the
5507           length after conversion to the true value.
5508           (but if the error callback returns a long replacement string
5509           we'll have to allocate more space) */
5510        writer.min_length = size;
5511    }
5512
5513    if (size == 0)
5514        return _PyUnicodeWriter_Finish(&writer);
5515    end = s + size;
5516
5517    while (s < end) {
5518        unsigned char c;
5519        Py_UCS4 x;
5520        int digits;
5521
5522        /* Non-escape characters are interpreted as Unicode ordinals */
5523        if (*s != '\\') {
5524            x = (unsigned char)*s;
5525            s++;
5526            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5527                goto onError;
5528            continue;
5529        }
5530
5531        startinpos = s-starts;
5532        /* \ - Escapes */
5533        s++;
5534        c = *s++;
5535        if (s > end)
5536            c = '\0'; /* Invalid after \ */
5537
5538        switch (c) {
5539
5540            /* \x escapes */
5541#define WRITECHAR(ch)                                                      \
5542            do {                                                           \
5543                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5544                    goto onError;                                          \
5545            } while(0)
5546
5547        case '\n': break;
5548        case '\\': WRITECHAR('\\'); break;
5549        case '\'': WRITECHAR('\''); break;
5550        case '\"': WRITECHAR('\"'); break;
5551        case 'b': WRITECHAR('\b'); break;
5552        /* FF */
5553        case 'f': WRITECHAR('\014'); break;
5554        case 't': WRITECHAR('\t'); break;
5555        case 'n': WRITECHAR('\n'); break;
5556        case 'r': WRITECHAR('\r'); break;
5557        /* VT */
5558        case 'v': WRITECHAR('\013'); break;
5559        /* BEL, not classic C */
5560        case 'a': WRITECHAR('\007'); break;
5561
5562            /* \OOO (octal) escapes */
5563        case '0': case '1': case '2': case '3':
5564        case '4': case '5': case '6': case '7':
5565            x = s[-1] - '0';
5566            if (s < end && '0' <= *s && *s <= '7') {
5567                x = (x<<3) + *s++ - '0';
5568                if (s < end && '0' <= *s && *s <= '7')
5569                    x = (x<<3) + *s++ - '0';
5570            }
5571            WRITECHAR(x);
5572            break;
5573
5574            /* hex escapes */
5575            /* \xXX */
5576        case 'x':
5577            digits = 2;
5578            message = "truncated \\xXX escape";
5579            goto hexescape;
5580
5581            /* \uXXXX */
5582        case 'u':
5583            digits = 4;
5584            message = "truncated \\uXXXX escape";
5585            goto hexescape;
5586
5587            /* \UXXXXXXXX */
5588        case 'U':
5589            digits = 8;
5590            message = "truncated \\UXXXXXXXX escape";
5591        hexescape:
5592            chr = 0;
5593            if (end - s < digits) {
5594                /* count only hex digits */
5595                for (; s < end; ++s) {
5596                    c = (unsigned char)*s;
5597                    if (!Py_ISXDIGIT(c))
5598                        goto error;
5599                }
5600                goto error;
5601            }
5602            for (; digits--; ++s) {
5603                c = (unsigned char)*s;
5604                if (!Py_ISXDIGIT(c))
5605                    goto error;
5606                chr = (chr<<4) & ~0xF;
5607                if (c >= '0' && c <= '9')
5608                    chr += c - '0';
5609                else if (c >= 'a' && c <= 'f')
5610                    chr += 10 + c - 'a';
5611                else
5612                    chr += 10 + c - 'A';
5613            }
5614            if (chr == 0xffffffff && PyErr_Occurred())
5615                /* _decoding_error will have already written into the
5616                   target buffer. */
5617                break;
5618        store:
5619            /* when we get here, chr is a 32-bit unicode character */
5620            message = "illegal Unicode character";
5621            if (chr > MAX_UNICODE)
5622                goto error;
5623            WRITECHAR(chr);
5624            break;
5625
5626            /* \N{name} */
5627        case 'N':
5628            message = "malformed \\N character escape";
5629            if (ucnhash_CAPI == NULL) {
5630                /* load the unicode data module */
5631                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5632                                                PyUnicodeData_CAPSULE_NAME, 1);
5633                if (ucnhash_CAPI == NULL)
5634                    goto ucnhashError;
5635            }
5636            if (*s == '{') {
5637                const char *start = s+1;
5638                /* look for the closing brace */
5639                while (*s != '}' && s < end)
5640                    s++;
5641                if (s > start && s < end && *s == '}') {
5642                    /* found a name.  look it up in the unicode database */
5643                    message = "unknown Unicode character name";
5644                    s++;
5645                    if (s - start - 1 <= INT_MAX &&
5646                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5647                                              &chr, 0))
5648                        goto store;
5649                }
5650            }
5651            goto error;
5652
5653        default:
5654            if (s > end) {
5655                message = "\\ at end of string";
5656                s--;
5657                goto error;
5658            }
5659            else {
5660                WRITECHAR('\\');
5661                WRITECHAR((unsigned char)s[-1]);
5662            }
5663            break;
5664        }
5665        continue;
5666
5667      error:
5668        endinpos = s-starts;
5669        if (unicode_decode_call_errorhandler_writer(
5670                errors, &errorHandler,
5671                "unicodeescape", message,
5672                &starts, &end, &startinpos, &endinpos, &exc, &s,
5673                &writer))
5674            goto onError;
5675        continue;
5676    }
5677#undef WRITECHAR
5678
5679    Py_XDECREF(errorHandler);
5680    Py_XDECREF(exc);
5681    return _PyUnicodeWriter_Finish(&writer);
5682
5683  ucnhashError:
5684    PyErr_SetString(
5685        PyExc_UnicodeError,
5686        "\\N escapes not supported (can't load unicodedata module)"
5687        );
5688    _PyUnicodeWriter_Dealloc(&writer);
5689    Py_XDECREF(errorHandler);
5690    Py_XDECREF(exc);
5691    return NULL;
5692
5693  onError:
5694    _PyUnicodeWriter_Dealloc(&writer);
5695    Py_XDECREF(errorHandler);
5696    Py_XDECREF(exc);
5697    return NULL;
5698}
5699
5700/* Return a Unicode-Escape string version of the Unicode object.
5701
5702   If quotes is true, the string is enclosed in u"" or u'' quotes as
5703   appropriate.
5704
5705*/
5706
5707PyObject *
5708PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5709{
5710    Py_ssize_t i, len;
5711    PyObject *repr;
5712    char *p;
5713    int kind;
5714    void *data;
5715    Py_ssize_t expandsize = 0;
5716
5717    /* Initial allocation is based on the longest-possible character
5718       escape.
5719
5720       For UCS1 strings it's '\xxx', 4 bytes per source character.
5721       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5722       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5723    */
5724
5725    if (!PyUnicode_Check(unicode)) {
5726        PyErr_BadArgument();
5727        return NULL;
5728    }
5729    if (PyUnicode_READY(unicode) == -1)
5730        return NULL;
5731    len = PyUnicode_GET_LENGTH(unicode);
5732    kind = PyUnicode_KIND(unicode);
5733    data = PyUnicode_DATA(unicode);
5734    switch (kind) {
5735    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5736    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5737    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5738    }
5739
5740    if (len == 0)
5741        return PyBytes_FromStringAndSize(NULL, 0);
5742
5743    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5744        return PyErr_NoMemory();
5745
5746    repr = PyBytes_FromStringAndSize(NULL,
5747                                     2
5748                                     + expandsize*len
5749                                     + 1);
5750    if (repr == NULL)
5751        return NULL;
5752
5753    p = PyBytes_AS_STRING(repr);
5754
5755    for (i = 0; i < len; i++) {
5756        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5757
5758        /* Escape backslashes */
5759        if (ch == '\\') {
5760            *p++ = '\\';
5761            *p++ = (char) ch;
5762            continue;
5763        }
5764
5765        /* Map 21-bit characters to '\U00xxxxxx' */
5766        else if (ch >= 0x10000) {
5767            assert(ch <= MAX_UNICODE);
5768            *p++ = '\\';
5769            *p++ = 'U';
5770            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5771            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5772            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5773            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5774            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5775            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5776            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5777            *p++ = Py_hexdigits[ch & 0x0000000F];
5778            continue;
5779        }
5780
5781        /* Map 16-bit characters to '\uxxxx' */
5782        if (ch >= 256) {
5783            *p++ = '\\';
5784            *p++ = 'u';
5785            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
5786            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
5787            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5788            *p++ = Py_hexdigits[ch & 0x000F];
5789        }
5790
5791        /* Map special whitespace to '\t', \n', '\r' */
5792        else if (ch == '\t') {
5793            *p++ = '\\';
5794            *p++ = 't';
5795        }
5796        else if (ch == '\n') {
5797            *p++ = '\\';
5798            *p++ = 'n';
5799        }
5800        else if (ch == '\r') {
5801            *p++ = '\\';
5802            *p++ = 'r';
5803        }
5804
5805        /* Map non-printable US ASCII to '\xhh' */
5806        else if (ch < ' ' || ch >= 0x7F) {
5807            *p++ = '\\';
5808            *p++ = 'x';
5809            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
5810            *p++ = Py_hexdigits[ch & 0x000F];
5811        }
5812
5813        /* Copy everything else as-is */
5814        else
5815            *p++ = (char) ch;
5816    }
5817
5818    assert(p - PyBytes_AS_STRING(repr) > 0);
5819    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
5820        return NULL;
5821    return repr;
5822}
5823
5824PyObject *
5825PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
5826                              Py_ssize_t size)
5827{
5828    PyObject *result;
5829    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5830    if (tmp == NULL)
5831        return NULL;
5832    result = PyUnicode_AsUnicodeEscapeString(tmp);
5833    Py_DECREF(tmp);
5834    return result;
5835}
5836
5837/* --- Raw Unicode Escape Codec ------------------------------------------- */
5838
5839PyObject *
5840PyUnicode_DecodeRawUnicodeEscape(const char *s,
5841                                 Py_ssize_t size,
5842                                 const char *errors)
5843{
5844    const char *starts = s;
5845    Py_ssize_t startinpos;
5846    Py_ssize_t endinpos;
5847    _PyUnicodeWriter writer;
5848    const char *end;
5849    const char *bs;
5850    PyObject *errorHandler = NULL;
5851    PyObject *exc = NULL;
5852
5853    if (size == 0)
5854        _Py_RETURN_UNICODE_EMPTY();
5855
5856    /* Escaped strings will always be longer than the resulting
5857       Unicode string, so we start with size here and then reduce the
5858       length after conversion to the true value. (But decoding error
5859       handler might have to resize the string) */
5860    _PyUnicodeWriter_Init(&writer);
5861    writer.min_length = size;
5862
5863    end = s + size;
5864    while (s < end) {
5865        unsigned char c;
5866        Py_UCS4 x;
5867        int i;
5868        int count;
5869
5870        /* Non-escape characters are interpreted as Unicode ordinals */
5871        if (*s != '\\') {
5872            x = (unsigned char)*s++;
5873            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5874                goto onError;
5875            continue;
5876        }
5877        startinpos = s-starts;
5878
5879        /* \u-escapes are only interpreted iff the number of leading
5880           backslashes if odd */
5881        bs = s;
5882        for (;s < end;) {
5883            if (*s != '\\')
5884                break;
5885            x = (unsigned char)*s++;
5886            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5887                goto onError;
5888        }
5889        if (((s - bs) & 1) == 0 ||
5890            s >= end ||
5891            (*s != 'u' && *s != 'U')) {
5892            continue;
5893        }
5894        writer.pos--;
5895        count = *s=='u' ? 4 : 8;
5896        s++;
5897
5898        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
5899        for (x = 0, i = 0; i < count; ++i, ++s) {
5900            c = (unsigned char)*s;
5901            if (!Py_ISXDIGIT(c)) {
5902                endinpos = s-starts;
5903                if (unicode_decode_call_errorhandler_writer(
5904                        errors, &errorHandler,
5905                        "rawunicodeescape", "truncated \\uXXXX",
5906                        &starts, &end, &startinpos, &endinpos, &exc, &s,
5907                        &writer))
5908                    goto onError;
5909                goto nextByte;
5910            }
5911            x = (x<<4) & ~0xF;
5912            if (c >= '0' && c <= '9')
5913                x += c - '0';
5914            else if (c >= 'a' && c <= 'f')
5915                x += 10 + c - 'a';
5916            else
5917                x += 10 + c - 'A';
5918        }
5919        if (x <= MAX_UNICODE) {
5920            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5921                goto onError;
5922        }
5923        else {
5924            endinpos = s-starts;
5925            if (unicode_decode_call_errorhandler_writer(
5926                    errors, &errorHandler,
5927                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
5928                    &starts, &end, &startinpos, &endinpos, &exc, &s,
5929                    &writer))
5930                goto onError;
5931        }
5932      nextByte:
5933        ;
5934    }
5935    Py_XDECREF(errorHandler);
5936    Py_XDECREF(exc);
5937    return _PyUnicodeWriter_Finish(&writer);
5938
5939  onError:
5940    _PyUnicodeWriter_Dealloc(&writer);
5941    Py_XDECREF(errorHandler);
5942    Py_XDECREF(exc);
5943    return NULL;
5944}
5945
5946
5947PyObject *
5948PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
5949{
5950    PyObject *repr;
5951    char *p;
5952    char *q;
5953    Py_ssize_t expandsize, pos;
5954    int kind;
5955    void *data;
5956    Py_ssize_t len;
5957
5958    if (!PyUnicode_Check(unicode)) {
5959        PyErr_BadArgument();
5960        return NULL;
5961    }
5962    if (PyUnicode_READY(unicode) == -1)
5963        return NULL;
5964    kind = PyUnicode_KIND(unicode);
5965    data = PyUnicode_DATA(unicode);
5966    len = PyUnicode_GET_LENGTH(unicode);
5967    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
5968       bytes, and 1 byte characters 4. */
5969    expandsize = kind * 2 + 2;
5970
5971    if (len > PY_SSIZE_T_MAX / expandsize)
5972        return PyErr_NoMemory();
5973
5974    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
5975    if (repr == NULL)
5976        return NULL;
5977    if (len == 0)
5978        return repr;
5979
5980    p = q = PyBytes_AS_STRING(repr);
5981    for (pos = 0; pos < len; pos++) {
5982        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
5983        /* Map 32-bit characters to '\Uxxxxxxxx' */
5984        if (ch >= 0x10000) {
5985            assert(ch <= MAX_UNICODE);
5986            *p++ = '\\';
5987            *p++ = 'U';
5988            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
5989            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
5990            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
5991            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
5992            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
5993            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
5994            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
5995            *p++ = Py_hexdigits[ch & 15];
5996        }
5997        /* Map 16-bit characters to '\uxxxx' */
5998        else if (ch >= 256) {
5999            *p++ = '\\';
6000            *p++ = 'u';
6001            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6002            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6003            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6004            *p++ = Py_hexdigits[ch & 15];
6005        }
6006        /* Copy everything else as-is */
6007        else
6008            *p++ = (char) ch;
6009    }
6010
6011    assert(p > q);
6012    if (_PyBytes_Resize(&repr, p - q) < 0)
6013        return NULL;
6014    return repr;
6015}
6016
6017PyObject *
6018PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6019                                 Py_ssize_t size)
6020{
6021    PyObject *result;
6022    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6023    if (tmp == NULL)
6024        return NULL;
6025    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6026    Py_DECREF(tmp);
6027    return result;
6028}
6029
6030/* --- Unicode Internal Codec ------------------------------------------- */
6031
6032PyObject *
6033_PyUnicode_DecodeUnicodeInternal(const char *s,
6034                                 Py_ssize_t size,
6035                                 const char *errors)
6036{
6037    const char *starts = s;
6038    Py_ssize_t startinpos;
6039    Py_ssize_t endinpos;
6040    _PyUnicodeWriter writer;
6041    const char *end;
6042    const char *reason;
6043    PyObject *errorHandler = NULL;
6044    PyObject *exc = NULL;
6045
6046    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6047                     "unicode_internal codec has been deprecated",
6048                     1))
6049        return NULL;
6050
6051    if (size == 0)
6052        _Py_RETURN_UNICODE_EMPTY();
6053
6054    _PyUnicodeWriter_Init(&writer);
6055    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6056        PyErr_NoMemory();
6057        goto onError;
6058    }
6059    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6060
6061    end = s + size;
6062    while (s < end) {
6063        Py_UNICODE uch;
6064        Py_UCS4 ch;
6065        if (end - s < Py_UNICODE_SIZE) {
6066            endinpos = end-starts;
6067            reason = "truncated input";
6068            goto error;
6069        }
6070        /* We copy the raw representation one byte at a time because the
6071           pointer may be unaligned (see test_codeccallbacks). */
6072        ((char *) &uch)[0] = s[0];
6073        ((char *) &uch)[1] = s[1];
6074#ifdef Py_UNICODE_WIDE
6075        ((char *) &uch)[2] = s[2];
6076        ((char *) &uch)[3] = s[3];
6077#endif
6078        ch = uch;
6079#ifdef Py_UNICODE_WIDE
6080        /* We have to sanity check the raw data, otherwise doom looms for
6081           some malformed UCS-4 data. */
6082        if (ch > 0x10ffff) {
6083            endinpos = s - starts + Py_UNICODE_SIZE;
6084            reason = "illegal code point (> 0x10FFFF)";
6085            goto error;
6086        }
6087#endif
6088        s += Py_UNICODE_SIZE;
6089#ifndef Py_UNICODE_WIDE
6090        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6091        {
6092            Py_UNICODE uch2;
6093            ((char *) &uch2)[0] = s[0];
6094            ((char *) &uch2)[1] = s[1];
6095            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6096            {
6097                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6098                s += Py_UNICODE_SIZE;
6099            }
6100        }
6101#endif
6102
6103        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6104            goto onError;
6105        continue;
6106
6107  error:
6108        startinpos = s - starts;
6109        if (unicode_decode_call_errorhandler_writer(
6110                errors, &errorHandler,
6111                "unicode_internal", reason,
6112                &starts, &end, &startinpos, &endinpos, &exc, &s,
6113                &writer))
6114            goto onError;
6115    }
6116
6117    Py_XDECREF(errorHandler);
6118    Py_XDECREF(exc);
6119    return _PyUnicodeWriter_Finish(&writer);
6120
6121  onError:
6122    _PyUnicodeWriter_Dealloc(&writer);
6123    Py_XDECREF(errorHandler);
6124    Py_XDECREF(exc);
6125    return NULL;
6126}
6127
6128/* --- Latin-1 Codec ------------------------------------------------------ */
6129
6130PyObject *
6131PyUnicode_DecodeLatin1(const char *s,
6132                       Py_ssize_t size,
6133                       const char *errors)
6134{
6135    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6136    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6137}
6138
6139/* create or adjust a UnicodeEncodeError */
6140static void
6141make_encode_exception(PyObject **exceptionObject,
6142                      const char *encoding,
6143                      PyObject *unicode,
6144                      Py_ssize_t startpos, Py_ssize_t endpos,
6145                      const char *reason)
6146{
6147    if (*exceptionObject == NULL) {
6148        *exceptionObject = PyObject_CallFunction(
6149            PyExc_UnicodeEncodeError, "sOnns",
6150            encoding, unicode, startpos, endpos, reason);
6151    }
6152    else {
6153        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6154            goto onError;
6155        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6156            goto onError;
6157        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6158            goto onError;
6159        return;
6160      onError:
6161        Py_DECREF(*exceptionObject);
6162        *exceptionObject = NULL;
6163    }
6164}
6165
6166/* raises a UnicodeEncodeError */
6167static void
6168raise_encode_exception(PyObject **exceptionObject,
6169                       const char *encoding,
6170                       PyObject *unicode,
6171                       Py_ssize_t startpos, Py_ssize_t endpos,
6172                       const char *reason)
6173{
6174    make_encode_exception(exceptionObject,
6175                          encoding, unicode, startpos, endpos, reason);
6176    if (*exceptionObject != NULL)
6177        PyCodec_StrictErrors(*exceptionObject);
6178}
6179
6180/* error handling callback helper:
6181   build arguments, call the callback and check the arguments,
6182   put the result into newpos and return the replacement string, which
6183   has to be freed by the caller */
6184static PyObject *
6185unicode_encode_call_errorhandler(const char *errors,
6186                                 PyObject **errorHandler,
6187                                 const char *encoding, const char *reason,
6188                                 PyObject *unicode, PyObject **exceptionObject,
6189                                 Py_ssize_t startpos, Py_ssize_t endpos,
6190                                 Py_ssize_t *newpos)
6191{
6192    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6193    Py_ssize_t len;
6194    PyObject *restuple;
6195    PyObject *resunicode;
6196
6197    if (*errorHandler == NULL) {
6198        *errorHandler = PyCodec_LookupError(errors);
6199        if (*errorHandler == NULL)
6200            return NULL;
6201    }
6202
6203    if (PyUnicode_READY(unicode) == -1)
6204        return NULL;
6205    len = PyUnicode_GET_LENGTH(unicode);
6206
6207    make_encode_exception(exceptionObject,
6208                          encoding, unicode, startpos, endpos, reason);
6209    if (*exceptionObject == NULL)
6210        return NULL;
6211
6212    restuple = PyObject_CallFunctionObjArgs(
6213        *errorHandler, *exceptionObject, NULL);
6214    if (restuple == NULL)
6215        return NULL;
6216    if (!PyTuple_Check(restuple)) {
6217        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6218        Py_DECREF(restuple);
6219        return NULL;
6220    }
6221    if (!PyArg_ParseTuple(restuple, argparse,
6222                          &resunicode, newpos)) {
6223        Py_DECREF(restuple);
6224        return NULL;
6225    }
6226    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6227        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6228        Py_DECREF(restuple);
6229        return NULL;
6230    }
6231    if (*newpos<0)
6232        *newpos = len + *newpos;
6233    if (*newpos<0 || *newpos>len) {
6234        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6235        Py_DECREF(restuple);
6236        return NULL;
6237    }
6238    Py_INCREF(resunicode);
6239    Py_DECREF(restuple);
6240    return resunicode;
6241}
6242
6243static PyObject *
6244unicode_encode_ucs1(PyObject *unicode,
6245                    const char *errors,
6246                    unsigned int limit)
6247{
6248    /* input state */
6249    Py_ssize_t pos=0, size;
6250    int kind;
6251    void *data;
6252    /* output object */
6253    PyObject *res;
6254    /* pointer into the output */
6255    char *str;
6256    /* current output position */
6257    Py_ssize_t ressize;
6258    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6259    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6260    PyObject *errorHandler = NULL;
6261    PyObject *exc = NULL;
6262    /* the following variable is used for caching string comparisons
6263     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6264    int known_errorHandler = -1;
6265
6266    if (PyUnicode_READY(unicode) == -1)
6267        return NULL;
6268    size = PyUnicode_GET_LENGTH(unicode);
6269    kind = PyUnicode_KIND(unicode);
6270    data = PyUnicode_DATA(unicode);
6271    /* allocate enough for a simple encoding without
6272       replacements, if we need more, we'll resize */
6273    if (size == 0)
6274        return PyBytes_FromStringAndSize(NULL, 0);
6275    res = PyBytes_FromStringAndSize(NULL, size);
6276    if (res == NULL)
6277        return NULL;
6278    str = PyBytes_AS_STRING(res);
6279    ressize = size;
6280
6281    while (pos < size) {
6282        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6283
6284        /* can we encode this? */
6285        if (c<limit) {
6286            /* no overflow check, because we know that the space is enough */
6287            *str++ = (char)c;
6288            ++pos;
6289        }
6290        else {
6291            Py_ssize_t requiredsize;
6292            PyObject *repunicode;
6293            Py_ssize_t repsize, newpos, respos, i;
6294            /* startpos for collecting unencodable chars */
6295            Py_ssize_t collstart = pos;
6296            Py_ssize_t collend = pos;
6297            /* find all unecodable characters */
6298            while ((collend < size) && (PyUnicode_READ(kind, data, collend)>=limit))
6299                ++collend;
6300            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6301            if (known_errorHandler==-1) {
6302                if ((errors==NULL) || (!strcmp(errors, "strict")))
6303                    known_errorHandler = 1;
6304                else if (!strcmp(errors, "replace"))
6305                    known_errorHandler = 2;
6306                else if (!strcmp(errors, "ignore"))
6307                    known_errorHandler = 3;
6308                else if (!strcmp(errors, "xmlcharrefreplace"))
6309                    known_errorHandler = 4;
6310                else
6311                    known_errorHandler = 0;
6312            }
6313            switch (known_errorHandler) {
6314            case 1: /* strict */
6315                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6316                goto onError;
6317            case 2: /* replace */
6318                while (collstart++<collend)
6319                    *str++ = '?'; /* fall through */
6320            case 3: /* ignore */
6321                pos = collend;
6322                break;
6323            case 4: /* xmlcharrefreplace */
6324                respos = str - PyBytes_AS_STRING(res);
6325                /* determine replacement size */
6326                for (i = collstart, repsize = 0; i < collend; ++i) {
6327                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6328                    if (ch < 10)
6329                        repsize += 2+1+1;
6330                    else if (ch < 100)
6331                        repsize += 2+2+1;
6332                    else if (ch < 1000)
6333                        repsize += 2+3+1;
6334                    else if (ch < 10000)
6335                        repsize += 2+4+1;
6336                    else if (ch < 100000)
6337                        repsize += 2+5+1;
6338                    else if (ch < 1000000)
6339                        repsize += 2+6+1;
6340                    else {
6341                        assert(ch <= MAX_UNICODE);
6342                        repsize += 2+7+1;
6343                    }
6344                }
6345                requiredsize = respos+repsize+(size-collend);
6346                if (requiredsize > ressize) {
6347                    if (requiredsize<2*ressize)
6348                        requiredsize = 2*ressize;
6349                    if (_PyBytes_Resize(&res, requiredsize))
6350                        goto onError;
6351                    str = PyBytes_AS_STRING(res) + respos;
6352                    ressize = requiredsize;
6353                }
6354                /* generate replacement */
6355                for (i = collstart; i < collend; ++i) {
6356                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6357                }
6358                pos = collend;
6359                break;
6360            default:
6361                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6362                                                              encoding, reason, unicode, &exc,
6363                                                              collstart, collend, &newpos);
6364                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6365                                           PyUnicode_READY(repunicode) == -1))
6366                    goto onError;
6367                if (PyBytes_Check(repunicode)) {
6368                    /* Directly copy bytes result to output. */
6369                    repsize = PyBytes_Size(repunicode);
6370                    if (repsize > 1) {
6371                        /* Make room for all additional bytes. */
6372                        respos = str - PyBytes_AS_STRING(res);
6373                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6374                            Py_DECREF(repunicode);
6375                            goto onError;
6376                        }
6377                        str = PyBytes_AS_STRING(res) + respos;
6378                        ressize += repsize-1;
6379                    }
6380                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6381                    str += repsize;
6382                    pos = newpos;
6383                    Py_DECREF(repunicode);
6384                    break;
6385                }
6386                /* need more space? (at least enough for what we
6387                   have+the replacement+the rest of the string, so
6388                   we won't have to check space for encodable characters) */
6389                respos = str - PyBytes_AS_STRING(res);
6390                repsize = PyUnicode_GET_LENGTH(repunicode);
6391                requiredsize = respos+repsize+(size-collend);
6392                if (requiredsize > ressize) {
6393                    if (requiredsize<2*ressize)
6394                        requiredsize = 2*ressize;
6395                    if (_PyBytes_Resize(&res, requiredsize)) {
6396                        Py_DECREF(repunicode);
6397                        goto onError;
6398                    }
6399                    str = PyBytes_AS_STRING(res) + respos;
6400                    ressize = requiredsize;
6401                }
6402                /* check if there is anything unencodable in the replacement
6403                   and copy it to the output */
6404                for (i = 0; repsize-->0; ++i, ++str) {
6405                    c = PyUnicode_READ_CHAR(repunicode, i);
6406                    if (c >= limit) {
6407                        raise_encode_exception(&exc, encoding, unicode,
6408                                               pos, pos+1, reason);
6409                        Py_DECREF(repunicode);
6410                        goto onError;
6411                    }
6412                    *str = (char)c;
6413                }
6414                pos = newpos;
6415                Py_DECREF(repunicode);
6416            }
6417        }
6418    }
6419    /* Resize if we allocated to much */
6420    size = str - PyBytes_AS_STRING(res);
6421    if (size < ressize) { /* If this falls res will be NULL */
6422        assert(size >= 0);
6423        if (_PyBytes_Resize(&res, size) < 0)
6424            goto onError;
6425    }
6426
6427    Py_XDECREF(errorHandler);
6428    Py_XDECREF(exc);
6429    return res;
6430
6431  onError:
6432    Py_XDECREF(res);
6433    Py_XDECREF(errorHandler);
6434    Py_XDECREF(exc);
6435    return NULL;
6436}
6437
6438/* Deprecated */
6439PyObject *
6440PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6441                       Py_ssize_t size,
6442                       const char *errors)
6443{
6444    PyObject *result;
6445    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6446    if (unicode == NULL)
6447        return NULL;
6448    result = unicode_encode_ucs1(unicode, errors, 256);
6449    Py_DECREF(unicode);
6450    return result;
6451}
6452
6453PyObject *
6454_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6455{
6456    if (!PyUnicode_Check(unicode)) {
6457        PyErr_BadArgument();
6458        return NULL;
6459    }
6460    if (PyUnicode_READY(unicode) == -1)
6461        return NULL;
6462    /* Fast path: if it is a one-byte string, construct
6463       bytes object directly. */
6464    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6465        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6466                                         PyUnicode_GET_LENGTH(unicode));
6467    /* Non-Latin-1 characters present. Defer to above function to
6468       raise the exception. */
6469    return unicode_encode_ucs1(unicode, errors, 256);
6470}
6471
6472PyObject*
6473PyUnicode_AsLatin1String(PyObject *unicode)
6474{
6475    return _PyUnicode_AsLatin1String(unicode, NULL);
6476}
6477
6478/* --- 7-bit ASCII Codec -------------------------------------------------- */
6479
6480PyObject *
6481PyUnicode_DecodeASCII(const char *s,
6482                      Py_ssize_t size,
6483                      const char *errors)
6484{
6485    const char *starts = s;
6486    _PyUnicodeWriter writer;
6487    int kind;
6488    void *data;
6489    Py_ssize_t startinpos;
6490    Py_ssize_t endinpos;
6491    Py_ssize_t outpos;
6492    const char *e;
6493    PyObject *errorHandler = NULL;
6494    PyObject *exc = NULL;
6495
6496    if (size == 0)
6497        _Py_RETURN_UNICODE_EMPTY();
6498
6499    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6500    if (size == 1 && (unsigned char)s[0] < 128)
6501        return get_latin1_char((unsigned char)s[0]);
6502
6503    _PyUnicodeWriter_Init(&writer);
6504    writer.min_length = size;
6505    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6506        return NULL;
6507
6508    e = s + size;
6509    data = writer.data;
6510    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6511    writer.pos = outpos;
6512    if (writer.pos == size)
6513        return _PyUnicodeWriter_Finish(&writer);
6514
6515    s += writer.pos;
6516    kind = writer.kind;
6517    while (s < e) {
6518        register unsigned char c = (unsigned char)*s;
6519        if (c < 128) {
6520            PyUnicode_WRITE(kind, data, writer.pos, c);
6521            writer.pos++;
6522            ++s;
6523        }
6524        else {
6525            startinpos = s-starts;
6526            endinpos = startinpos + 1;
6527            if (unicode_decode_call_errorhandler_writer(
6528                    errors, &errorHandler,
6529                    "ascii", "ordinal not in range(128)",
6530                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6531                    &writer))
6532                goto onError;
6533            kind = writer.kind;
6534            data = writer.data;
6535        }
6536    }
6537    Py_XDECREF(errorHandler);
6538    Py_XDECREF(exc);
6539    return _PyUnicodeWriter_Finish(&writer);
6540
6541  onError:
6542    _PyUnicodeWriter_Dealloc(&writer);
6543    Py_XDECREF(errorHandler);
6544    Py_XDECREF(exc);
6545    return NULL;
6546}
6547
6548/* Deprecated */
6549PyObject *
6550PyUnicode_EncodeASCII(const Py_UNICODE *p,
6551                      Py_ssize_t size,
6552                      const char *errors)
6553{
6554    PyObject *result;
6555    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6556    if (unicode == NULL)
6557        return NULL;
6558    result = unicode_encode_ucs1(unicode, errors, 128);
6559    Py_DECREF(unicode);
6560    return result;
6561}
6562
6563PyObject *
6564_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6565{
6566    if (!PyUnicode_Check(unicode)) {
6567        PyErr_BadArgument();
6568        return NULL;
6569    }
6570    if (PyUnicode_READY(unicode) == -1)
6571        return NULL;
6572    /* Fast path: if it is an ASCII-only string, construct bytes object
6573       directly. Else defer to above function to raise the exception. */
6574    if (PyUnicode_IS_ASCII(unicode))
6575        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6576                                         PyUnicode_GET_LENGTH(unicode));
6577    return unicode_encode_ucs1(unicode, errors, 128);
6578}
6579
6580PyObject *
6581PyUnicode_AsASCIIString(PyObject *unicode)
6582{
6583    return _PyUnicode_AsASCIIString(unicode, NULL);
6584}
6585
6586#ifdef HAVE_MBCS
6587
6588/* --- MBCS codecs for Windows -------------------------------------------- */
6589
/* The code page helpers below take their lengths as int.  When size_t is
   wider than int, the callers have to process the input in several chunks;
   NEED_RETRY enables that chunking code. */
#if SIZEOF_SIZE_T > SIZEOF_INT
#  define NEED_RETRY
#endif

/* Provide WC_ERR_INVALID_CHARS when the included headers do not define it. */
#if !defined(WC_ERR_INVALID_CHARS)
#  define WC_ERR_INVALID_CHARS 0x0080
#endif
6597
6598static char*
6599code_page_name(UINT code_page, PyObject **obj)
6600{
6601    *obj = NULL;
6602    if (code_page == CP_ACP)
6603        return "mbcs";
6604    if (code_page == CP_UTF7)
6605        return "CP_UTF7";
6606    if (code_page == CP_UTF8)
6607        return "CP_UTF8";
6608
6609    *obj = PyBytes_FromFormat("cp%u", code_page);
6610    if (*obj == NULL)
6611        return NULL;
6612    return PyBytes_AS_STRING(*obj);
6613}
6614
6615static int
6616is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6617{
6618    const char *curr = s + offset;
6619    const char *prev;
6620
6621    if (!IsDBCSLeadByteEx(code_page, *curr))
6622        return 0;
6623
6624    prev = CharPrevExA(code_page, s, curr, 0);
6625    if (prev == curr)
6626        return 1;
6627    /* FIXME: This code is limited to "true" double-byte encodings,
6628       as it assumes an incomplete character consists of a single
6629       byte. */
6630    if (curr - prev == 2)
6631        return 1;
6632    if (!IsDBCSLeadByteEx(code_page, *prev))
6633        return 1;
6634    return 0;
6635}
6636
6637static DWORD
6638decode_code_page_flags(UINT code_page)
6639{
6640    if (code_page == CP_UTF7) {
6641        /* The CP_UTF7 decoder only supports flags=0 */
6642        return 0;
6643    }
6644    else
6645        return MB_ERR_INVALID_CHARS;
6646}
6647
6648/*
6649 * Decode a byte string from a Windows code page into unicode object in strict
6650 * mode.
6651 *
6652 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6653 * OSError and returns -1 on other error.
6654 */
6655static int
6656decode_code_page_strict(UINT code_page,
6657                        PyObject **v,
6658                        const char *in,
6659                        int insize)
6660{
6661    const DWORD flags = decode_code_page_flags(code_page);
6662    wchar_t *out;
6663    DWORD outsize;
6664
6665    /* First get the size of the result */
6666    assert(insize > 0);
6667    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6668    if (outsize <= 0)
6669        goto error;
6670
6671    if (*v == NULL) {
6672        /* Create unicode object */
6673        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6674        *v = (PyObject*)_PyUnicode_New(outsize);
6675        if (*v == NULL)
6676            return -1;
6677        out = PyUnicode_AS_UNICODE(*v);
6678    }
6679    else {
6680        /* Extend unicode object */
6681        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6682        if (unicode_resize(v, n + outsize) < 0)
6683            return -1;
6684        out = PyUnicode_AS_UNICODE(*v) + n;
6685    }
6686
6687    /* Do the conversion */
6688    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6689    if (outsize <= 0)
6690        goto error;
6691    return insize;
6692
6693error:
6694    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6695        return -2;
6696    PyErr_SetFromWindowsErr(0);
6697    return -1;
6698}
6699
6700/*
6701 * Decode a byte string from a code page into unicode object with an error
6702 * handler.
6703 *
6704 * Returns consumed size if succeed, or raise an OSError or
6705 * UnicodeDecodeError exception and returns -1 on error.
6706 */
6707static int
6708decode_code_page_errors(UINT code_page,
6709                        PyObject **v,
6710                        const char *in, const int size,
6711                        const char *errors)
6712{
6713    const char *startin = in;
6714    const char *endin = in + size;
6715    const DWORD flags = decode_code_page_flags(code_page);
6716    /* Ideally, we should get reason from FormatMessage. This is the Windows
6717       2000 English version of the message. */
6718    const char *reason = "No mapping for the Unicode character exists "
6719                         "in the target code page.";
6720    /* each step cannot decode more than 1 character, but a character can be
6721       represented as a surrogate pair */
6722    wchar_t buffer[2], *startout, *out;
6723    int insize;
6724    Py_ssize_t outsize;
6725    PyObject *errorHandler = NULL;
6726    PyObject *exc = NULL;
6727    PyObject *encoding_obj = NULL;
6728    char *encoding;
6729    DWORD err;
6730    int ret = -1;
6731
6732    assert(size > 0);
6733
6734    encoding = code_page_name(code_page, &encoding_obj);
6735    if (encoding == NULL)
6736        return -1;
6737
6738    if (errors == NULL || strcmp(errors, "strict") == 0) {
6739        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6740           UnicodeDecodeError. */
6741        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6742        if (exc != NULL) {
6743            PyCodec_StrictErrors(exc);
6744            Py_CLEAR(exc);
6745        }
6746        goto error;
6747    }
6748
6749    if (*v == NULL) {
6750        /* Create unicode object */
6751        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6752            PyErr_NoMemory();
6753            goto error;
6754        }
6755        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6756        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6757        if (*v == NULL)
6758            goto error;
6759        startout = PyUnicode_AS_UNICODE(*v);
6760    }
6761    else {
6762        /* Extend unicode object */
6763        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6764        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6765            PyErr_NoMemory();
6766            goto error;
6767        }
6768        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
6769            goto error;
6770        startout = PyUnicode_AS_UNICODE(*v) + n;
6771    }
6772
6773    /* Decode the byte string character per character */
6774    out = startout;
6775    while (in < endin)
6776    {
6777        /* Decode a character */
6778        insize = 1;
6779        do
6780        {
6781            outsize = MultiByteToWideChar(code_page, flags,
6782                                          in, insize,
6783                                          buffer, Py_ARRAY_LENGTH(buffer));
6784            if (outsize > 0)
6785                break;
6786            err = GetLastError();
6787            if (err != ERROR_NO_UNICODE_TRANSLATION
6788                && err != ERROR_INSUFFICIENT_BUFFER)
6789            {
6790                PyErr_SetFromWindowsErr(0);
6791                goto error;
6792            }
6793            insize++;
6794        }
6795        /* 4=maximum length of a UTF-8 sequence */
6796        while (insize <= 4 && (in + insize) <= endin);
6797
6798        if (outsize <= 0) {
6799            Py_ssize_t startinpos, endinpos, outpos;
6800
6801            startinpos = in - startin;
6802            endinpos = startinpos + 1;
6803            outpos = out - PyUnicode_AS_UNICODE(*v);
6804            if (unicode_decode_call_errorhandler_wchar(
6805                    errors, &errorHandler,
6806                    encoding, reason,
6807                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
6808                    v, &outpos))
6809            {
6810                goto error;
6811            }
6812            out = PyUnicode_AS_UNICODE(*v) + outpos;
6813        }
6814        else {
6815            in += insize;
6816            memcpy(out, buffer, outsize * sizeof(wchar_t));
6817            out += outsize;
6818        }
6819    }
6820
6821    /* write a NUL character at the end */
6822    *out = 0;
6823
6824    /* Extend unicode object */
6825    outsize = out - startout;
6826    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
6827    if (unicode_resize(v, outsize) < 0)
6828        goto error;
6829    ret = size;
6830
6831error:
6832    Py_XDECREF(encoding_obj);
6833    Py_XDECREF(errorHandler);
6834    Py_XDECREF(exc);
6835    return ret;
6836}
6837
6838static PyObject *
6839decode_code_page_stateful(int code_page,
6840                          const char *s, Py_ssize_t size,
6841                          const char *errors, Py_ssize_t *consumed)
6842{
6843    PyObject *v = NULL;
6844    int chunk_size, final, converted, done;
6845
6846    if (code_page < 0) {
6847        PyErr_SetString(PyExc_ValueError, "invalid code page number");
6848        return NULL;
6849    }
6850
6851    if (consumed)
6852        *consumed = 0;
6853
6854    do
6855    {
6856#ifdef NEED_RETRY
6857        if (size > INT_MAX) {
6858            chunk_size = INT_MAX;
6859            final = 0;
6860            done = 0;
6861        }
6862        else
6863#endif
6864        {
6865            chunk_size = (int)size;
6866            final = (consumed == NULL);
6867            done = 1;
6868        }
6869
6870        /* Skip trailing lead-byte unless 'final' is set */
6871        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
6872            --chunk_size;
6873
6874        if (chunk_size == 0 && done) {
6875            if (v != NULL)
6876                break;
6877            _Py_RETURN_UNICODE_EMPTY();
6878        }
6879
6880
6881        converted = decode_code_page_strict(code_page, &v,
6882                                            s, chunk_size);
6883        if (converted == -2)
6884            converted = decode_code_page_errors(code_page, &v,
6885                                                s, chunk_size,
6886                                                errors);
6887        assert(converted != 0);
6888
6889        if (converted < 0) {
6890            Py_XDECREF(v);
6891            return NULL;
6892        }
6893
6894        if (consumed)
6895            *consumed += converted;
6896
6897        s += converted;
6898        size -= converted;
6899    } while (!done);
6900
6901    return unicode_result(v);
6902}
6903
6904PyObject *
6905PyUnicode_DecodeCodePageStateful(int code_page,
6906                                 const char *s,
6907                                 Py_ssize_t size,
6908                                 const char *errors,
6909                                 Py_ssize_t *consumed)
6910{
6911    return decode_code_page_stateful(code_page, s, size, errors, consumed);
6912}
6913
6914PyObject *
6915PyUnicode_DecodeMBCSStateful(const char *s,
6916                             Py_ssize_t size,
6917                             const char *errors,
6918                             Py_ssize_t *consumed)
6919{
6920    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
6921}
6922
6923PyObject *
6924PyUnicode_DecodeMBCS(const char *s,
6925                     Py_ssize_t size,
6926                     const char *errors)
6927{
6928    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
6929}
6930
6931static DWORD
6932encode_code_page_flags(UINT code_page, const char *errors)
6933{
6934    if (code_page == CP_UTF8) {
6935        if (winver.dwMajorVersion >= 6)
6936            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
6937               and later */
6938            return WC_ERR_INVALID_CHARS;
6939        else
6940            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
6941            return 0;
6942    }
6943    else if (code_page == CP_UTF7) {
6944        /* CP_UTF7 only supports flags=0 */
6945        return 0;
6946    }
6947    else {
6948        if (errors != NULL && strcmp(errors, "replace") == 0)
6949            return 0;
6950        else
6951            return WC_NO_BEST_FIT_CHARS;
6952    }
6953}
6954
6955/*
6956 * Encode a Unicode string to a Windows code page into a byte string in strict
6957 * mode.
6958 *
6959 * Returns consumed characters if succeed, returns -2 on encode error, or raise
6960 * an OSError and returns -1 on other error.
6961 */
6962static int
6963encode_code_page_strict(UINT code_page, PyObject **outbytes,
6964                        PyObject *unicode, Py_ssize_t offset, int len,
6965                        const char* errors)
6966{
6967    BOOL usedDefaultChar = FALSE;
6968    BOOL *pusedDefaultChar = &usedDefaultChar;
6969    int outsize;
6970    PyObject *exc = NULL;
6971    wchar_t *p;
6972    Py_ssize_t size;
6973    const DWORD flags = encode_code_page_flags(code_page, NULL);
6974    char *out;
6975    /* Create a substring so that we can get the UTF-16 representation
6976       of just the slice under consideration. */
6977    PyObject *substring;
6978
6979    assert(len > 0);
6980
6981    if (code_page != CP_UTF8 && code_page != CP_UTF7)
6982        pusedDefaultChar = &usedDefaultChar;
6983    else
6984        pusedDefaultChar = NULL;
6985
6986    substring = PyUnicode_Substring(unicode, offset, offset+len);
6987    if (substring == NULL)
6988        return -1;
6989    p = PyUnicode_AsUnicodeAndSize(substring, &size);
6990    if (p == NULL) {
6991        Py_DECREF(substring);
6992        return -1;
6993    }
6994    assert(size <= INT_MAX);
6995
6996    /* First get the size of the result */
6997    outsize = WideCharToMultiByte(code_page, flags,
6998                                  p, (int)size,
6999                                  NULL, 0,
7000                                  NULL, pusedDefaultChar);
7001    if (outsize <= 0)
7002        goto error;
7003    /* If we used a default char, then we failed! */
7004    if (pusedDefaultChar && *pusedDefaultChar) {
7005        Py_DECREF(substring);
7006        return -2;
7007    }
7008
7009    if (*outbytes == NULL) {
7010        /* Create string object */
7011        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7012        if (*outbytes == NULL) {
7013            Py_DECREF(substring);
7014            return -1;
7015        }
7016        out = PyBytes_AS_STRING(*outbytes);
7017    }
7018    else {
7019        /* Extend string object */
7020        const Py_ssize_t n = PyBytes_Size(*outbytes);
7021        if (outsize > PY_SSIZE_T_MAX - n) {
7022            PyErr_NoMemory();
7023            Py_DECREF(substring);
7024            return -1;
7025        }
7026        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7027            Py_DECREF(substring);
7028            return -1;
7029        }
7030        out = PyBytes_AS_STRING(*outbytes) + n;
7031    }
7032
7033    /* Do the conversion */
7034    outsize = WideCharToMultiByte(code_page, flags,
7035                                  p, (int)size,
7036                                  out, outsize,
7037                                  NULL, pusedDefaultChar);
7038    Py_CLEAR(substring);
7039    if (outsize <= 0)
7040        goto error;
7041    if (pusedDefaultChar && *pusedDefaultChar)
7042        return -2;
7043    return 0;
7044
7045error:
7046    Py_XDECREF(substring);
7047    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7048        return -2;
7049    PyErr_SetFromWindowsErr(0);
7050    return -1;
7051}
7052
7053/*
7054 * Encode a Unicode string to a Windows code page into a byte string using a
7055 * error handler.
7056 *
7057 * Returns consumed characters if succeed, or raise an OSError and returns
7058 * -1 on other error.
7059 */
7060static int
7061encode_code_page_errors(UINT code_page, PyObject **outbytes,
7062                        PyObject *unicode, Py_ssize_t unicode_offset,
7063                        Py_ssize_t insize, const char* errors)
7064{
7065    const DWORD flags = encode_code_page_flags(code_page, errors);
7066    Py_ssize_t pos = unicode_offset;
7067    Py_ssize_t endin = unicode_offset + insize;
7068    /* Ideally, we should get reason from FormatMessage. This is the Windows
7069       2000 English version of the message. */
7070    const char *reason = "invalid character";
7071    /* 4=maximum length of a UTF-8 sequence */
7072    char buffer[4];
7073    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
7074    Py_ssize_t outsize;
7075    char *out;
7076    PyObject *errorHandler = NULL;
7077    PyObject *exc = NULL;
7078    PyObject *encoding_obj = NULL;
7079    char *encoding;
7080    Py_ssize_t newpos, newoutsize;
7081    PyObject *rep;
7082    int ret = -1;
7083
7084    assert(insize > 0);
7085
7086    encoding = code_page_name(code_page, &encoding_obj);
7087    if (encoding == NULL)
7088        return -1;
7089
7090    if (errors == NULL || strcmp(errors, "strict") == 0) {
7091        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
7092           then we raise a UnicodeEncodeError. */
7093        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
7094        if (exc != NULL) {
7095            PyCodec_StrictErrors(exc);
7096            Py_DECREF(exc);
7097        }
7098        Py_XDECREF(encoding_obj);
7099        return -1;
7100    }
7101
7102    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7103        pusedDefaultChar = &usedDefaultChar;
7104    else
7105        pusedDefaultChar = NULL;
7106
7107    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
7108        PyErr_NoMemory();
7109        goto error;
7110    }
7111    outsize = insize * Py_ARRAY_LENGTH(buffer);
7112
7113    if (*outbytes == NULL) {
7114        /* Create string object */
7115        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7116        if (*outbytes == NULL)
7117            goto error;
7118        out = PyBytes_AS_STRING(*outbytes);
7119    }
7120    else {
7121        /* Extend string object */
7122        Py_ssize_t n = PyBytes_Size(*outbytes);
7123        if (n > PY_SSIZE_T_MAX - outsize) {
7124            PyErr_NoMemory();
7125            goto error;
7126        }
7127        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
7128            goto error;
7129        out = PyBytes_AS_STRING(*outbytes) + n;
7130    }
7131
7132    /* Encode the string character per character */
7133    while (pos < endin)
7134    {
7135        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
7136        wchar_t chars[2];
7137        int charsize;
7138        if (ch < 0x10000) {
7139            chars[0] = (wchar_t)ch;
7140            charsize = 1;
7141        }
7142        else {
7143            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
7144            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
7145            charsize = 2;
7146        }
7147
7148        outsize = WideCharToMultiByte(code_page, flags,
7149                                      chars, charsize,
7150                                      buffer, Py_ARRAY_LENGTH(buffer),
7151                                      NULL, pusedDefaultChar);
7152        if (outsize > 0) {
7153            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
7154            {
7155                pos++;
7156                memcpy(out, buffer, outsize);
7157                out += outsize;
7158                continue;
7159            }
7160        }
7161        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
7162            PyErr_SetFromWindowsErr(0);
7163            goto error;
7164        }
7165
7166        rep = unicode_encode_call_errorhandler(
7167                  errors, &errorHandler, encoding, reason,
7168                  unicode, &exc,
7169                  pos, pos + 1, &newpos);
7170        if (rep == NULL)
7171            goto error;
7172        pos = newpos;
7173
7174        if (PyBytes_Check(rep)) {
7175            outsize = PyBytes_GET_SIZE(rep);
7176            if (outsize != 1) {
7177                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7178                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7179                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7180                    Py_DECREF(rep);
7181                    goto error;
7182                }
7183                out = PyBytes_AS_STRING(*outbytes) + offset;
7184            }
7185            memcpy(out, PyBytes_AS_STRING(rep), outsize);
7186            out += outsize;
7187        }
7188        else {
7189            Py_ssize_t i;
7190            enum PyUnicode_Kind kind;
7191            void *data;
7192
7193            if (PyUnicode_READY(rep) == -1) {
7194                Py_DECREF(rep);
7195                goto error;
7196            }
7197
7198            outsize = PyUnicode_GET_LENGTH(rep);
7199            if (outsize != 1) {
7200                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
7201                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
7202                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
7203                    Py_DECREF(rep);
7204                    goto error;
7205                }
7206                out = PyBytes_AS_STRING(*outbytes) + offset;
7207            }
7208            kind = PyUnicode_KIND(rep);
7209            data = PyUnicode_DATA(rep);
7210            for (i=0; i < outsize; i++) {
7211                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7212                if (ch > 127) {
7213                    raise_encode_exception(&exc,
7214                        encoding, unicode,
7215                        pos, pos + 1,
7216                        "unable to encode error handler result to ASCII");
7217                    Py_DECREF(rep);
7218                    goto error;
7219                }
7220                *out = (unsigned char)ch;
7221                out++;
7222            }
7223        }
7224        Py_DECREF(rep);
7225    }
7226    /* write a NUL byte */
7227    *out = 0;
7228    outsize = out - PyBytes_AS_STRING(*outbytes);
7229    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
7230    if (_PyBytes_Resize(outbytes, outsize) < 0)
7231        goto error;
7232    ret = 0;
7233
7234error:
7235    Py_XDECREF(encoding_obj);
7236    Py_XDECREF(errorHandler);
7237    Py_XDECREF(exc);
7238    return ret;
7239}
7240
7241static PyObject *
7242encode_code_page(int code_page,
7243                 PyObject *unicode,
7244                 const char *errors)
7245{
7246    Py_ssize_t len;
7247    PyObject *outbytes = NULL;
7248    Py_ssize_t offset;
7249    int chunk_len, ret, done;
7250
7251    if (PyUnicode_READY(unicode) == -1)
7252        return NULL;
7253    len = PyUnicode_GET_LENGTH(unicode);
7254
7255    if (code_page < 0) {
7256        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7257        return NULL;
7258    }
7259
7260    if (len == 0)
7261        return PyBytes_FromStringAndSize(NULL, 0);
7262
7263    offset = 0;
7264    do
7265    {
7266#ifdef NEED_RETRY
7267        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7268           chunks. */
7269        if (len > INT_MAX/2) {
7270            chunk_len = INT_MAX/2;
7271            done = 0;
7272        }
7273        else
7274#endif
7275        {
7276            chunk_len = (int)len;
7277            done = 1;
7278        }
7279
7280        ret = encode_code_page_strict(code_page, &outbytes,
7281                                      unicode, offset, chunk_len,
7282                                      errors);
7283        if (ret == -2)
7284            ret = encode_code_page_errors(code_page, &outbytes,
7285                                          unicode, offset,
7286                                          chunk_len, errors);
7287        if (ret < 0) {
7288            Py_XDECREF(outbytes);
7289            return NULL;
7290        }
7291
7292        offset += chunk_len;
7293        len -= chunk_len;
7294    } while (!done);
7295
7296    return outbytes;
7297}
7298
7299PyObject *
7300PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7301                     Py_ssize_t size,
7302                     const char *errors)
7303{
7304    PyObject *unicode, *res;
7305    unicode = PyUnicode_FromUnicode(p, size);
7306    if (unicode == NULL)
7307        return NULL;
7308    res = encode_code_page(CP_ACP, unicode, errors);
7309    Py_DECREF(unicode);
7310    return res;
7311}
7312
7313PyObject *
7314PyUnicode_EncodeCodePage(int code_page,
7315                         PyObject *unicode,
7316                         const char *errors)
7317{
7318    return encode_code_page(code_page, unicode, errors);
7319}
7320
7321PyObject *
7322PyUnicode_AsMBCSString(PyObject *unicode)
7323{
7324    if (!PyUnicode_Check(unicode)) {
7325        PyErr_BadArgument();
7326        return NULL;
7327    }
7328    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7329}
7330
7331#undef NEED_RETRY
7332
7333#endif /* HAVE_MBCS */
7334
7335/* --- Character Mapping Codec -------------------------------------------- */
7336
7337static int
7338charmap_decode_string(const char *s,
7339                      Py_ssize_t size,
7340                      PyObject *mapping,
7341                      const char *errors,
7342                      _PyUnicodeWriter *writer)
7343{
7344    const char *starts = s;
7345    const char *e;
7346    Py_ssize_t startinpos, endinpos;
7347    PyObject *errorHandler = NULL, *exc = NULL;
7348    Py_ssize_t maplen;
7349    enum PyUnicode_Kind mapkind;
7350    void *mapdata;
7351    Py_UCS4 x;
7352    unsigned char ch;
7353
7354    if (PyUnicode_READY(mapping) == -1)
7355        return -1;
7356
7357    maplen = PyUnicode_GET_LENGTH(mapping);
7358    mapdata = PyUnicode_DATA(mapping);
7359    mapkind = PyUnicode_KIND(mapping);
7360
7361    e = s + size;
7362
7363    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7364        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7365         * is disabled in encoding aliases, latin1 is preferred because
7366         * its implementation is faster. */
7367        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7368        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7369        Py_UCS4 maxchar = writer->maxchar;
7370
7371        assert (writer->kind == PyUnicode_1BYTE_KIND);
7372        while (s < e) {
7373            ch = *s;
7374            x = mapdata_ucs1[ch];
7375            if (x > maxchar) {
7376                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7377                    goto onError;
7378                maxchar = writer->maxchar;
7379                outdata = (Py_UCS1 *)writer->data;
7380            }
7381            outdata[writer->pos] = x;
7382            writer->pos++;
7383            ++s;
7384        }
7385        return 0;
7386    }
7387
7388    while (s < e) {
7389        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7390            enum PyUnicode_Kind outkind = writer->kind;
7391            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7392            if (outkind == PyUnicode_1BYTE_KIND) {
7393                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7394                Py_UCS4 maxchar = writer->maxchar;
7395                while (s < e) {
7396                    ch = *s;
7397                    x = mapdata_ucs2[ch];
7398                    if (x > maxchar)
7399                        goto Error;
7400                    outdata[writer->pos] = x;
7401                    writer->pos++;
7402                    ++s;
7403                }
7404                break;
7405            }
7406            else if (outkind == PyUnicode_2BYTE_KIND) {
7407                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7408                while (s < e) {
7409                    ch = *s;
7410                    x = mapdata_ucs2[ch];
7411                    if (x == 0xFFFE)
7412                        goto Error;
7413                    outdata[writer->pos] = x;
7414                    writer->pos++;
7415                    ++s;
7416                }
7417                break;
7418            }
7419        }
7420        ch = *s;
7421
7422        if (ch < maplen)
7423            x = PyUnicode_READ(mapkind, mapdata, ch);
7424        else
7425            x = 0xfffe; /* invalid value */
7426Error:
7427        if (x == 0xfffe)
7428        {
7429            /* undefined mapping */
7430            startinpos = s-starts;
7431            endinpos = startinpos+1;
7432            if (unicode_decode_call_errorhandler_writer(
7433                    errors, &errorHandler,
7434                    "charmap", "character maps to <undefined>",
7435                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7436                    writer)) {
7437                goto onError;
7438            }
7439            continue;
7440        }
7441
7442        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7443            goto onError;
7444        ++s;
7445    }
7446    Py_XDECREF(errorHandler);
7447    Py_XDECREF(exc);
7448    return 0;
7449
7450onError:
7451    Py_XDECREF(errorHandler);
7452    Py_XDECREF(exc);
7453    return -1;
7454}
7455
7456static int
7457charmap_decode_mapping(const char *s,
7458                       Py_ssize_t size,
7459                       PyObject *mapping,
7460                       const char *errors,
7461                       _PyUnicodeWriter *writer)
7462{
7463    const char *starts = s;
7464    const char *e;
7465    Py_ssize_t startinpos, endinpos;
7466    PyObject *errorHandler = NULL, *exc = NULL;
7467    unsigned char ch;
7468    PyObject *key, *item = NULL;
7469
7470    e = s + size;
7471
7472    while (s < e) {
7473        ch = *s;
7474
7475        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7476        key = PyLong_FromLong((long)ch);
7477        if (key == NULL)
7478            goto onError;
7479
7480        item = PyObject_GetItem(mapping, key);
7481        Py_DECREF(key);
7482        if (item == NULL) {
7483            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7484                /* No mapping found means: mapping is undefined. */
7485                PyErr_Clear();
7486                goto Undefined;
7487            } else
7488                goto onError;
7489        }
7490
7491        /* Apply mapping */
7492        if (item == Py_None)
7493            goto Undefined;
7494        if (PyLong_Check(item)) {
7495            long value = PyLong_AS_LONG(item);
7496            if (value == 0xFFFE)
7497                goto Undefined;
7498            if (value < 0 || value > MAX_UNICODE) {
7499                PyErr_Format(PyExc_TypeError,
7500                             "character mapping must be in range(0x%lx)",
7501                             (unsigned long)MAX_UNICODE + 1);
7502                goto onError;
7503            }
7504
7505            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7506                goto onError;
7507        }
7508        else if (PyUnicode_Check(item)) {
7509            if (PyUnicode_READY(item) == -1)
7510                goto onError;
7511            if (PyUnicode_GET_LENGTH(item) == 1) {
7512                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7513                if (value == 0xFFFE)
7514                    goto Undefined;
7515                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7516                    goto onError;
7517            }
7518            else {
7519                writer->overallocate = 1;
7520                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7521                    goto onError;
7522            }
7523        }
7524        else {
7525            /* wrong return value */
7526            PyErr_SetString(PyExc_TypeError,
7527                            "character mapping must return integer, None or str");
7528            goto onError;
7529        }
7530        Py_CLEAR(item);
7531        ++s;
7532        continue;
7533
7534Undefined:
7535        /* undefined mapping */
7536        Py_CLEAR(item);
7537        startinpos = s-starts;
7538        endinpos = startinpos+1;
7539        if (unicode_decode_call_errorhandler_writer(
7540                errors, &errorHandler,
7541                "charmap", "character maps to <undefined>",
7542                &starts, &e, &startinpos, &endinpos, &exc, &s,
7543                writer)) {
7544            goto onError;
7545        }
7546    }
7547    Py_XDECREF(errorHandler);
7548    Py_XDECREF(exc);
7549    return 0;
7550
7551onError:
7552    Py_XDECREF(item);
7553    Py_XDECREF(errorHandler);
7554    Py_XDECREF(exc);
7555    return -1;
7556}
7557
7558PyObject *
7559PyUnicode_DecodeCharmap(const char *s,
7560                        Py_ssize_t size,
7561                        PyObject *mapping,
7562                        const char *errors)
7563{
7564    _PyUnicodeWriter writer;
7565
7566    /* Default to Latin-1 */
7567    if (mapping == NULL)
7568        return PyUnicode_DecodeLatin1(s, size, errors);
7569
7570    if (size == 0)
7571        _Py_RETURN_UNICODE_EMPTY();
7572    _PyUnicodeWriter_Init(&writer);
7573    writer.min_length = size;
7574    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7575        goto onError;
7576
7577    if (PyUnicode_CheckExact(mapping)) {
7578        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7579            goto onError;
7580    }
7581    else {
7582        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7583            goto onError;
7584    }
7585    return _PyUnicodeWriter_Finish(&writer);
7586
7587  onError:
7588    _PyUnicodeWriter_Dealloc(&writer);
7589    return NULL;
7590}
7591
7592/* Charmap encoding: the lookup table */
7593
/* Compact lookup object used for charmap encoding.  It is filled in as a
   three-level trie by PyUnicode_BuildEncodingMap() and walked by
   encoding_map_lookup(). */
struct encoding_map {
    PyObject_HEAD
    /* First level, indexed by (ch >> 11).  0xFF marks an unused slot; any
       other value selects a 16-entry block of the second level. */
    unsigned char level1[32];
    /* Number of 16-entry second-level blocks (count2) and of 128-entry
       third-level blocks (count3) stored in level23. */
    int count2, count3;
    /* Start of the variable-sized tail: 16*count2 second-level bytes
       followed by 128*count3 third-level bytes.  Declared with a single
       element; the allocation in PyUnicode_BuildEncodingMap() adds the
       remaining room, which is why size computations subtract 1. */
    unsigned char level23[1];
};
7600
7601static PyObject*
7602encoding_map_size(PyObject *obj, PyObject* args)
7603{
7604    struct encoding_map *map = (struct encoding_map*)obj;
7605    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7606                           128*map->count3);
7607}
7608
/* Method table installed as tp_methods of EncodingMapType.  The all-zero
   entry terminates the table. */
static PyMethodDef encoding_map_methods[] = {
    /* size(): implemented by encoding_map_size(), declared METH_NOARGS */
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
7614
7615static void
7616encoding_map_dealloc(PyObject* o)
7617{
7618    PyObject_FREE(o);
7619}
7620
/* Type object for struct encoding_map.  Only tp_dealloc and tp_methods
   are filled in; every other slot (including tp_new) is left at 0.
   Instances are created in C by PyUnicode_BuildEncodingMap(), which
   allocates the memory itself and calls PyObject_Init() with this type.
   tp_itemsize is 0 even though instances carry a variable-sized tail. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_reserved*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
7664
7665PyObject*
7666PyUnicode_BuildEncodingMap(PyObject* string)
7667{
7668    PyObject *result;
7669    struct encoding_map *mresult;
7670    int i;
7671    int need_dict = 0;
7672    unsigned char level1[32];
7673    unsigned char level2[512];
7674    unsigned char *mlevel1, *mlevel2, *mlevel3;
7675    int count2 = 0, count3 = 0;
7676    int kind;
7677    void *data;
7678    Py_ssize_t length;
7679    Py_UCS4 ch;
7680
7681    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7682        PyErr_BadArgument();
7683        return NULL;
7684    }
7685    kind = PyUnicode_KIND(string);
7686    data = PyUnicode_DATA(string);
7687    length = PyUnicode_GET_LENGTH(string);
7688    length = Py_MIN(length, 256);
7689    memset(level1, 0xFF, sizeof level1);
7690    memset(level2, 0xFF, sizeof level2);
7691
7692    /* If there isn't a one-to-one mapping of NULL to \0,
7693       or if there are non-BMP characters, we need to use
7694       a mapping dictionary. */
7695    if (PyUnicode_READ(kind, data, 0) != 0)
7696        need_dict = 1;
7697    for (i = 1; i < length; i++) {
7698        int l1, l2;
7699        ch = PyUnicode_READ(kind, data, i);
7700        if (ch == 0 || ch > 0xFFFF) {
7701            need_dict = 1;
7702            break;
7703        }
7704        if (ch == 0xFFFE)
7705            /* unmapped character */
7706            continue;
7707        l1 = ch >> 11;
7708        l2 = ch >> 7;
7709        if (level1[l1] == 0xFF)
7710            level1[l1] = count2++;
7711        if (level2[l2] == 0xFF)
7712            level2[l2] = count3++;
7713    }
7714
7715    if (count2 >= 0xFF || count3 >= 0xFF)
7716        need_dict = 1;
7717
7718    if (need_dict) {
7719        PyObject *result = PyDict_New();
7720        PyObject *key, *value;
7721        if (!result)
7722            return NULL;
7723        for (i = 0; i < length; i++) {
7724            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7725            value = PyLong_FromLong(i);
7726            if (!key || !value)
7727                goto failed1;
7728            if (PyDict_SetItem(result, key, value) == -1)
7729                goto failed1;
7730            Py_DECREF(key);
7731            Py_DECREF(value);
7732        }
7733        return result;
7734      failed1:
7735        Py_XDECREF(key);
7736        Py_XDECREF(value);
7737        Py_DECREF(result);
7738        return NULL;
7739    }
7740
7741    /* Create a three-level trie */
7742    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7743                             16*count2 + 128*count3 - 1);
7744    if (!result)
7745        return PyErr_NoMemory();
7746    PyObject_Init(result, &EncodingMapType);
7747    mresult = (struct encoding_map*)result;
7748    mresult->count2 = count2;
7749    mresult->count3 = count3;
7750    mlevel1 = mresult->level1;
7751    mlevel2 = mresult->level23;
7752    mlevel3 = mresult->level23 + 16*count2;
7753    memcpy(mlevel1, level1, 32);
7754    memset(mlevel2, 0xFF, 16*count2);
7755    memset(mlevel3, 0, 128*count3);
7756    count3 = 0;
7757    for (i = 1; i < length; i++) {
7758        int o1, o2, o3, i2, i3;
7759        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7760        if (ch == 0xFFFE)
7761            /* unmapped character */
7762            continue;
7763        o1 = ch>>11;
7764        o2 = (ch>>7) & 0xF;
7765        i2 = 16*mlevel1[o1] + o2;
7766        if (mlevel2[i2] == 0xFF)
7767            mlevel2[i2] = count3++;
7768        o3 = ch & 0x7F;
7769        i3 = 128*mlevel2[i2] + o3;
7770        mlevel3[i3] = i;
7771    }
7772    return result;
7773}
7774
7775static int
7776encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
7777{
7778    struct encoding_map *map = (struct encoding_map*)mapping;
7779    int l1 = c>>11;
7780    int l2 = (c>>7) & 0xF;
7781    int l3 = c & 0x7F;
7782    int i;
7783
7784    if (c > 0xFFFF)
7785        return -1;
7786    if (c == 0)
7787        return 0;
7788    /* level 1*/
7789    i = map->level1[l1];
7790    if (i == 0xFF) {
7791        return -1;
7792    }
7793    /* level 2*/
7794    i = map->level23[16*i+l2];
7795    if (i == 0xFF) {
7796        return -1;
7797    }
7798    /* level 3 */
7799    i = map->level23[16*map->count2 + 128*i + l3];
7800    if (i == 0) {
7801        return -1;
7802    }
7803    return i;
7804}
7805
7806/* Lookup the character ch in the mapping. If the character
7807   can't be found, Py_None is returned (or NULL, if another
7808   error occurred). */
7809static PyObject *
7810charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
7811{
7812    PyObject *w = PyLong_FromLong((long)c);
7813    PyObject *x;
7814
7815    if (w == NULL)
7816        return NULL;
7817    x = PyObject_GetItem(mapping, w);
7818    Py_DECREF(w);
7819    if (x == NULL) {
7820        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7821            /* No mapping found means: mapping is undefined. */
7822            PyErr_Clear();
7823            x = Py_None;
7824            Py_INCREF(x);
7825            return x;
7826        } else
7827            return NULL;
7828    }
7829    else if (x == Py_None)
7830        return x;
7831    else if (PyLong_Check(x)) {
7832        long value = PyLong_AS_LONG(x);
7833        if (value < 0 || value > 255) {
7834            PyErr_SetString(PyExc_TypeError,
7835                            "character mapping must be in range(256)");
7836            Py_DECREF(x);
7837            return NULL;
7838        }
7839        return x;
7840    }
7841    else if (PyBytes_Check(x))
7842        return x;
7843    else {
7844        /* wrong return value */
7845        PyErr_Format(PyExc_TypeError,
7846                     "character mapping must return integer, bytes or None, not %.400s",
7847                     x->ob_type->tp_name);
7848        Py_DECREF(x);
7849        return NULL;
7850    }
7851}
7852
7853static int
7854charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
7855{
7856    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7857    /* exponentially overallocate to minimize reallocations */
7858    if (requiredsize < 2*outsize)
7859        requiredsize = 2*outsize;
7860    if (_PyBytes_Resize(outobj, requiredsize))
7861        return -1;
7862    return 0;
7863}
7864
/* Outcome of charmapencode_output():
     enc_SUCCESS    the character was encoded and written to the output
     enc_FAILED     the character has no mapping; nothing was written and
                    no exception is set
     enc_EXCEPTION  the lookup or the output resize failed */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
} charmapencode_result;
7868/* lookup the character, put the result in the output string and adjust
7869   various state variables. Resize the output bytes object if not enough
7870   space is available. Return a new reference to the object that
7871   was put in the output buffer, or Py_None, if the mapping was undefined
7872   (in which case no character was written) or NULL, if a
7873   reallocation error occurred. The caller must decref the result */
7874static charmapencode_result
7875charmapencode_output(Py_UCS4 c, PyObject *mapping,
7876                     PyObject **outobj, Py_ssize_t *outpos)
7877{
7878    PyObject *rep;
7879    char *outstart;
7880    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
7881
7882    if (Py_TYPE(mapping) == &EncodingMapType) {
7883        int res = encoding_map_lookup(c, mapping);
7884        Py_ssize_t requiredsize = *outpos+1;
7885        if (res == -1)
7886            return enc_FAILED;
7887        if (outsize<requiredsize)
7888            if (charmapencode_resize(outobj, outpos, requiredsize))
7889                return enc_EXCEPTION;
7890        outstart = PyBytes_AS_STRING(*outobj);
7891        outstart[(*outpos)++] = (char)res;
7892        return enc_SUCCESS;
7893    }
7894
7895    rep = charmapencode_lookup(c, mapping);
7896    if (rep==NULL)
7897        return enc_EXCEPTION;
7898    else if (rep==Py_None) {
7899        Py_DECREF(rep);
7900        return enc_FAILED;
7901    } else {
7902        if (PyLong_Check(rep)) {
7903            Py_ssize_t requiredsize = *outpos+1;
7904            if (outsize<requiredsize)
7905                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7906                    Py_DECREF(rep);
7907                    return enc_EXCEPTION;
7908                }
7909            outstart = PyBytes_AS_STRING(*outobj);
7910            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
7911        }
7912        else {
7913            const char *repchars = PyBytes_AS_STRING(rep);
7914            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
7915            Py_ssize_t requiredsize = *outpos+repsize;
7916            if (outsize<requiredsize)
7917                if (charmapencode_resize(outobj, outpos, requiredsize)) {
7918                    Py_DECREF(rep);
7919                    return enc_EXCEPTION;
7920                }
7921            outstart = PyBytes_AS_STRING(*outobj);
7922            memcpy(outstart + *outpos, repchars, repsize);
7923            *outpos += repsize;
7924        }
7925    }
7926    Py_DECREF(rep);
7927    return enc_SUCCESS;
7928}
7929
/* handle an error in PyUnicode_EncodeCharmap
   Return 0 on success, -1 on error

   Called when the character at *inpos of unicode cannot be encoded with
   mapping.  The run of consecutive unencodable characters starting there
   is collected and handled according to errors:
     - strict (or errors == NULL): raise the encode exception;
     - replace: emit an encoded '?' for each character of the run;
     - ignore: emit nothing;
     - xmlcharrefreplace: emit the encoded "&#N;" reference for each
       character of the run;
     - anything else: call the registered error handler and emit its
       replacement (bytes are copied as-is, str is encoded via mapping).
   On success *inpos is moved past the handled input and the output is
   appended to *res at *respos (both updated).  *known_errorHandler caches
   which of the cases above errors names (-1 = not determined yet);
   *errorHandler and *exceptionObject are handed to the helpers that
   look up the handler and create or update the exception. */
static int
charmap_encoding_error(
    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
    PyObject **exceptionObject,
    int *known_errorHandler, PyObject **errorHandler, const char *errors,
    PyObject **res, Py_ssize_t *respos)
{
    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
    Py_ssize_t size, repsize;
    Py_ssize_t newpos;
    enum PyUnicode_Kind kind;
    void *data;
    Py_ssize_t index;
    /* startpos for collecting unencodable chars */
    Py_ssize_t collstartpos = *inpos;
    Py_ssize_t collendpos = *inpos+1;
    Py_ssize_t collpos;
    char *encoding = "charmap";
    char *reason = "character maps to <undefined>";
    charmapencode_result x;
    Py_UCS4 ch;
    int val;

    if (PyUnicode_READY(unicode) == -1)
        return -1;
    size = PyUnicode_GET_LENGTH(unicode);
    /* find all unencodable characters: extend [collstartpos, collendpos)
       until the first character that does have a mapping */
    while (collendpos < size) {
        PyObject *rep;
        if (Py_TYPE(mapping) == &EncodingMapType) {
            /* EncodingMap objects are queried directly, no Python call */
            ch = PyUnicode_READ_CHAR(unicode, collendpos);
            val = encoding_map_lookup(ch, mapping);
            if (val != -1)
                break;
            ++collendpos;
            continue;
        }

        ch = PyUnicode_READ_CHAR(unicode, collendpos);
        rep = charmapencode_lookup(ch, mapping);
        if (rep==NULL)
            return -1;
        else if (rep!=Py_None) {
            /* encodable character: the run ends here */
            Py_DECREF(rep);
            break;
        }
        Py_DECREF(rep);
        ++collendpos;
    }
    /* cache callback name lookup
     * (if not done yet, i.e. it's the first error) */
    if (*known_errorHandler==-1) {
        if ((errors==NULL) || (!strcmp(errors, "strict")))
            *known_errorHandler = 1;
        else if (!strcmp(errors, "replace"))
            *known_errorHandler = 2;
        else if (!strcmp(errors, "ignore"))
            *known_errorHandler = 3;
        else if (!strcmp(errors, "xmlcharrefreplace"))
            *known_errorHandler = 4;
        else
            *known_errorHandler = 0;
    }
    switch (*known_errorHandler) {
    case 1: /* strict */
        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
        return -1;
    case 2: /* replace */
        /* one '?' per unencodable character; '?' itself must be
           encodable with the mapping, otherwise the error is raised */
        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
            x = charmapencode_output('?', mapping, res, respos);
            if (x==enc_EXCEPTION) {
                return -1;
            }
            else if (x==enc_FAILED) {
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        /* fall through */
    case 3: /* ignore */
        *inpos = collendpos;
        break;
    case 4: /* xmlcharrefreplace */
        /* generate the "&#N;" replacement for each character and encode
           it character by character through the mapping */
        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
            char buffer[2+29+1+1];
            char *cp;
            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
            for (cp = buffer; *cp; ++cp) {
                x = charmapencode_output(*cp, mapping, res, respos);
                if (x==enc_EXCEPTION)
                    return -1;
                else if (x==enc_FAILED) {
                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                    return -1;
                }
            }
        }
        *inpos = collendpos;
        break;
    default:
        /* any other handler name: call the error callback, which yields
           the replacement object and the position to resume at */
        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
                                                      encoding, reason, unicode, exceptionObject,
                                                      collstartpos, collendpos, &newpos);
        if (repunicode == NULL)
            return -1;
        if (PyBytes_Check(repunicode)) {
            /* Directly copy bytes result to output. */
            Py_ssize_t outsize = PyBytes_Size(*res);
            Py_ssize_t requiredsize;
            repsize = PyBytes_Size(repunicode);
            requiredsize = *respos + repsize;
            if (requiredsize > outsize)
                /* Make room for all additional bytes. */
                if (charmapencode_resize(res, respos, requiredsize)) {
                    Py_DECREF(repunicode);
                    return -1;
                }
            memcpy(PyBytes_AsString(*res) + *respos,
                   PyBytes_AsString(repunicode),  repsize);
            *respos += repsize;
            *inpos = newpos;
            Py_DECREF(repunicode);
            break;
        }
        /* generate replacement: the handler returned a str, which is
           encoded through the mapping one character at a time */
        if (PyUnicode_READY(repunicode) == -1) {
            Py_DECREF(repunicode);
            return -1;
        }
        repsize = PyUnicode_GET_LENGTH(repunicode);
        data = PyUnicode_DATA(repunicode);
        kind = PyUnicode_KIND(repunicode);
        for (index = 0; index < repsize; index++) {
            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
            x = charmapencode_output(repch, mapping, res, respos);
            if (x==enc_EXCEPTION) {
                Py_DECREF(repunicode);
                return -1;
            }
            else if (x==enc_FAILED) {
                /* the replacement itself is unencodable: report the
                   original input range */
                Py_DECREF(repunicode);
                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
                return -1;
            }
        }
        *inpos = newpos;
        Py_DECREF(repunicode);
    }
    return 0;
}
8083
8084PyObject *
8085_PyUnicode_EncodeCharmap(PyObject *unicode,
8086                         PyObject *mapping,
8087                         const char *errors)
8088{
8089    /* output object */
8090    PyObject *res = NULL;
8091    /* current input position */
8092    Py_ssize_t inpos = 0;
8093    Py_ssize_t size;
8094    /* current output position */
8095    Py_ssize_t respos = 0;
8096    PyObject *errorHandler = NULL;
8097    PyObject *exc = NULL;
8098    /* the following variable is used for caching string comparisons
8099     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8100     * 3=ignore, 4=xmlcharrefreplace */
8101    int known_errorHandler = -1;
8102    void *data;
8103    int kind;
8104
8105    if (PyUnicode_READY(unicode) == -1)
8106        return NULL;
8107    size = PyUnicode_GET_LENGTH(unicode);
8108    data = PyUnicode_DATA(unicode);
8109    kind = PyUnicode_KIND(unicode);
8110
8111    /* Default to Latin-1 */
8112    if (mapping == NULL)
8113        return unicode_encode_ucs1(unicode, errors, 256);
8114
8115    /* allocate enough for a simple encoding without
8116       replacements, if we need more, we'll resize */
8117    res = PyBytes_FromStringAndSize(NULL, size);
8118    if (res == NULL)
8119        goto onError;
8120    if (size == 0)
8121        return res;
8122
8123    while (inpos<size) {
8124        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8125        /* try to encode it */
8126        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8127        if (x==enc_EXCEPTION) /* error */
8128            goto onError;
8129        if (x==enc_FAILED) { /* unencodable character */
8130            if (charmap_encoding_error(unicode, &inpos, mapping,
8131                                       &exc,
8132                                       &known_errorHandler, &errorHandler, errors,
8133                                       &res, &respos)) {
8134                goto onError;
8135            }
8136        }
8137        else
8138            /* done with this character => adjust input position */
8139            ++inpos;
8140    }
8141
8142    /* Resize if we allocated to much */
8143    if (respos<PyBytes_GET_SIZE(res))
8144        if (_PyBytes_Resize(&res, respos) < 0)
8145            goto onError;
8146
8147    Py_XDECREF(exc);
8148    Py_XDECREF(errorHandler);
8149    return res;
8150
8151  onError:
8152    Py_XDECREF(res);
8153    Py_XDECREF(exc);
8154    Py_XDECREF(errorHandler);
8155    return NULL;
8156}
8157
8158/* Deprecated */
8159PyObject *
8160PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8161                        Py_ssize_t size,
8162                        PyObject *mapping,
8163                        const char *errors)
8164{
8165    PyObject *result;
8166    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8167    if (unicode == NULL)
8168        return NULL;
8169    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8170    Py_DECREF(unicode);
8171    return result;
8172}
8173
8174PyObject *
8175PyUnicode_AsCharmapString(PyObject *unicode,
8176                          PyObject *mapping)
8177{
8178    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8179        PyErr_BadArgument();
8180        return NULL;
8181    }
8182    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8183}
8184
8185/* create or adjust a UnicodeTranslateError */
8186static void
8187make_translate_exception(PyObject **exceptionObject,
8188                         PyObject *unicode,
8189                         Py_ssize_t startpos, Py_ssize_t endpos,
8190                         const char *reason)
8191{
8192    if (*exceptionObject == NULL) {
8193        *exceptionObject = _PyUnicodeTranslateError_Create(
8194            unicode, startpos, endpos, reason);
8195    }
8196    else {
8197        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8198            goto onError;
8199        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8200            goto onError;
8201        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8202            goto onError;
8203        return;
8204      onError:
8205        Py_DECREF(*exceptionObject);
8206        *exceptionObject = NULL;
8207    }
8208}
8209
8210/* error handling callback helper:
8211   build arguments, call the callback and check the arguments,
8212   put the result into newpos and return the replacement string, which
8213   has to be freed by the caller */
8214static PyObject *
8215unicode_translate_call_errorhandler(const char *errors,
8216                                    PyObject **errorHandler,
8217                                    const char *reason,
8218                                    PyObject *unicode, PyObject **exceptionObject,
8219                                    Py_ssize_t startpos, Py_ssize_t endpos,
8220                                    Py_ssize_t *newpos)
8221{
8222    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8223
8224    Py_ssize_t i_newpos;
8225    PyObject *restuple;
8226    PyObject *resunicode;
8227
8228    if (*errorHandler == NULL) {
8229        *errorHandler = PyCodec_LookupError(errors);
8230        if (*errorHandler == NULL)
8231            return NULL;
8232    }
8233
8234    make_translate_exception(exceptionObject,
8235                             unicode, startpos, endpos, reason);
8236    if (*exceptionObject == NULL)
8237        return NULL;
8238
8239    restuple = PyObject_CallFunctionObjArgs(
8240        *errorHandler, *exceptionObject, NULL);
8241    if (restuple == NULL)
8242        return NULL;
8243    if (!PyTuple_Check(restuple)) {
8244        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8245        Py_DECREF(restuple);
8246        return NULL;
8247    }
8248    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8249                          &resunicode, &i_newpos)) {
8250        Py_DECREF(restuple);
8251        return NULL;
8252    }
8253    if (i_newpos<0)
8254        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8255    else
8256        *newpos = i_newpos;
8257    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8258        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8259        Py_DECREF(restuple);
8260        return NULL;
8261    }
8262    Py_INCREF(resunicode);
8263    Py_DECREF(restuple);
8264    return resunicode;
8265}
8266
8267/* Lookup the character ch in the mapping and put the result in result,
8268   which must be decrefed by the caller.
8269   Return 0 on success, -1 on error */
8270static int
8271charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8272{
8273    PyObject *w = PyLong_FromLong((long)c);
8274    PyObject *x;
8275
8276    if (w == NULL)
8277        return -1;
8278    x = PyObject_GetItem(mapping, w);
8279    Py_DECREF(w);
8280    if (x == NULL) {
8281        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8282            /* No mapping found means: use 1:1 mapping. */
8283            PyErr_Clear();
8284            *result = NULL;
8285            return 0;
8286        } else
8287            return -1;
8288    }
8289    else if (x == Py_None) {
8290        *result = x;
8291        return 0;
8292    }
8293    else if (PyLong_Check(x)) {
8294        long value = PyLong_AS_LONG(x);
8295        long max = PyUnicode_GetMax();
8296        if (value < 0 || value > max) {
8297            PyErr_Format(PyExc_TypeError,
8298                         "character mapping must be in range(0x%x)", max+1);
8299            Py_DECREF(x);
8300            return -1;
8301        }
8302        *result = x;
8303        return 0;
8304    }
8305    else if (PyUnicode_Check(x)) {
8306        *result = x;
8307        return 0;
8308    }
8309    else {
8310        /* wrong return value */
8311        PyErr_SetString(PyExc_TypeError,
8312                        "character mapping must return integer, None or str");
8313        Py_DECREF(x);
8314        return -1;
8315    }
8316}
8317/* ensure that *outobj is at least requiredsize characters long,
8318   if not reallocate and adjust various state variables.
8319   Return 0 on success, -1 on error */
8320static int
8321charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8322                               Py_ssize_t requiredsize)
8323{
8324    Py_ssize_t oldsize = *psize;
8325    Py_UCS4 *new_outobj;
8326    if (requiredsize > oldsize) {
8327        /* exponentially overallocate to minimize reallocations */
8328        if (requiredsize < 2 * oldsize)
8329            requiredsize = 2 * oldsize;
8330        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8331        if (new_outobj == 0)
8332            return -1;
8333        *outobj = new_outobj;
8334        *psize = requiredsize;
8335    }
8336    return 0;
8337}
8338/* lookup the character, put the result in the output string and adjust
8339   various state variables. Return a new reference to the object that
8340   was put in the output buffer in *result, or Py_None, if the mapping was
8341   undefined (in which case no character was written).
8342   The called must decref result.
8343   Return 0 on success, -1 on error. */
8344static int
8345charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8346                        PyObject *mapping, Py_UCS4 **output,
8347                        Py_ssize_t *osize, Py_ssize_t *opos,
8348                        PyObject **res)
8349{
8350    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8351    if (charmaptranslate_lookup(curinp, mapping, res))
8352        return -1;
8353    if (*res==NULL) {
8354        /* not found => default to 1:1 mapping */
8355        (*output)[(*opos)++] = curinp;
8356    }
8357    else if (*res==Py_None)
8358        ;
8359    else if (PyLong_Check(*res)) {
8360        /* no overflow check, because we know that the space is enough */
8361        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8362    }
8363    else if (PyUnicode_Check(*res)) {
8364        Py_ssize_t repsize;
8365        if (PyUnicode_READY(*res) == -1)
8366            return -1;
8367        repsize = PyUnicode_GET_LENGTH(*res);
8368        if (repsize==1) {
8369            /* no overflow check, because we know that the space is enough */
8370            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8371        }
8372        else if (repsize!=0) {
8373            /* more than one character */
8374            Py_ssize_t requiredsize = *opos +
8375                (PyUnicode_GET_LENGTH(input) - ipos) +
8376                repsize - 1;
8377            Py_ssize_t i;
8378            if (charmaptranslate_makespace(output, osize, requiredsize))
8379                return -1;
8380            for(i = 0; i < repsize; i++)
8381                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8382        }
8383    }
8384    else
8385        return -1;
8386    return 0;
8387}
8388
8389PyObject *
8390_PyUnicode_TranslateCharmap(PyObject *input,
8391                            PyObject *mapping,
8392                            const char *errors)
8393{
8394    /* input object */
8395    char *idata;
8396    Py_ssize_t size, i;
8397    int kind;
8398    /* output buffer */
8399    Py_UCS4 *output = NULL;
8400    Py_ssize_t osize;
8401    PyObject *res;
8402    /* current output position */
8403    Py_ssize_t opos;
8404    char *reason = "character maps to <undefined>";
8405    PyObject *errorHandler = NULL;
8406    PyObject *exc = NULL;
8407    /* the following variable is used for caching string comparisons
8408     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8409     * 3=ignore, 4=xmlcharrefreplace */
8410    int known_errorHandler = -1;
8411
8412    if (mapping == NULL) {
8413        PyErr_BadArgument();
8414        return NULL;
8415    }
8416
8417    if (PyUnicode_READY(input) == -1)
8418        return NULL;
8419    idata = (char*)PyUnicode_DATA(input);
8420    kind = PyUnicode_KIND(input);
8421    size = PyUnicode_GET_LENGTH(input);
8422    i = 0;
8423
8424    if (size == 0) {
8425        Py_INCREF(input);
8426        return input;
8427    }
8428
8429    /* allocate enough for a simple 1:1 translation without
8430       replacements, if we need more, we'll resize */
8431    osize = size;
8432    output = PyMem_Malloc(osize * sizeof(Py_UCS4));
8433    opos = 0;
8434    if (output == NULL) {
8435        PyErr_NoMemory();
8436        goto onError;
8437    }
8438
8439    while (i<size) {
8440        /* try to encode it */
8441        PyObject *x = NULL;
8442        if (charmaptranslate_output(input, i, mapping,
8443                                    &output, &osize, &opos, &x)) {
8444            Py_XDECREF(x);
8445            goto onError;
8446        }
8447        Py_XDECREF(x);
8448        if (x!=Py_None) /* it worked => adjust input pointer */
8449            ++i;
8450        else { /* untranslatable character */
8451            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8452            Py_ssize_t repsize;
8453            Py_ssize_t newpos;
8454            Py_ssize_t uni2;
8455            /* startpos for collecting untranslatable chars */
8456            Py_ssize_t collstart = i;
8457            Py_ssize_t collend = i+1;
8458            Py_ssize_t coll;
8459
8460            /* find all untranslatable characters */
8461            while (collend < size) {
8462                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8463                    goto onError;
8464                Py_XDECREF(x);
8465                if (x!=Py_None)
8466                    break;
8467                ++collend;
8468            }
8469            /* cache callback name lookup
8470             * (if not done yet, i.e. it's the first error) */
8471            if (known_errorHandler==-1) {
8472                if ((errors==NULL) || (!strcmp(errors, "strict")))
8473                    known_errorHandler = 1;
8474                else if (!strcmp(errors, "replace"))
8475                    known_errorHandler = 2;
8476                else if (!strcmp(errors, "ignore"))
8477                    known_errorHandler = 3;
8478                else if (!strcmp(errors, "xmlcharrefreplace"))
8479                    known_errorHandler = 4;
8480                else
8481                    known_errorHandler = 0;
8482            }
8483            switch (known_errorHandler) {
8484            case 1: /* strict */
8485                make_translate_exception(&exc,
8486                                         input, collstart, collend, reason);
8487                if (exc != NULL)
8488                    PyCodec_StrictErrors(exc);
8489                goto onError;
8490            case 2: /* replace */
8491                /* No need to check for space, this is a 1:1 replacement */
8492                for (coll = collstart; coll<collend; coll++)
8493                    output[opos++] = '?';
8494                /* fall through */
8495            case 3: /* ignore */
8496                i = collend;
8497                break;
8498            case 4: /* xmlcharrefreplace */
8499                /* generate replacement (temporarily (mis)uses i) */
8500                for (i = collstart; i < collend; ++i) {
8501                    char buffer[2+29+1+1];
8502                    char *cp;
8503                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8504                    if (charmaptranslate_makespace(&output, &osize,
8505                                                   opos+strlen(buffer)+(size-collend)))
8506                        goto onError;
8507                    for (cp = buffer; *cp; ++cp)
8508                        output[opos++] = *cp;
8509                }
8510                i = collend;
8511                break;
8512            default:
8513                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8514                                                                 reason, input, &exc,
8515                                                                 collstart, collend, &newpos);
8516                if (repunicode == NULL)
8517                    goto onError;
8518                if (PyUnicode_READY(repunicode) == -1) {
8519                    Py_DECREF(repunicode);
8520                    goto onError;
8521                }
8522                /* generate replacement  */
8523                repsize = PyUnicode_GET_LENGTH(repunicode);
8524                if (charmaptranslate_makespace(&output, &osize,
8525                                               opos+repsize+(size-collend))) {
8526                    Py_DECREF(repunicode);
8527                    goto onError;
8528                }
8529                for (uni2 = 0; repsize-->0; ++uni2)
8530                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8531                i = newpos;
8532                Py_DECREF(repunicode);
8533            }
8534        }
8535    }
8536    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8537    if (!res)
8538        goto onError;
8539    PyMem_Free(output);
8540    Py_XDECREF(exc);
8541    Py_XDECREF(errorHandler);
8542    return res;
8543
8544  onError:
8545    PyMem_Free(output);
8546    Py_XDECREF(exc);
8547    Py_XDECREF(errorHandler);
8548    return NULL;
8549}
8550
8551/* Deprecated. Use PyUnicode_Translate instead. */
8552PyObject *
8553PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8554                           Py_ssize_t size,
8555                           PyObject *mapping,
8556                           const char *errors)
8557{
8558    PyObject *result;
8559    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8560    if (!unicode)
8561        return NULL;
8562    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8563    Py_DECREF(unicode);
8564    return result;
8565}
8566
8567PyObject *
8568PyUnicode_Translate(PyObject *str,
8569                    PyObject *mapping,
8570                    const char *errors)
8571{
8572    PyObject *result;
8573
8574    str = PyUnicode_FromObject(str);
8575    if (str == NULL)
8576        return NULL;
8577    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8578    Py_DECREF(str);
8579    return result;
8580}
8581
8582static Py_UCS4
8583fix_decimal_and_space_to_ascii(PyObject *self)
8584{
8585    /* No need to call PyUnicode_READY(self) because this function is only
8586       called as a callback from fixup() which does it already. */
8587    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8588    const int kind = PyUnicode_KIND(self);
8589    void *data = PyUnicode_DATA(self);
8590    Py_UCS4 maxchar = 127, ch, fixed;
8591    int modified = 0;
8592    Py_ssize_t i;
8593
8594    for (i = 0; i < len; ++i) {
8595        ch = PyUnicode_READ(kind, data, i);
8596        fixed = 0;
8597        if (ch > 127) {
8598            if (Py_UNICODE_ISSPACE(ch))
8599                fixed = ' ';
8600            else {
8601                const int decimal = Py_UNICODE_TODECIMAL(ch);
8602                if (decimal >= 0)
8603                    fixed = '0' + decimal;
8604            }
8605            if (fixed != 0) {
8606                modified = 1;
8607                maxchar = Py_MAX(maxchar, fixed);
8608                PyUnicode_WRITE(kind, data, i, fixed);
8609            }
8610            else
8611                maxchar = Py_MAX(maxchar, ch);
8612        }
8613    }
8614
8615    return (modified) ? maxchar : 0;
8616}
8617
8618PyObject *
8619_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8620{
8621    if (!PyUnicode_Check(unicode)) {
8622        PyErr_BadInternalCall();
8623        return NULL;
8624    }
8625    if (PyUnicode_READY(unicode) == -1)
8626        return NULL;
8627    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8628        /* If the string is already ASCII, just return the same string */
8629        Py_INCREF(unicode);
8630        return unicode;
8631    }
8632    return fixup(unicode, fix_decimal_and_space_to_ascii);
8633}
8634
8635PyObject *
8636PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8637                                  Py_ssize_t length)
8638{
8639    PyObject *decimal;
8640    Py_ssize_t i;
8641    Py_UCS4 maxchar;
8642    enum PyUnicode_Kind kind;
8643    void *data;
8644
8645    maxchar = 127;
8646    for (i = 0; i < length; i++) {
8647        Py_UNICODE ch = s[i];
8648        if (ch > 127) {
8649            int decimal = Py_UNICODE_TODECIMAL(ch);
8650            if (decimal >= 0)
8651                ch = '0' + decimal;
8652            maxchar = Py_MAX(maxchar, ch);
8653        }
8654    }
8655
8656    /* Copy to a new string */
8657    decimal = PyUnicode_New(length, maxchar);
8658    if (decimal == NULL)
8659        return decimal;
8660    kind = PyUnicode_KIND(decimal);
8661    data = PyUnicode_DATA(decimal);
8662    /* Iterate over code points */
8663    for (i = 0; i < length; i++) {
8664        Py_UNICODE ch = s[i];
8665        if (ch > 127) {
8666            int decimal = Py_UNICODE_TODECIMAL(ch);
8667            if (decimal >= 0)
8668                ch = '0' + decimal;
8669        }
8670        PyUnicode_WRITE(kind, data, i, ch);
8671    }
8672    return unicode_result(decimal);
8673}
8674/* --- Decimal Encoder ---------------------------------------------------- */
8675
8676int
8677PyUnicode_EncodeDecimal(Py_UNICODE *s,
8678                        Py_ssize_t length,
8679                        char *output,
8680                        const char *errors)
8681{
8682    PyObject *unicode;
8683    Py_ssize_t i;
8684    enum PyUnicode_Kind kind;
8685    void *data;
8686
8687    if (output == NULL) {
8688        PyErr_BadArgument();
8689        return -1;
8690    }
8691
8692    unicode = PyUnicode_FromUnicode(s, length);
8693    if (unicode == NULL)
8694        return -1;
8695
8696    if (PyUnicode_READY(unicode) == -1) {
8697        Py_DECREF(unicode);
8698        return -1;
8699    }
8700    kind = PyUnicode_KIND(unicode);
8701    data = PyUnicode_DATA(unicode);
8702
8703    for (i=0; i < length; ) {
8704        PyObject *exc;
8705        Py_UCS4 ch;
8706        int decimal;
8707        Py_ssize_t startpos;
8708
8709        ch = PyUnicode_READ(kind, data, i);
8710
8711        if (Py_UNICODE_ISSPACE(ch)) {
8712            *output++ = ' ';
8713            i++;
8714            continue;
8715        }
8716        decimal = Py_UNICODE_TODECIMAL(ch);
8717        if (decimal >= 0) {
8718            *output++ = '0' + decimal;
8719            i++;
8720            continue;
8721        }
8722        if (0 < ch && ch < 256) {
8723            *output++ = (char)ch;
8724            i++;
8725            continue;
8726        }
8727
8728        startpos = i;
8729        exc = NULL;
8730        raise_encode_exception(&exc, "decimal", unicode,
8731                               startpos, startpos+1,
8732                               "invalid decimal Unicode string");
8733        Py_XDECREF(exc);
8734        Py_DECREF(unicode);
8735        return -1;
8736    }
8737    /* 0-terminate the output string */
8738    *output++ = '\0';
8739    Py_DECREF(unicode);
8740    return 0;
8741}
8742
8743/* --- Helpers ------------------------------------------------------------ */
8744
8745static Py_ssize_t
8746any_find_slice(int direction, PyObject* s1, PyObject* s2,
8747               Py_ssize_t start,
8748               Py_ssize_t end)
8749{
8750    int kind1, kind2, kind;
8751    void *buf1, *buf2;
8752    Py_ssize_t len1, len2, result;
8753
8754    kind1 = PyUnicode_KIND(s1);
8755    kind2 = PyUnicode_KIND(s2);
8756    kind = kind1 > kind2 ? kind1 : kind2;
8757    buf1 = PyUnicode_DATA(s1);
8758    buf2 = PyUnicode_DATA(s2);
8759    if (kind1 != kind)
8760        buf1 = _PyUnicode_AsKind(s1, kind);
8761    if (!buf1)
8762        return -2;
8763    if (kind2 != kind)
8764        buf2 = _PyUnicode_AsKind(s2, kind);
8765    if (!buf2) {
8766        if (kind1 != kind) PyMem_Free(buf1);
8767        return -2;
8768    }
8769    len1 = PyUnicode_GET_LENGTH(s1);
8770    len2 = PyUnicode_GET_LENGTH(s2);
8771
8772    if (direction > 0) {
8773        switch (kind) {
8774        case PyUnicode_1BYTE_KIND:
8775            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8776                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
8777            else
8778                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
8779            break;
8780        case PyUnicode_2BYTE_KIND:
8781            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
8782            break;
8783        case PyUnicode_4BYTE_KIND:
8784            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
8785            break;
8786        default:
8787            assert(0); result = -2;
8788        }
8789    }
8790    else {
8791        switch (kind) {
8792        case PyUnicode_1BYTE_KIND:
8793            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
8794                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
8795            else
8796                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8797            break;
8798        case PyUnicode_2BYTE_KIND:
8799            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8800            break;
8801        case PyUnicode_4BYTE_KIND:
8802            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
8803            break;
8804        default:
8805            assert(0); result = -2;
8806        }
8807    }
8808
8809    if (kind1 != kind)
8810        PyMem_Free(buf1);
8811    if (kind2 != kind)
8812        PyMem_Free(buf2);
8813
8814    return result;
8815}
8816
8817Py_ssize_t
8818_PyUnicode_InsertThousandsGrouping(
8819    PyObject *unicode, Py_ssize_t index,
8820    Py_ssize_t n_buffer,
8821    void *digits, Py_ssize_t n_digits,
8822    Py_ssize_t min_width,
8823    const char *grouping, PyObject *thousands_sep,
8824    Py_UCS4 *maxchar)
8825{
8826    unsigned int kind, thousands_sep_kind;
8827    char *data, *thousands_sep_data;
8828    Py_ssize_t thousands_sep_len;
8829    Py_ssize_t len;
8830
8831    if (unicode != NULL) {
8832        kind = PyUnicode_KIND(unicode);
8833        data = (char *) PyUnicode_DATA(unicode) + index * kind;
8834    }
8835    else {
8836        kind = PyUnicode_1BYTE_KIND;
8837        data = NULL;
8838    }
8839    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
8840    thousands_sep_data = PyUnicode_DATA(thousands_sep);
8841    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
8842    if (unicode != NULL && thousands_sep_kind != kind) {
8843        if (thousands_sep_kind < kind) {
8844            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
8845            if (!thousands_sep_data)
8846                return -1;
8847        }
8848        else {
8849            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
8850            if (!data)
8851                return -1;
8852        }
8853    }
8854
8855    switch (kind) {
8856    case PyUnicode_1BYTE_KIND:
8857        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
8858            len = asciilib_InsertThousandsGrouping(
8859                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
8860                min_width, grouping,
8861                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8862        else
8863            len = ucs1lib_InsertThousandsGrouping(
8864                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
8865                min_width, grouping,
8866                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
8867        break;
8868    case PyUnicode_2BYTE_KIND:
8869        len = ucs2lib_InsertThousandsGrouping(
8870            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
8871            min_width, grouping,
8872            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
8873        break;
8874    case PyUnicode_4BYTE_KIND:
8875        len = ucs4lib_InsertThousandsGrouping(
8876            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
8877            min_width, grouping,
8878            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
8879        break;
8880    default:
8881        assert(0);
8882        return -1;
8883    }
8884    if (unicode != NULL && thousands_sep_kind != kind) {
8885        if (thousands_sep_kind < kind)
8886            PyMem_Free(thousands_sep_data);
8887        else
8888            PyMem_Free(data);
8889    }
8890    if (unicode == NULL) {
8891        *maxchar = 127;
8892        if (len != n_digits) {
8893            *maxchar = Py_MAX(*maxchar,
8894                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
8895        }
8896    }
8897    return len;
8898}
8899
8900
/* Helper macro to fix up start/end slice values against a length.

   end is clamped to len when larger; a negative end or start has len
   added to it and is then clamped to 0 if still negative.  start is not
   clamped against len or end.  start and end must be modifiable lvalues;
   every argument may be evaluated more than once.

   The body is wrapped in do { } while (0) so that the macro expands to a
   single statement and must be followed by a semicolon; this keeps it
   safe as the unbraced body of an if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
8915
8916Py_ssize_t
8917PyUnicode_Count(PyObject *str,
8918                PyObject *substr,
8919                Py_ssize_t start,
8920                Py_ssize_t end)
8921{
8922    Py_ssize_t result;
8923    PyObject* str_obj;
8924    PyObject* sub_obj;
8925    int kind1, kind2, kind;
8926    void *buf1 = NULL, *buf2 = NULL;
8927    Py_ssize_t len1, len2;
8928
8929    str_obj = PyUnicode_FromObject(str);
8930    if (!str_obj)
8931        return -1;
8932    sub_obj = PyUnicode_FromObject(substr);
8933    if (!sub_obj) {
8934        Py_DECREF(str_obj);
8935        return -1;
8936    }
8937    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
8938        Py_DECREF(sub_obj);
8939        Py_DECREF(str_obj);
8940        return -1;
8941    }
8942
8943    kind1 = PyUnicode_KIND(str_obj);
8944    kind2 = PyUnicode_KIND(sub_obj);
8945    kind = kind1;
8946    buf1 = PyUnicode_DATA(str_obj);
8947    buf2 = PyUnicode_DATA(sub_obj);
8948    if (kind2 != kind) {
8949        if (kind2 > kind) {
8950            Py_DECREF(sub_obj);
8951            Py_DECREF(str_obj);
8952            return 0;
8953        }
8954        buf2 = _PyUnicode_AsKind(sub_obj, kind);
8955    }
8956    if (!buf2)
8957        goto onError;
8958    len1 = PyUnicode_GET_LENGTH(str_obj);
8959    len2 = PyUnicode_GET_LENGTH(sub_obj);
8960
8961    ADJUST_INDICES(start, end, len1);
8962    switch (kind) {
8963    case PyUnicode_1BYTE_KIND:
8964        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
8965            result = asciilib_count(
8966                ((Py_UCS1*)buf1) + start, end - start,
8967                buf2, len2, PY_SSIZE_T_MAX
8968                );
8969        else
8970            result = ucs1lib_count(
8971                ((Py_UCS1*)buf1) + start, end - start,
8972                buf2, len2, PY_SSIZE_T_MAX
8973                );
8974        break;
8975    case PyUnicode_2BYTE_KIND:
8976        result = ucs2lib_count(
8977            ((Py_UCS2*)buf1) + start, end - start,
8978            buf2, len2, PY_SSIZE_T_MAX
8979            );
8980        break;
8981    case PyUnicode_4BYTE_KIND:
8982        result = ucs4lib_count(
8983            ((Py_UCS4*)buf1) + start, end - start,
8984            buf2, len2, PY_SSIZE_T_MAX
8985            );
8986        break;
8987    default:
8988        assert(0); result = 0;
8989    }
8990
8991    Py_DECREF(sub_obj);
8992    Py_DECREF(str_obj);
8993
8994    if (kind2 != kind)
8995        PyMem_Free(buf2);
8996
8997    return result;
8998  onError:
8999    Py_DECREF(sub_obj);
9000    Py_DECREF(str_obj);
9001    if (kind2 != kind && buf2)
9002        PyMem_Free(buf2);
9003    return -1;
9004}
9005
9006Py_ssize_t
9007PyUnicode_Find(PyObject *str,
9008               PyObject *sub,
9009               Py_ssize_t start,
9010               Py_ssize_t end,
9011               int direction)
9012{
9013    Py_ssize_t result;
9014
9015    str = PyUnicode_FromObject(str);
9016    if (!str)
9017        return -2;
9018    sub = PyUnicode_FromObject(sub);
9019    if (!sub) {
9020        Py_DECREF(str);
9021        return -2;
9022    }
9023    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9024        Py_DECREF(sub);
9025        Py_DECREF(str);
9026        return -2;
9027    }
9028
9029    result = any_find_slice(direction,
9030        str, sub, start, end
9031        );
9032
9033    Py_DECREF(str);
9034    Py_DECREF(sub);
9035
9036    return result;
9037}
9038
9039Py_ssize_t
9040PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9041                   Py_ssize_t start, Py_ssize_t end,
9042                   int direction)
9043{
9044    int kind;
9045    Py_ssize_t result;
9046    if (PyUnicode_READY(str) == -1)
9047        return -2;
9048    if (start < 0 || end < 0) {
9049        PyErr_SetString(PyExc_IndexError, "string index out of range");
9050        return -2;
9051    }
9052    if (end > PyUnicode_GET_LENGTH(str))
9053        end = PyUnicode_GET_LENGTH(str);
9054    kind = PyUnicode_KIND(str);
9055    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9056                      kind, end-start, ch, direction);
9057    if (result == -1)
9058        return -1;
9059    else
9060        return start + result;
9061}
9062
9063static int
9064tailmatch(PyObject *self,
9065          PyObject *substring,
9066          Py_ssize_t start,
9067          Py_ssize_t end,
9068          int direction)
9069{
9070    int kind_self;
9071    int kind_sub;
9072    void *data_self;
9073    void *data_sub;
9074    Py_ssize_t offset;
9075    Py_ssize_t i;
9076    Py_ssize_t end_sub;
9077
9078    if (PyUnicode_READY(self) == -1 ||
9079        PyUnicode_READY(substring) == -1)
9080        return -1;
9081
9082    if (PyUnicode_GET_LENGTH(substring) == 0)
9083        return 1;
9084
9085    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9086    end -= PyUnicode_GET_LENGTH(substring);
9087    if (end < start)
9088        return 0;
9089
9090    kind_self = PyUnicode_KIND(self);
9091    data_self = PyUnicode_DATA(self);
9092    kind_sub = PyUnicode_KIND(substring);
9093    data_sub = PyUnicode_DATA(substring);
9094    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9095
9096    if (direction > 0)
9097        offset = end;
9098    else
9099        offset = start;
9100
9101    if (PyUnicode_READ(kind_self, data_self, offset) ==
9102        PyUnicode_READ(kind_sub, data_sub, 0) &&
9103        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9104        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9105        /* If both are of the same kind, memcmp is sufficient */
9106        if (kind_self == kind_sub) {
9107            return ! memcmp((char *)data_self +
9108                                (offset * PyUnicode_KIND(substring)),
9109                            data_sub,
9110                            PyUnicode_GET_LENGTH(substring) *
9111                                PyUnicode_KIND(substring));
9112        }
9113        /* otherwise we have to compare each character by first accesing it */
9114        else {
9115            /* We do not need to compare 0 and len(substring)-1 because
9116               the if statement above ensured already that they are equal
9117               when we end up here. */
9118            for (i = 1; i < end_sub; ++i) {
9119                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9120                    PyUnicode_READ(kind_sub, data_sub, i))
9121                    return 0;
9122            }
9123            return 1;
9124        }
9125    }
9126
9127    return 0;
9128}
9129
9130Py_ssize_t
9131PyUnicode_Tailmatch(PyObject *str,
9132                    PyObject *substr,
9133                    Py_ssize_t start,
9134                    Py_ssize_t end,
9135                    int direction)
9136{
9137    Py_ssize_t result;
9138
9139    str = PyUnicode_FromObject(str);
9140    if (str == NULL)
9141        return -1;
9142    substr = PyUnicode_FromObject(substr);
9143    if (substr == NULL) {
9144        Py_DECREF(str);
9145        return -1;
9146    }
9147
9148    result = tailmatch(str, substr,
9149                       start, end, direction);
9150    Py_DECREF(str);
9151    Py_DECREF(substr);
9152    return result;
9153}
9154
9155/* Apply fixfct filter to the Unicode object self and return a
9156   reference to the modified object */
9157
9158static PyObject *
9159fixup(PyObject *self,
9160      Py_UCS4 (*fixfct)(PyObject *s))
9161{
9162    PyObject *u;
9163    Py_UCS4 maxchar_old, maxchar_new = 0;
9164    PyObject *v;
9165
9166    u = _PyUnicode_Copy(self);
9167    if (u == NULL)
9168        return NULL;
9169    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9170
9171    /* fix functions return the new maximum character in a string,
9172       if the kind of the resulting unicode object does not change,
9173       everything is fine.  Otherwise we need to change the string kind
9174       and re-run the fix function. */
9175    maxchar_new = fixfct(u);
9176
9177    if (maxchar_new == 0) {
9178        /* no changes */;
9179        if (PyUnicode_CheckExact(self)) {
9180            Py_DECREF(u);
9181            Py_INCREF(self);
9182            return self;
9183        }
9184        else
9185            return u;
9186    }
9187
9188    maxchar_new = align_maxchar(maxchar_new);
9189
9190    if (maxchar_new == maxchar_old)
9191        return u;
9192
9193    /* In case the maximum character changed, we need to
9194       convert the string to the new category. */
9195    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9196    if (v == NULL) {
9197        Py_DECREF(u);
9198        return NULL;
9199    }
9200    if (maxchar_new > maxchar_old) {
9201        /* If the maxchar increased so that the kind changed, not all
9202           characters are representable anymore and we need to fix the
9203           string again. This only happens in very few cases. */
9204        _PyUnicode_FastCopyCharacters(v, 0,
9205                                      self, 0, PyUnicode_GET_LENGTH(self));
9206        maxchar_old = fixfct(v);
9207        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9208    }
9209    else {
9210        _PyUnicode_FastCopyCharacters(v, 0,
9211                                      u, 0, PyUnicode_GET_LENGTH(self));
9212    }
9213    Py_DECREF(u);
9214    assert(_PyUnicode_CheckConsistency(v, 1));
9215    return v;
9216}
9217
9218static PyObject *
9219ascii_upper_or_lower(PyObject *self, int lower)
9220{
9221    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9222    char *resdata, *data = PyUnicode_DATA(self);
9223    PyObject *res;
9224
9225    res = PyUnicode_New(len, 127);
9226    if (res == NULL)
9227        return NULL;
9228    resdata = PyUnicode_DATA(res);
9229    if (lower)
9230        _Py_bytes_lower(resdata, data, len);
9231    else
9232        _Py_bytes_upper(resdata, data, len);
9233    return res;
9234}
9235
9236static Py_UCS4
9237handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9238{
9239    Py_ssize_t j;
9240    int final_sigma;
9241    Py_UCS4 c;
9242    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9243
9244     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9245
9246    where ! is a negation and \p{xxx} is a character with property xxx.
9247    */
9248    for (j = i - 1; j >= 0; j--) {
9249        c = PyUnicode_READ(kind, data, j);
9250        if (!_PyUnicode_IsCaseIgnorable(c))
9251            break;
9252    }
9253    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9254    if (final_sigma) {
9255        for (j = i + 1; j < length; j++) {
9256            c = PyUnicode_READ(kind, data, j);
9257            if (!_PyUnicode_IsCaseIgnorable(c))
9258                break;
9259        }
9260        final_sigma = j == length || !_PyUnicode_IsCased(c);
9261    }
9262    return (final_sigma) ? 0x3C2 : 0x3C3;
9263}
9264
9265static int
9266lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9267           Py_UCS4 c, Py_UCS4 *mapped)
9268{
9269    /* Obscure special case. */
9270    if (c == 0x3A3) {
9271        mapped[0] = handle_capital_sigma(kind, data, length, i);
9272        return 1;
9273    }
9274    return _PyUnicode_ToLowerFull(c, mapped);
9275}
9276
9277static Py_ssize_t
9278do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9279{
9280    Py_ssize_t i, k = 0;
9281    int n_res, j;
9282    Py_UCS4 c, mapped[3];
9283
9284    c = PyUnicode_READ(kind, data, 0);
9285    n_res = _PyUnicode_ToUpperFull(c, mapped);
9286    for (j = 0; j < n_res; j++) {
9287        *maxchar = Py_MAX(*maxchar, mapped[j]);
9288        res[k++] = mapped[j];
9289    }
9290    for (i = 1; i < length; i++) {
9291        c = PyUnicode_READ(kind, data, i);
9292        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9293        for (j = 0; j < n_res; j++) {
9294            *maxchar = Py_MAX(*maxchar, mapped[j]);
9295            res[k++] = mapped[j];
9296        }
9297    }
9298    return k;
9299}
9300
9301static Py_ssize_t
9302do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9303    Py_ssize_t i, k = 0;
9304
9305    for (i = 0; i < length; i++) {
9306        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9307        int n_res, j;
9308        if (Py_UNICODE_ISUPPER(c)) {
9309            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9310        }
9311        else if (Py_UNICODE_ISLOWER(c)) {
9312            n_res = _PyUnicode_ToUpperFull(c, mapped);
9313        }
9314        else {
9315            n_res = 1;
9316            mapped[0] = c;
9317        }
9318        for (j = 0; j < n_res; j++) {
9319            *maxchar = Py_MAX(*maxchar, mapped[j]);
9320            res[k++] = mapped[j];
9321        }
9322    }
9323    return k;
9324}
9325
9326static Py_ssize_t
9327do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9328                  Py_UCS4 *maxchar, int lower)
9329{
9330    Py_ssize_t i, k = 0;
9331
9332    for (i = 0; i < length; i++) {
9333        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9334        int n_res, j;
9335        if (lower)
9336            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9337        else
9338            n_res = _PyUnicode_ToUpperFull(c, mapped);
9339        for (j = 0; j < n_res; j++) {
9340            *maxchar = Py_MAX(*maxchar, mapped[j]);
9341            res[k++] = mapped[j];
9342        }
9343    }
9344    return k;
9345}
9346
9347static Py_ssize_t
9348do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9349{
9350    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9351}
9352
9353static Py_ssize_t
9354do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9355{
9356    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9357}
9358
9359static Py_ssize_t
9360do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9361{
9362    Py_ssize_t i, k = 0;
9363
9364    for (i = 0; i < length; i++) {
9365        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9366        Py_UCS4 mapped[3];
9367        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9368        for (j = 0; j < n_res; j++) {
9369            *maxchar = Py_MAX(*maxchar, mapped[j]);
9370            res[k++] = mapped[j];
9371        }
9372    }
9373    return k;
9374}
9375
9376static Py_ssize_t
9377do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9378{
9379    Py_ssize_t i, k = 0;
9380    int previous_is_cased;
9381
9382    previous_is_cased = 0;
9383    for (i = 0; i < length; i++) {
9384        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9385        Py_UCS4 mapped[3];
9386        int n_res, j;
9387
9388        if (previous_is_cased)
9389            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9390        else
9391            n_res = _PyUnicode_ToTitleFull(c, mapped);
9392
9393        for (j = 0; j < n_res; j++) {
9394            *maxchar = Py_MAX(*maxchar, mapped[j]);
9395            res[k++] = mapped[j];
9396        }
9397
9398        previous_is_cased = _PyUnicode_IsCased(c);
9399    }
9400    return k;
9401}
9402
9403static PyObject *
9404case_operation(PyObject *self,
9405               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9406{
9407    PyObject *res = NULL;
9408    Py_ssize_t length, newlength = 0;
9409    int kind, outkind;
9410    void *data, *outdata;
9411    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9412
9413    assert(PyUnicode_IS_READY(self));
9414
9415    kind = PyUnicode_KIND(self);
9416    data = PyUnicode_DATA(self);
9417    length = PyUnicode_GET_LENGTH(self);
9418    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9419    if (tmp == NULL)
9420        return PyErr_NoMemory();
9421    newlength = perform(kind, data, length, tmp, &maxchar);
9422    res = PyUnicode_New(newlength, maxchar);
9423    if (res == NULL)
9424        goto leave;
9425    tmpend = tmp + newlength;
9426    outdata = PyUnicode_DATA(res);
9427    outkind = PyUnicode_KIND(res);
9428    switch (outkind) {
9429    case PyUnicode_1BYTE_KIND:
9430        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9431        break;
9432    case PyUnicode_2BYTE_KIND:
9433        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9434        break;
9435    case PyUnicode_4BYTE_KIND:
9436        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9437        break;
9438    default:
9439        assert(0);
9440        break;
9441    }
9442  leave:
9443    PyMem_FREE(tmp);
9444    return res;
9445}
9446
9447PyObject *
9448PyUnicode_Join(PyObject *separator, PyObject *seq)
9449{
9450    PyObject *sep = NULL;
9451    Py_ssize_t seplen;
9452    PyObject *res = NULL; /* the result */
9453    PyObject *fseq;          /* PySequence_Fast(seq) */
9454    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9455    PyObject **items;
9456    PyObject *item;
9457    Py_ssize_t sz, i, res_offset;
9458    Py_UCS4 maxchar;
9459    Py_UCS4 item_maxchar;
9460    int use_memcpy;
9461    unsigned char *res_data = NULL, *sep_data = NULL;
9462    PyObject *last_obj;
9463    unsigned int kind = 0;
9464
9465    fseq = PySequence_Fast(seq, "");
9466    if (fseq == NULL) {
9467        return NULL;
9468    }
9469
9470    /* NOTE: the following code can't call back into Python code,
9471     * so we are sure that fseq won't be mutated.
9472     */
9473
9474    seqlen = PySequence_Fast_GET_SIZE(fseq);
9475    /* If empty sequence, return u"". */
9476    if (seqlen == 0) {
9477        Py_DECREF(fseq);
9478        _Py_RETURN_UNICODE_EMPTY();
9479    }
9480
9481    /* If singleton sequence with an exact Unicode, return that. */
9482    last_obj = NULL;
9483    items = PySequence_Fast_ITEMS(fseq);
9484    if (seqlen == 1) {
9485        if (PyUnicode_CheckExact(items[0])) {
9486            res = items[0];
9487            Py_INCREF(res);
9488            Py_DECREF(fseq);
9489            return res;
9490        }
9491        seplen = 0;
9492        maxchar = 0;
9493    }
9494    else {
9495        /* Set up sep and seplen */
9496        if (separator == NULL) {
9497            /* fall back to a blank space separator */
9498            sep = PyUnicode_FromOrdinal(' ');
9499            if (!sep)
9500                goto onError;
9501            seplen = 1;
9502            maxchar = 32;
9503        }
9504        else {
9505            if (!PyUnicode_Check(separator)) {
9506                PyErr_Format(PyExc_TypeError,
9507                             "separator: expected str instance,"
9508                             " %.80s found",
9509                             Py_TYPE(separator)->tp_name);
9510                goto onError;
9511            }
9512            if (PyUnicode_READY(separator))
9513                goto onError;
9514            sep = separator;
9515            seplen = PyUnicode_GET_LENGTH(separator);
9516            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9517            /* inc refcount to keep this code path symmetric with the
9518               above case of a blank separator */
9519            Py_INCREF(sep);
9520        }
9521        last_obj = sep;
9522    }
9523
9524    /* There are at least two things to join, or else we have a subclass
9525     * of str in the sequence.
9526     * Do a pre-pass to figure out the total amount of space we'll
9527     * need (sz), and see whether all argument are strings.
9528     */
9529    sz = 0;
9530#ifdef Py_DEBUG
9531    use_memcpy = 0;
9532#else
9533    use_memcpy = 1;
9534#endif
9535    for (i = 0; i < seqlen; i++) {
9536        const Py_ssize_t old_sz = sz;
9537        item = items[i];
9538        if (!PyUnicode_Check(item)) {
9539            PyErr_Format(PyExc_TypeError,
9540                         "sequence item %zd: expected str instance,"
9541                         " %.80s found",
9542                         i, Py_TYPE(item)->tp_name);
9543            goto onError;
9544        }
9545        if (PyUnicode_READY(item) == -1)
9546            goto onError;
9547        sz += PyUnicode_GET_LENGTH(item);
9548        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9549        maxchar = Py_MAX(maxchar, item_maxchar);
9550        if (i != 0)
9551            sz += seplen;
9552        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9553            PyErr_SetString(PyExc_OverflowError,
9554                            "join() result is too long for a Python string");
9555            goto onError;
9556        }
9557        if (use_memcpy && last_obj != NULL) {
9558            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9559                use_memcpy = 0;
9560        }
9561        last_obj = item;
9562    }
9563
9564    res = PyUnicode_New(sz, maxchar);
9565    if (res == NULL)
9566        goto onError;
9567
9568    /* Catenate everything. */
9569#ifdef Py_DEBUG
9570    use_memcpy = 0;
9571#else
9572    if (use_memcpy) {
9573        res_data = PyUnicode_1BYTE_DATA(res);
9574        kind = PyUnicode_KIND(res);
9575        if (seplen != 0)
9576            sep_data = PyUnicode_1BYTE_DATA(sep);
9577    }
9578#endif
9579    if (use_memcpy) {
9580        for (i = 0; i < seqlen; ++i) {
9581            Py_ssize_t itemlen;
9582            item = items[i];
9583
9584            /* Copy item, and maybe the separator. */
9585            if (i && seplen != 0) {
9586                Py_MEMCPY(res_data,
9587                          sep_data,
9588                          kind * seplen);
9589                res_data += kind * seplen;
9590            }
9591
9592            itemlen = PyUnicode_GET_LENGTH(item);
9593            if (itemlen != 0) {
9594                Py_MEMCPY(res_data,
9595                          PyUnicode_DATA(item),
9596                          kind * itemlen);
9597                res_data += kind * itemlen;
9598            }
9599        }
9600        assert(res_data == PyUnicode_1BYTE_DATA(res)
9601                           + kind * PyUnicode_GET_LENGTH(res));
9602    }
9603    else {
9604        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9605            Py_ssize_t itemlen;
9606            item = items[i];
9607
9608            /* Copy item, and maybe the separator. */
9609            if (i && seplen != 0) {
9610                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9611                res_offset += seplen;
9612            }
9613
9614            itemlen = PyUnicode_GET_LENGTH(item);
9615            if (itemlen != 0) {
9616                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9617                res_offset += itemlen;
9618            }
9619        }
9620        assert(res_offset == PyUnicode_GET_LENGTH(res));
9621    }
9622
9623    Py_DECREF(fseq);
9624    Py_XDECREF(sep);
9625    assert(_PyUnicode_CheckConsistency(res, 1));
9626    return res;
9627
9628  onError:
9629    Py_DECREF(fseq);
9630    Py_XDECREF(sep);
9631    Py_XDECREF(res);
9632    return NULL;
9633}
9634
/* FILL(kind, data, value, start, length)

   Store `value` into `length` consecutive code units of the buffer `data`,
   beginning at code-unit index `start`.  `kind` selects the code-unit
   width (PyUnicode_1BYTE_KIND, PyUnicode_2BYTE_KIND or
   PyUnicode_4BYTE_KIND); PyUnicode_WCHAR_KIND and any other value trip an
   assertion.

   `value` and `length` are evaluated exactly once, and every argument is
   parenthesized, so arbitrary expressions may be passed safely. */
#define FILL(kind, data, value, start, length) \
    do { \
        const Py_UCS4 fill_val_ = (value); \
        const Py_ssize_t fill_len_ = (length); \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)fill_val_, fill_len_); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_val_; \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < fill_len_; ++i_, ++to_) *to_ = fill_val_; \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9658
9659void
9660_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9661                    Py_UCS4 fill_char)
9662{
9663    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9664    const void *data = PyUnicode_DATA(unicode);
9665    assert(PyUnicode_IS_READY(unicode));
9666    assert(unicode_modifiable(unicode));
9667    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9668    assert(start >= 0);
9669    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9670    FILL(kind, data, fill_char, start, length);
9671}
9672
9673Py_ssize_t
9674PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9675               Py_UCS4 fill_char)
9676{
9677    Py_ssize_t maxlen;
9678
9679    if (!PyUnicode_Check(unicode)) {
9680        PyErr_BadInternalCall();
9681        return -1;
9682    }
9683    if (PyUnicode_READY(unicode) == -1)
9684        return -1;
9685    if (unicode_check_modifiable(unicode))
9686        return -1;
9687
9688    if (start < 0) {
9689        PyErr_SetString(PyExc_IndexError, "string index out of range");
9690        return -1;
9691    }
9692    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9693        PyErr_SetString(PyExc_ValueError,
9694                         "fill character is bigger than "
9695                         "the string maximum character");
9696        return -1;
9697    }
9698
9699    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9700    length = Py_MIN(maxlen, length);
9701    if (length <= 0)
9702        return 0;
9703
9704    _PyUnicode_FastFill(unicode, start, length, fill_char);
9705    return length;
9706}
9707
9708static PyObject *
9709pad(PyObject *self,
9710    Py_ssize_t left,
9711    Py_ssize_t right,
9712    Py_UCS4 fill)
9713{
9714    PyObject *u;
9715    Py_UCS4 maxchar;
9716    int kind;
9717    void *data;
9718
9719    if (left < 0)
9720        left = 0;
9721    if (right < 0)
9722        right = 0;
9723
9724    if (left == 0 && right == 0)
9725        return unicode_result_unchanged(self);
9726
9727    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9728        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9729        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9730        return NULL;
9731    }
9732    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9733    maxchar = Py_MAX(maxchar, fill);
9734    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9735    if (!u)
9736        return NULL;
9737
9738    kind = PyUnicode_KIND(u);
9739    data = PyUnicode_DATA(u);
9740    if (left)
9741        FILL(kind, data, fill, 0, left);
9742    if (right)
9743        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9744    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9745    assert(_PyUnicode_CheckConsistency(u, 1));
9746    return u;
9747}
9748
9749PyObject *
9750PyUnicode_Splitlines(PyObject *string, int keepends)
9751{
9752    PyObject *list;
9753
9754    string = PyUnicode_FromObject(string);
9755    if (string == NULL)
9756        return NULL;
9757    if (PyUnicode_READY(string) == -1) {
9758        Py_DECREF(string);
9759        return NULL;
9760    }
9761
9762    switch (PyUnicode_KIND(string)) {
9763    case PyUnicode_1BYTE_KIND:
9764        if (PyUnicode_IS_ASCII(string))
9765            list = asciilib_splitlines(
9766                string, PyUnicode_1BYTE_DATA(string),
9767                PyUnicode_GET_LENGTH(string), keepends);
9768        else
9769            list = ucs1lib_splitlines(
9770                string, PyUnicode_1BYTE_DATA(string),
9771                PyUnicode_GET_LENGTH(string), keepends);
9772        break;
9773    case PyUnicode_2BYTE_KIND:
9774        list = ucs2lib_splitlines(
9775            string, PyUnicode_2BYTE_DATA(string),
9776            PyUnicode_GET_LENGTH(string), keepends);
9777        break;
9778    case PyUnicode_4BYTE_KIND:
9779        list = ucs4lib_splitlines(
9780            string, PyUnicode_4BYTE_DATA(string),
9781            PyUnicode_GET_LENGTH(string), keepends);
9782        break;
9783    default:
9784        assert(0);
9785        list = 0;
9786    }
9787    Py_DECREF(string);
9788    return list;
9789}
9790
9791static PyObject *
9792split(PyObject *self,
9793      PyObject *substring,
9794      Py_ssize_t maxcount)
9795{
9796    int kind1, kind2, kind;
9797    void *buf1, *buf2;
9798    Py_ssize_t len1, len2;
9799    PyObject* out;
9800
9801    if (maxcount < 0)
9802        maxcount = PY_SSIZE_T_MAX;
9803
9804    if (PyUnicode_READY(self) == -1)
9805        return NULL;
9806
9807    if (substring == NULL)
9808        switch (PyUnicode_KIND(self)) {
9809        case PyUnicode_1BYTE_KIND:
9810            if (PyUnicode_IS_ASCII(self))
9811                return asciilib_split_whitespace(
9812                    self,  PyUnicode_1BYTE_DATA(self),
9813                    PyUnicode_GET_LENGTH(self), maxcount
9814                    );
9815            else
9816                return ucs1lib_split_whitespace(
9817                    self,  PyUnicode_1BYTE_DATA(self),
9818                    PyUnicode_GET_LENGTH(self), maxcount
9819                    );
9820        case PyUnicode_2BYTE_KIND:
9821            return ucs2lib_split_whitespace(
9822                self,  PyUnicode_2BYTE_DATA(self),
9823                PyUnicode_GET_LENGTH(self), maxcount
9824                );
9825        case PyUnicode_4BYTE_KIND:
9826            return ucs4lib_split_whitespace(
9827                self,  PyUnicode_4BYTE_DATA(self),
9828                PyUnicode_GET_LENGTH(self), maxcount
9829                );
9830        default:
9831            assert(0);
9832            return NULL;
9833        }
9834
9835    if (PyUnicode_READY(substring) == -1)
9836        return NULL;
9837
9838    kind1 = PyUnicode_KIND(self);
9839    kind2 = PyUnicode_KIND(substring);
9840    kind = kind1 > kind2 ? kind1 : kind2;
9841    buf1 = PyUnicode_DATA(self);
9842    buf2 = PyUnicode_DATA(substring);
9843    if (kind1 != kind)
9844        buf1 = _PyUnicode_AsKind(self, kind);
9845    if (!buf1)
9846        return NULL;
9847    if (kind2 != kind)
9848        buf2 = _PyUnicode_AsKind(substring, kind);
9849    if (!buf2) {
9850        if (kind1 != kind) PyMem_Free(buf1);
9851        return NULL;
9852    }
9853    len1 = PyUnicode_GET_LENGTH(self);
9854    len2 = PyUnicode_GET_LENGTH(substring);
9855
9856    switch (kind) {
9857    case PyUnicode_1BYTE_KIND:
9858        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9859            out = asciilib_split(
9860                self,  buf1, len1, buf2, len2, maxcount);
9861        else
9862            out = ucs1lib_split(
9863                self,  buf1, len1, buf2, len2, maxcount);
9864        break;
9865    case PyUnicode_2BYTE_KIND:
9866        out = ucs2lib_split(
9867            self,  buf1, len1, buf2, len2, maxcount);
9868        break;
9869    case PyUnicode_4BYTE_KIND:
9870        out = ucs4lib_split(
9871            self,  buf1, len1, buf2, len2, maxcount);
9872        break;
9873    default:
9874        out = NULL;
9875    }
9876    if (kind1 != kind)
9877        PyMem_Free(buf1);
9878    if (kind2 != kind)
9879        PyMem_Free(buf2);
9880    return out;
9881}
9882
9883static PyObject *
9884rsplit(PyObject *self,
9885       PyObject *substring,
9886       Py_ssize_t maxcount)
9887{
9888    int kind1, kind2, kind;
9889    void *buf1, *buf2;
9890    Py_ssize_t len1, len2;
9891    PyObject* out;
9892
9893    if (maxcount < 0)
9894        maxcount = PY_SSIZE_T_MAX;
9895
9896    if (PyUnicode_READY(self) == -1)
9897        return NULL;
9898
9899    if (substring == NULL)
9900        switch (PyUnicode_KIND(self)) {
9901        case PyUnicode_1BYTE_KIND:
9902            if (PyUnicode_IS_ASCII(self))
9903                return asciilib_rsplit_whitespace(
9904                    self,  PyUnicode_1BYTE_DATA(self),
9905                    PyUnicode_GET_LENGTH(self), maxcount
9906                    );
9907            else
9908                return ucs1lib_rsplit_whitespace(
9909                    self,  PyUnicode_1BYTE_DATA(self),
9910                    PyUnicode_GET_LENGTH(self), maxcount
9911                    );
9912        case PyUnicode_2BYTE_KIND:
9913            return ucs2lib_rsplit_whitespace(
9914                self,  PyUnicode_2BYTE_DATA(self),
9915                PyUnicode_GET_LENGTH(self), maxcount
9916                );
9917        case PyUnicode_4BYTE_KIND:
9918            return ucs4lib_rsplit_whitespace(
9919                self,  PyUnicode_4BYTE_DATA(self),
9920                PyUnicode_GET_LENGTH(self), maxcount
9921                );
9922        default:
9923            assert(0);
9924            return NULL;
9925        }
9926
9927    if (PyUnicode_READY(substring) == -1)
9928        return NULL;
9929
9930    kind1 = PyUnicode_KIND(self);
9931    kind2 = PyUnicode_KIND(substring);
9932    kind = kind1 > kind2 ? kind1 : kind2;
9933    buf1 = PyUnicode_DATA(self);
9934    buf2 = PyUnicode_DATA(substring);
9935    if (kind1 != kind)
9936        buf1 = _PyUnicode_AsKind(self, kind);
9937    if (!buf1)
9938        return NULL;
9939    if (kind2 != kind)
9940        buf2 = _PyUnicode_AsKind(substring, kind);
9941    if (!buf2) {
9942        if (kind1 != kind) PyMem_Free(buf1);
9943        return NULL;
9944    }
9945    len1 = PyUnicode_GET_LENGTH(self);
9946    len2 = PyUnicode_GET_LENGTH(substring);
9947
9948    switch (kind) {
9949    case PyUnicode_1BYTE_KIND:
9950        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
9951            out = asciilib_rsplit(
9952                self,  buf1, len1, buf2, len2, maxcount);
9953        else
9954            out = ucs1lib_rsplit(
9955                self,  buf1, len1, buf2, len2, maxcount);
9956        break;
9957    case PyUnicode_2BYTE_KIND:
9958        out = ucs2lib_rsplit(
9959            self,  buf1, len1, buf2, len2, maxcount);
9960        break;
9961    case PyUnicode_4BYTE_KIND:
9962        out = ucs4lib_rsplit(
9963            self,  buf1, len1, buf2, len2, maxcount);
9964        break;
9965    default:
9966        out = NULL;
9967    }
9968    if (kind1 != kind)
9969        PyMem_Free(buf1);
9970    if (kind2 != kind)
9971        PyMem_Free(buf2);
9972    return out;
9973}
9974
9975static Py_ssize_t
9976anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
9977            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
9978{
9979    switch (kind) {
9980    case PyUnicode_1BYTE_KIND:
9981        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
9982            return asciilib_find(buf1, len1, buf2, len2, offset);
9983        else
9984            return ucs1lib_find(buf1, len1, buf2, len2, offset);
9985    case PyUnicode_2BYTE_KIND:
9986        return ucs2lib_find(buf1, len1, buf2, len2, offset);
9987    case PyUnicode_4BYTE_KIND:
9988        return ucs4lib_find(buf1, len1, buf2, len2, offset);
9989    }
9990    assert(0);
9991    return -1;
9992}
9993
9994static Py_ssize_t
9995anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
9996             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
9997{
9998    switch (kind) {
9999    case PyUnicode_1BYTE_KIND:
10000        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10001            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10002        else
10003            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10004    case PyUnicode_2BYTE_KIND:
10005        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10006    case PyUnicode_4BYTE_KIND:
10007        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10008    }
10009    assert(0);
10010    return 0;
10011}
10012
10013static void
10014replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10015                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10016{
10017    int kind = PyUnicode_KIND(u);
10018    void *data = PyUnicode_DATA(u);
10019    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10020    if (kind == PyUnicode_1BYTE_KIND) {
10021        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10022                                      (Py_UCS1 *)data + len,
10023                                      u1, u2, maxcount);
10024    }
10025    else if (kind == PyUnicode_2BYTE_KIND) {
10026        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10027                                      (Py_UCS2 *)data + len,
10028                                      u1, u2, maxcount);
10029    }
10030    else {
10031        assert(kind == PyUnicode_4BYTE_KIND);
10032        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10033                                      (Py_UCS4 *)data + len,
10034                                      u1, u2, maxcount);
10035    }
10036}
10037
10038static PyObject *
10039replace(PyObject *self, PyObject *str1,
10040        PyObject *str2, Py_ssize_t maxcount)
10041{
10042    PyObject *u;
10043    char *sbuf = PyUnicode_DATA(self);
10044    char *buf1 = PyUnicode_DATA(str1);
10045    char *buf2 = PyUnicode_DATA(str2);
10046    int srelease = 0, release1 = 0, release2 = 0;
10047    int skind = PyUnicode_KIND(self);
10048    int kind1 = PyUnicode_KIND(str1);
10049    int kind2 = PyUnicode_KIND(str2);
10050    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10051    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10052    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10053    int mayshrink;
10054    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10055
10056    if (maxcount < 0)
10057        maxcount = PY_SSIZE_T_MAX;
10058    else if (maxcount == 0 || slen == 0)
10059        goto nothing;
10060
10061    if (str1 == str2)
10062        goto nothing;
10063
10064    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10065    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10066    if (maxchar < maxchar_str1)
10067        /* substring too wide to be present */
10068        goto nothing;
10069    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10070    /* Replacing str1 with str2 may cause a maxchar reduction in the
10071       result string. */
10072    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10073    maxchar = Py_MAX(maxchar, maxchar_str2);
10074
10075    if (len1 == len2) {
10076        /* same length */
10077        if (len1 == 0)
10078            goto nothing;
10079        if (len1 == 1) {
10080            /* replace characters */
10081            Py_UCS4 u1, u2;
10082            Py_ssize_t pos;
10083
10084            u1 = PyUnicode_READ(kind1, buf1, 0);
10085            pos = findchar(sbuf, skind, slen, u1, 1);
10086            if (pos < 0)
10087                goto nothing;
10088            u2 = PyUnicode_READ(kind2, buf2, 0);
10089            u = PyUnicode_New(slen, maxchar);
10090            if (!u)
10091                goto error;
10092
10093            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10094            replace_1char_inplace(u, pos, u1, u2, maxcount);
10095        }
10096        else {
10097            int rkind = skind;
10098            char *res;
10099            Py_ssize_t i;
10100
10101            if (kind1 < rkind) {
10102                /* widen substring */
10103                buf1 = _PyUnicode_AsKind(str1, rkind);
10104                if (!buf1) goto error;
10105                release1 = 1;
10106            }
10107            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10108            if (i < 0)
10109                goto nothing;
10110            if (rkind > kind2) {
10111                /* widen replacement */
10112                buf2 = _PyUnicode_AsKind(str2, rkind);
10113                if (!buf2) goto error;
10114                release2 = 1;
10115            }
10116            else if (rkind < kind2) {
10117                /* widen self and buf1 */
10118                rkind = kind2;
10119                if (release1) PyMem_Free(buf1);
10120                release1 = 0;
10121                sbuf = _PyUnicode_AsKind(self, rkind);
10122                if (!sbuf) goto error;
10123                srelease = 1;
10124                buf1 = _PyUnicode_AsKind(str1, rkind);
10125                if (!buf1) goto error;
10126                release1 = 1;
10127            }
10128            u = PyUnicode_New(slen, maxchar);
10129            if (!u)
10130                goto error;
10131            assert(PyUnicode_KIND(u) == rkind);
10132            res = PyUnicode_DATA(u);
10133
10134            memcpy(res, sbuf, rkind * slen);
10135            /* change everything in-place, starting with this one */
10136            memcpy(res + rkind * i,
10137                   buf2,
10138                   rkind * len2);
10139            i += len1;
10140
10141            while ( --maxcount > 0) {
10142                i = anylib_find(rkind, self,
10143                                sbuf+rkind*i, slen-i,
10144                                str1, buf1, len1, i);
10145                if (i == -1)
10146                    break;
10147                memcpy(res + rkind * i,
10148                       buf2,
10149                       rkind * len2);
10150                i += len1;
10151            }
10152        }
10153    }
10154    else {
10155        Py_ssize_t n, i, j, ires;
10156        Py_ssize_t new_size;
10157        int rkind = skind;
10158        char *res;
10159
10160        if (kind1 < rkind) {
10161            /* widen substring */
10162            buf1 = _PyUnicode_AsKind(str1, rkind);
10163            if (!buf1) goto error;
10164            release1 = 1;
10165        }
10166        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10167        if (n == 0)
10168            goto nothing;
10169        if (kind2 < rkind) {
10170            /* widen replacement */
10171            buf2 = _PyUnicode_AsKind(str2, rkind);
10172            if (!buf2) goto error;
10173            release2 = 1;
10174        }
10175        else if (kind2 > rkind) {
10176            /* widen self and buf1 */
10177            rkind = kind2;
10178            sbuf = _PyUnicode_AsKind(self, rkind);
10179            if (!sbuf) goto error;
10180            srelease = 1;
10181            if (release1) PyMem_Free(buf1);
10182            release1 = 0;
10183            buf1 = _PyUnicode_AsKind(str1, rkind);
10184            if (!buf1) goto error;
10185            release1 = 1;
10186        }
10187        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10188           PyUnicode_GET_LENGTH(str1))); */
10189        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10190                PyErr_SetString(PyExc_OverflowError,
10191                                "replace string is too long");
10192                goto error;
10193        }
10194        new_size = slen + n * (len2 - len1);
10195        if (new_size == 0) {
10196            _Py_INCREF_UNICODE_EMPTY();
10197            if (!unicode_empty)
10198                goto error;
10199            u = unicode_empty;
10200            goto done;
10201        }
10202        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10203            PyErr_SetString(PyExc_OverflowError,
10204                            "replace string is too long");
10205            goto error;
10206        }
10207        u = PyUnicode_New(new_size, maxchar);
10208        if (!u)
10209            goto error;
10210        assert(PyUnicode_KIND(u) == rkind);
10211        res = PyUnicode_DATA(u);
10212        ires = i = 0;
10213        if (len1 > 0) {
10214            while (n-- > 0) {
10215                /* look for next match */
10216                j = anylib_find(rkind, self,
10217                                sbuf + rkind * i, slen-i,
10218                                str1, buf1, len1, i);
10219                if (j == -1)
10220                    break;
10221                else if (j > i) {
10222                    /* copy unchanged part [i:j] */
10223                    memcpy(res + rkind * ires,
10224                           sbuf + rkind * i,
10225                           rkind * (j-i));
10226                    ires += j - i;
10227                }
10228                /* copy substitution string */
10229                if (len2 > 0) {
10230                    memcpy(res + rkind * ires,
10231                           buf2,
10232                           rkind * len2);
10233                    ires += len2;
10234                }
10235                i = j + len1;
10236            }
10237            if (i < slen)
10238                /* copy tail [i:] */
10239                memcpy(res + rkind * ires,
10240                       sbuf + rkind * i,
10241                       rkind * (slen-i));
10242        }
10243        else {
10244            /* interleave */
10245            while (n > 0) {
10246                memcpy(res + rkind * ires,
10247                       buf2,
10248                       rkind * len2);
10249                ires += len2;
10250                if (--n <= 0)
10251                    break;
10252                memcpy(res + rkind * ires,
10253                       sbuf + rkind * i,
10254                       rkind);
10255                ires++;
10256                i++;
10257            }
10258            memcpy(res + rkind * ires,
10259                   sbuf + rkind * i,
10260                   rkind * (slen-i));
10261        }
10262    }
10263
10264    if (mayshrink) {
10265        unicode_adjust_maxchar(&u);
10266        if (u == NULL)
10267            goto error;
10268    }
10269
10270  done:
10271    if (srelease)
10272        PyMem_FREE(sbuf);
10273    if (release1)
10274        PyMem_FREE(buf1);
10275    if (release2)
10276        PyMem_FREE(buf2);
10277    assert(_PyUnicode_CheckConsistency(u, 1));
10278    return u;
10279
10280  nothing:
10281    /* nothing to replace; return original string (when possible) */
10282    if (srelease)
10283        PyMem_FREE(sbuf);
10284    if (release1)
10285        PyMem_FREE(buf1);
10286    if (release2)
10287        PyMem_FREE(buf2);
10288    return unicode_result_unchanged(self);
10289
10290  error:
10291    if (srelease && sbuf)
10292        PyMem_FREE(sbuf);
10293    if (release1 && buf1)
10294        PyMem_FREE(buf1);
10295    if (release2 && buf2)
10296        PyMem_FREE(buf2);
10297    return NULL;
10298}
10299
10300/* --- Unicode Object Methods --------------------------------------------- */
10301
10302PyDoc_STRVAR(title__doc__,
10303             "S.title() -> str\n\
10304\n\
10305Return a titlecased version of S, i.e. words start with title case\n\
10306characters, all remaining cased characters have lower case.");
10307
10308static PyObject*
10309unicode_title(PyObject *self)
10310{
10311    if (PyUnicode_READY(self) == -1)
10312        return NULL;
10313    return case_operation(self, do_title);
10314}
10315
10316PyDoc_STRVAR(capitalize__doc__,
10317             "S.capitalize() -> str\n\
10318\n\
10319Return a capitalized version of S, i.e. make the first character\n\
10320have upper case and the rest lower case.");
10321
10322static PyObject*
10323unicode_capitalize(PyObject *self)
10324{
10325    if (PyUnicode_READY(self) == -1)
10326        return NULL;
10327    if (PyUnicode_GET_LENGTH(self) == 0)
10328        return unicode_result_unchanged(self);
10329    return case_operation(self, do_capitalize);
10330}
10331
10332PyDoc_STRVAR(casefold__doc__,
10333             "S.casefold() -> str\n\
10334\n\
10335Return a version of S suitable for caseless comparisons.");
10336
10337static PyObject *
10338unicode_casefold(PyObject *self)
10339{
10340    if (PyUnicode_READY(self) == -1)
10341        return NULL;
10342    if (PyUnicode_IS_ASCII(self))
10343        return ascii_upper_or_lower(self, 1);
10344    return case_operation(self, do_casefold);
10345}
10346
10347
10348/* Argument converter.  Coerces to a single unicode character */
10349
10350static int
10351convert_uc(PyObject *obj, void *addr)
10352{
10353    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10354    PyObject *uniobj;
10355
10356    uniobj = PyUnicode_FromObject(obj);
10357    if (uniobj == NULL) {
10358        PyErr_SetString(PyExc_TypeError,
10359                        "The fill character cannot be converted to Unicode");
10360        return 0;
10361    }
10362    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10363        PyErr_SetString(PyExc_TypeError,
10364                        "The fill character must be exactly one character long");
10365        Py_DECREF(uniobj);
10366        return 0;
10367    }
10368    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10369    Py_DECREF(uniobj);
10370    return 1;
10371}
10372
10373PyDoc_STRVAR(center__doc__,
10374             "S.center(width[, fillchar]) -> str\n\
10375\n\
10376Return S centered in a string of length width. Padding is\n\
10377done using the specified fill character (default is a space)");
10378
10379static PyObject *
10380unicode_center(PyObject *self, PyObject *args)
10381{
10382    Py_ssize_t marg, left;
10383    Py_ssize_t width;
10384    Py_UCS4 fillchar = ' ';
10385
10386    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10387        return NULL;
10388
10389    if (PyUnicode_READY(self) == -1)
10390        return NULL;
10391
10392    if (PyUnicode_GET_LENGTH(self) >= width)
10393        return unicode_result_unchanged(self);
10394
10395    marg = width - PyUnicode_GET_LENGTH(self);
10396    left = marg / 2 + (marg & width & 1);
10397
10398    return pad(self, left, marg - left, fillchar);
10399}
10400
10401/* This function assumes that str1 and str2 are readied by the caller. */
10402
10403static int
10404unicode_compare(PyObject *str1, PyObject *str2)
10405{
10406#define COMPARE(TYPE1, TYPE2) \
10407    do { \
10408        TYPE1* p1 = (TYPE1 *)data1; \
10409        TYPE2* p2 = (TYPE2 *)data2; \
10410        TYPE1* end = p1 + len; \
10411        Py_UCS4 c1, c2; \
10412        for (; p1 != end; p1++, p2++) { \
10413            c1 = *p1; \
10414            c2 = *p2; \
10415            if (c1 != c2) \
10416                return (c1 < c2) ? -1 : 1; \
10417        } \
10418    } \
10419    while (0)
10420
10421    int kind1, kind2;
10422    void *data1, *data2;
10423    Py_ssize_t len1, len2, len;
10424
10425    /* a string is equal to itself */
10426    if (str1 == str2)
10427        return 0;
10428
10429    kind1 = PyUnicode_KIND(str1);
10430    kind2 = PyUnicode_KIND(str2);
10431    data1 = PyUnicode_DATA(str1);
10432    data2 = PyUnicode_DATA(str2);
10433    len1 = PyUnicode_GET_LENGTH(str1);
10434    len2 = PyUnicode_GET_LENGTH(str2);
10435    len = Py_MIN(len1, len2);
10436
10437    switch(kind1) {
10438    case PyUnicode_1BYTE_KIND:
10439    {
10440        switch(kind2) {
10441        case PyUnicode_1BYTE_KIND:
10442        {
10443            int cmp = memcmp(data1, data2, len);
10444            /* normalize result of memcmp() into the range [-1; 1] */
10445            if (cmp < 0)
10446                return -1;
10447            if (cmp > 0)
10448                return 1;
10449            break;
10450        }
10451        case PyUnicode_2BYTE_KIND:
10452            COMPARE(Py_UCS1, Py_UCS2);
10453            break;
10454        case PyUnicode_4BYTE_KIND:
10455            COMPARE(Py_UCS1, Py_UCS4);
10456            break;
10457        default:
10458            assert(0);
10459        }
10460        break;
10461    }
10462    case PyUnicode_2BYTE_KIND:
10463    {
10464        switch(kind2) {
10465        case PyUnicode_1BYTE_KIND:
10466            COMPARE(Py_UCS2, Py_UCS1);
10467            break;
10468        case PyUnicode_2BYTE_KIND:
10469        {
10470            COMPARE(Py_UCS2, Py_UCS2);
10471            break;
10472        }
10473        case PyUnicode_4BYTE_KIND:
10474            COMPARE(Py_UCS2, Py_UCS4);
10475            break;
10476        default:
10477            assert(0);
10478        }
10479        break;
10480    }
10481    case PyUnicode_4BYTE_KIND:
10482    {
10483        switch(kind2) {
10484        case PyUnicode_1BYTE_KIND:
10485            COMPARE(Py_UCS4, Py_UCS1);
10486            break;
10487        case PyUnicode_2BYTE_KIND:
10488            COMPARE(Py_UCS4, Py_UCS2);
10489            break;
10490        case PyUnicode_4BYTE_KIND:
10491        {
10492#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10493            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10494            /* normalize result of wmemcmp() into the range [-1; 1] */
10495            if (cmp < 0)
10496                return -1;
10497            if (cmp > 0)
10498                return 1;
10499#else
10500            COMPARE(Py_UCS4, Py_UCS4);
10501#endif
10502            break;
10503        }
10504        default:
10505            assert(0);
10506        }
10507        break;
10508    }
10509    default:
10510        assert(0);
10511    }
10512
10513    if (len1 == len2)
10514        return 0;
10515    if (len1 < len2)
10516        return -1;
10517    else
10518        return 1;
10519
10520#undef COMPARE
10521}
10522
10523static int
10524unicode_compare_eq(PyObject *str1, PyObject *str2)
10525{
10526    int kind;
10527    void *data1, *data2;
10528    Py_ssize_t len;
10529    int cmp;
10530
10531    /* a string is equal to itself */
10532    if (str1 == str2)
10533        return 1;
10534
10535    len = PyUnicode_GET_LENGTH(str1);
10536    if (PyUnicode_GET_LENGTH(str2) != len)
10537        return 0;
10538    kind = PyUnicode_KIND(str1);
10539    if (PyUnicode_KIND(str2) != kind)
10540        return 0;
10541    data1 = PyUnicode_DATA(str1);
10542    data2 = PyUnicode_DATA(str2);
10543
10544    cmp = memcmp(data1, data2, len * kind);
10545    return (cmp == 0);
10546}
10547
10548
10549int
10550PyUnicode_Compare(PyObject *left, PyObject *right)
10551{
10552    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10553        if (PyUnicode_READY(left) == -1 ||
10554            PyUnicode_READY(right) == -1)
10555            return -1;
10556        return unicode_compare(left, right);
10557    }
10558    PyErr_Format(PyExc_TypeError,
10559                 "Can't compare %.100s and %.100s",
10560                 left->ob_type->tp_name,
10561                 right->ob_type->tp_name);
10562    return -1;
10563}
10564
10565int
10566PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10567{
10568    Py_ssize_t i;
10569    int kind;
10570    void *data;
10571    Py_UCS4 chr;
10572
10573    assert(_PyUnicode_CHECK(uni));
10574    if (PyUnicode_READY(uni) == -1)
10575        return -1;
10576    kind = PyUnicode_KIND(uni);
10577    data = PyUnicode_DATA(uni);
10578    /* Compare Unicode string and source character set string */
10579    for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10580        if (chr != str[i])
10581            return (chr < (unsigned char)(str[i])) ? -1 : 1;
10582    /* This check keeps Python strings that end in '\0' from comparing equal
10583     to C strings identical up to that point. */
10584    if (PyUnicode_GET_LENGTH(uni) != i || chr)
10585        return 1; /* uni is longer */
10586    if (str[i])
10587        return -1; /* str is longer */
10588    return 0;
10589}
10590
10591
10592#define TEST_COND(cond)                         \
10593    ((cond) ? Py_True : Py_False)
10594
10595PyObject *
10596PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10597{
10598    int result;
10599    PyObject *v;
10600
10601    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10602        Py_RETURN_NOTIMPLEMENTED;
10603
10604    if (PyUnicode_READY(left) == -1 ||
10605        PyUnicode_READY(right) == -1)
10606        return NULL;
10607
10608    if (op == Py_EQ || op == Py_NE) {
10609        result = unicode_compare_eq(left, right);
10610        if (op == Py_EQ)
10611            v = TEST_COND(result);
10612        else
10613            v = TEST_COND(!result);
10614    }
10615    else {
10616        result = unicode_compare(left, right);
10617
10618        /* Convert the return value to a Boolean */
10619        switch (op) {
10620        case Py_LE:
10621            v = TEST_COND(result <= 0);
10622            break;
10623        case Py_GE:
10624            v = TEST_COND(result >= 0);
10625            break;
10626        case Py_LT:
10627            v = TEST_COND(result == -1);
10628            break;
10629        case Py_GT:
10630            v = TEST_COND(result == 1);
10631            break;
10632        default:
10633            PyErr_BadArgument();
10634            return NULL;
10635        }
10636    }
10637    Py_INCREF(v);
10638    return v;
10639}
10640
10641int
10642PyUnicode_Contains(PyObject *container, PyObject *element)
10643{
10644    PyObject *str, *sub;
10645    int kind1, kind2;
10646    void *buf1, *buf2;
10647    Py_ssize_t len1, len2;
10648    int result;
10649
10650    /* Coerce the two arguments */
10651    sub = PyUnicode_FromObject(element);
10652    if (!sub) {
10653        PyErr_Format(PyExc_TypeError,
10654                     "'in <string>' requires string as left operand, not %s",
10655                     element->ob_type->tp_name);
10656        return -1;
10657    }
10658
10659    str = PyUnicode_FromObject(container);
10660    if (!str) {
10661        Py_DECREF(sub);
10662        return -1;
10663    }
10664
10665    kind1 = PyUnicode_KIND(str);
10666    kind2 = PyUnicode_KIND(sub);
10667    buf1 = PyUnicode_DATA(str);
10668    buf2 = PyUnicode_DATA(sub);
10669    if (kind2 != kind1) {
10670        if (kind2 > kind1) {
10671            Py_DECREF(sub);
10672            Py_DECREF(str);
10673            return 0;
10674        }
10675        buf2 = _PyUnicode_AsKind(sub, kind1);
10676    }
10677    if (!buf2) {
10678        Py_DECREF(sub);
10679        Py_DECREF(str);
10680        return -1;
10681    }
10682    len1 = PyUnicode_GET_LENGTH(str);
10683    len2 = PyUnicode_GET_LENGTH(sub);
10684
10685    switch (kind1) {
10686    case PyUnicode_1BYTE_KIND:
10687        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10688        break;
10689    case PyUnicode_2BYTE_KIND:
10690        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10691        break;
10692    case PyUnicode_4BYTE_KIND:
10693        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10694        break;
10695    default:
10696        result = -1;
10697        assert(0);
10698    }
10699
10700    Py_DECREF(str);
10701    Py_DECREF(sub);
10702
10703    if (kind2 != kind1)
10704        PyMem_Free(buf2);
10705
10706    return result;
10707}
10708
10709/* Concat to string or Unicode object giving a new Unicode object. */
10710
10711PyObject *
10712PyUnicode_Concat(PyObject *left, PyObject *right)
10713{
10714    PyObject *u = NULL, *v = NULL, *w;
10715    Py_UCS4 maxchar, maxchar2;
10716    Py_ssize_t u_len, v_len, new_len;
10717
10718    /* Coerce the two arguments */
10719    u = PyUnicode_FromObject(left);
10720    if (u == NULL)
10721        goto onError;
10722    v = PyUnicode_FromObject(right);
10723    if (v == NULL)
10724        goto onError;
10725
10726    /* Shortcuts */
10727    if (v == unicode_empty) {
10728        Py_DECREF(v);
10729        return u;
10730    }
10731    if (u == unicode_empty) {
10732        Py_DECREF(u);
10733        return v;
10734    }
10735
10736    u_len = PyUnicode_GET_LENGTH(u);
10737    v_len = PyUnicode_GET_LENGTH(v);
10738    if (u_len > PY_SSIZE_T_MAX - v_len) {
10739        PyErr_SetString(PyExc_OverflowError,
10740                        "strings are too large to concat");
10741        goto onError;
10742    }
10743    new_len = u_len + v_len;
10744
10745    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
10746    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
10747    maxchar = Py_MAX(maxchar, maxchar2);
10748
10749    /* Concat the two Unicode strings */
10750    w = PyUnicode_New(new_len, maxchar);
10751    if (w == NULL)
10752        goto onError;
10753    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
10754    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
10755    Py_DECREF(u);
10756    Py_DECREF(v);
10757    assert(_PyUnicode_CheckConsistency(w, 1));
10758    return w;
10759
10760  onError:
10761    Py_XDECREF(u);
10762    Py_XDECREF(v);
10763    return NULL;
10764}
10765
10766void
10767PyUnicode_Append(PyObject **p_left, PyObject *right)
10768{
10769    PyObject *left, *res;
10770    Py_UCS4 maxchar, maxchar2;
10771    Py_ssize_t left_len, right_len, new_len;
10772
10773    if (p_left == NULL) {
10774        if (!PyErr_Occurred())
10775            PyErr_BadInternalCall();
10776        return;
10777    }
10778    left = *p_left;
10779    if (right == NULL || left == NULL
10780        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
10781        if (!PyErr_Occurred())
10782            PyErr_BadInternalCall();
10783        goto error;
10784    }
10785
10786    if (PyUnicode_READY(left) == -1)
10787        goto error;
10788    if (PyUnicode_READY(right) == -1)
10789        goto error;
10790
10791    /* Shortcuts */
10792    if (left == unicode_empty) {
10793        Py_DECREF(left);
10794        Py_INCREF(right);
10795        *p_left = right;
10796        return;
10797    }
10798    if (right == unicode_empty)
10799        return;
10800
10801    left_len = PyUnicode_GET_LENGTH(left);
10802    right_len = PyUnicode_GET_LENGTH(right);
10803    if (left_len > PY_SSIZE_T_MAX - right_len) {
10804        PyErr_SetString(PyExc_OverflowError,
10805                        "strings are too large to concat");
10806        goto error;
10807    }
10808    new_len = left_len + right_len;
10809
10810    if (unicode_modifiable(left)
10811        && PyUnicode_CheckExact(right)
10812        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
10813        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
10814           to change the structure size, but characters are stored just after
10815           the structure, and so it requires to move all characters which is
10816           not so different than duplicating the string. */
10817        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
10818    {
10819        /* append inplace */
10820        if (unicode_resize(p_left, new_len) != 0)
10821            goto error;
10822
10823        /* copy 'right' into the newly allocated area of 'left' */
10824        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
10825    }
10826    else {
10827        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
10828        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
10829        maxchar = Py_MAX(maxchar, maxchar2);
10830
10831        /* Concat the two Unicode strings */
10832        res = PyUnicode_New(new_len, maxchar);
10833        if (res == NULL)
10834            goto error;
10835        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
10836        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
10837        Py_DECREF(left);
10838        *p_left = res;
10839    }
10840    assert(_PyUnicode_CheckConsistency(*p_left, 1));
10841    return;
10842
10843error:
10844    Py_CLEAR(*p_left);
10845}
10846
10847void
10848PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
10849{
10850    PyUnicode_Append(pleft, right);
10851    Py_XDECREF(right);
10852}
10853
10854PyDoc_STRVAR(count__doc__,
10855             "S.count(sub[, start[, end]]) -> int\n\
10856\n\
10857Return the number of non-overlapping occurrences of substring sub in\n\
10858string S[start:end].  Optional arguments start and end are\n\
10859interpreted as in slice notation.");
10860
10861static PyObject *
10862unicode_count(PyObject *self, PyObject *args)
10863{
10864    PyObject *substring;
10865    Py_ssize_t start = 0;
10866    Py_ssize_t end = PY_SSIZE_T_MAX;
10867    PyObject *result;
10868    int kind1, kind2, kind;
10869    void *buf1, *buf2;
10870    Py_ssize_t len1, len2, iresult;
10871
10872    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
10873                                            &start, &end))
10874        return NULL;
10875
10876    kind1 = PyUnicode_KIND(self);
10877    kind2 = PyUnicode_KIND(substring);
10878    if (kind2 > kind1)
10879        return PyLong_FromLong(0);
10880    kind = kind1;
10881    buf1 = PyUnicode_DATA(self);
10882    buf2 = PyUnicode_DATA(substring);
10883    if (kind2 != kind)
10884        buf2 = _PyUnicode_AsKind(substring, kind);
10885    if (!buf2) {
10886        Py_DECREF(substring);
10887        return NULL;
10888    }
10889    len1 = PyUnicode_GET_LENGTH(self);
10890    len2 = PyUnicode_GET_LENGTH(substring);
10891
10892    ADJUST_INDICES(start, end, len1);
10893    switch (kind) {
10894    case PyUnicode_1BYTE_KIND:
10895        iresult = ucs1lib_count(
10896            ((Py_UCS1*)buf1) + start, end - start,
10897            buf2, len2, PY_SSIZE_T_MAX
10898            );
10899        break;
10900    case PyUnicode_2BYTE_KIND:
10901        iresult = ucs2lib_count(
10902            ((Py_UCS2*)buf1) + start, end - start,
10903            buf2, len2, PY_SSIZE_T_MAX
10904            );
10905        break;
10906    case PyUnicode_4BYTE_KIND:
10907        iresult = ucs4lib_count(
10908            ((Py_UCS4*)buf1) + start, end - start,
10909            buf2, len2, PY_SSIZE_T_MAX
10910            );
10911        break;
10912    default:
10913        assert(0); iresult = 0;
10914    }
10915
10916    result = PyLong_FromSsize_t(iresult);
10917
10918    if (kind2 != kind)
10919        PyMem_Free(buf2);
10920
10921    Py_DECREF(substring);
10922
10923    return result;
10924}
10925
10926PyDoc_STRVAR(encode__doc__,
10927             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
10928\n\
10929Encode S using the codec registered for encoding. Default encoding\n\
10930is 'utf-8'. errors may be given to set a different error\n\
10931handling scheme. Default is 'strict' meaning that encoding errors raise\n\
10932a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
10933'xmlcharrefreplace' as well as any other name registered with\n\
10934codecs.register_error that can handle UnicodeEncodeErrors.");
10935
10936static PyObject *
10937unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
10938{
10939    static char *kwlist[] = {"encoding", "errors", 0};
10940    char *encoding = NULL;
10941    char *errors = NULL;
10942
10943    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
10944                                     kwlist, &encoding, &errors))
10945        return NULL;
10946    return PyUnicode_AsEncodedString(self, encoding, errors);
10947}
10948
10949PyDoc_STRVAR(expandtabs__doc__,
10950             "S.expandtabs([tabsize]) -> str\n\
10951\n\
10952Return a copy of S where all tab characters are expanded using spaces.\n\
10953If tabsize is not given, a tab size of 8 characters is assumed.");
10954
10955static PyObject*
10956unicode_expandtabs(PyObject *self, PyObject *args)
10957{
10958    Py_ssize_t i, j, line_pos, src_len, incr;
10959    Py_UCS4 ch;
10960    PyObject *u;
10961    void *src_data, *dest_data;
10962    int tabsize = 8;
10963    int kind;
10964    int found;
10965
10966    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
10967        return NULL;
10968
10969    if (PyUnicode_READY(self) == -1)
10970        return NULL;
10971
10972    /* First pass: determine size of output string */
10973    src_len = PyUnicode_GET_LENGTH(self);
10974    i = j = line_pos = 0;
10975    kind = PyUnicode_KIND(self);
10976    src_data = PyUnicode_DATA(self);
10977    found = 0;
10978    for (; i < src_len; i++) {
10979        ch = PyUnicode_READ(kind, src_data, i);
10980        if (ch == '\t') {
10981            found = 1;
10982            if (tabsize > 0) {
10983                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
10984                if (j > PY_SSIZE_T_MAX - incr)
10985                    goto overflow;
10986                line_pos += incr;
10987                j += incr;
10988            }
10989        }
10990        else {
10991            if (j > PY_SSIZE_T_MAX - 1)
10992                goto overflow;
10993            line_pos++;
10994            j++;
10995            if (ch == '\n' || ch == '\r')
10996                line_pos = 0;
10997        }
10998    }
10999    if (!found)
11000        return unicode_result_unchanged(self);
11001
11002    /* Second pass: create output string and fill it */
11003    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11004    if (!u)
11005        return NULL;
11006    dest_data = PyUnicode_DATA(u);
11007
11008    i = j = line_pos = 0;
11009
11010    for (; i < src_len; i++) {
11011        ch = PyUnicode_READ(kind, src_data, i);
11012        if (ch == '\t') {
11013            if (tabsize > 0) {
11014                incr = tabsize - (line_pos % tabsize);
11015                line_pos += incr;
11016                FILL(kind, dest_data, ' ', j, incr);
11017                j += incr;
11018            }
11019        }
11020        else {
11021            line_pos++;
11022            PyUnicode_WRITE(kind, dest_data, j, ch);
11023            j++;
11024            if (ch == '\n' || ch == '\r')
11025                line_pos = 0;
11026        }
11027    }
11028    assert (j == PyUnicode_GET_LENGTH(u));
11029    return unicode_result(u);
11030
11031  overflow:
11032    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11033    return NULL;
11034}
11035
11036PyDoc_STRVAR(find__doc__,
11037             "S.find(sub[, start[, end]]) -> int\n\
11038\n\
11039Return the lowest index in S where substring sub is found,\n\
11040such that sub is contained within S[start:end].  Optional\n\
11041arguments start and end are interpreted as in slice notation.\n\
11042\n\
11043Return -1 on failure.");
11044
11045static PyObject *
11046unicode_find(PyObject *self, PyObject *args)
11047{
11048    PyObject *substring;
11049    Py_ssize_t start;
11050    Py_ssize_t end;
11051    Py_ssize_t result;
11052
11053    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11054                                            &start, &end))
11055        return NULL;
11056
11057    if (PyUnicode_READY(self) == -1)
11058        return NULL;
11059    if (PyUnicode_READY(substring) == -1)
11060        return NULL;
11061
11062    result = any_find_slice(1, self, substring, start, end);
11063
11064    Py_DECREF(substring);
11065
11066    if (result == -2)
11067        return NULL;
11068
11069    return PyLong_FromSsize_t(result);
11070}
11071
11072static PyObject *
11073unicode_getitem(PyObject *self, Py_ssize_t index)
11074{
11075    void *data;
11076    enum PyUnicode_Kind kind;
11077    Py_UCS4 ch;
11078    PyObject *res;
11079
11080    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11081        PyErr_BadArgument();
11082        return NULL;
11083    }
11084    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11085        PyErr_SetString(PyExc_IndexError, "string index out of range");
11086        return NULL;
11087    }
11088    kind = PyUnicode_KIND(self);
11089    data = PyUnicode_DATA(self);
11090    ch = PyUnicode_READ(kind, data, index);
11091    if (ch < 256)
11092        return get_latin1_char(ch);
11093
11094    res = PyUnicode_New(1, ch);
11095    if (res == NULL)
11096        return NULL;
11097    kind = PyUnicode_KIND(res);
11098    data = PyUnicode_DATA(res);
11099    PyUnicode_WRITE(kind, data, 0, ch);
11100    assert(_PyUnicode_CheckConsistency(res, 1));
11101    return res;
11102}
11103
11104/* Believe it or not, this produces the same value for ASCII strings
11105   as bytes_hash(). */
11106static Py_hash_t
11107unicode_hash(PyObject *self)
11108{
11109    Py_ssize_t len;
11110    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11111
11112#ifdef Py_DEBUG
11113    assert(_Py_HashSecret_Initialized);
11114#endif
11115    if (_PyUnicode_HASH(self) != -1)
11116        return _PyUnicode_HASH(self);
11117    if (PyUnicode_READY(self) == -1)
11118        return -1;
11119    len = PyUnicode_GET_LENGTH(self);
11120    /*
11121      We make the hash of the empty string be 0, rather than using
11122      (prefix ^ suffix), since this slightly obfuscates the hash secret
11123    */
11124    if (len == 0) {
11125        _PyUnicode_HASH(self) = 0;
11126        return 0;
11127    }
11128
11129    /* The hash function as a macro, gets expanded three times below. */
11130#define HASH(P)                                            \
11131    x ^= (Py_uhash_t) *P << 7;                             \
11132    while (--len >= 0)                                     \
11133        x = (_PyHASH_MULTIPLIER * x) ^ (Py_uhash_t) *P++;  \
11134
11135    x = (Py_uhash_t) _Py_HashSecret.prefix;
11136    switch (PyUnicode_KIND(self)) {
11137    case PyUnicode_1BYTE_KIND: {
11138        const unsigned char *c = PyUnicode_1BYTE_DATA(self);
11139        HASH(c);
11140        break;
11141    }
11142    case PyUnicode_2BYTE_KIND: {
11143        const Py_UCS2 *s = PyUnicode_2BYTE_DATA(self);
11144        HASH(s);
11145        break;
11146    }
11147    default: {
11148        Py_UCS4 *l;
11149        assert(PyUnicode_KIND(self) == PyUnicode_4BYTE_KIND &&
11150               "Impossible switch case in unicode_hash");
11151        l = PyUnicode_4BYTE_DATA(self);
11152        HASH(l);
11153        break;
11154    }
11155    }
11156    x ^= (Py_uhash_t) PyUnicode_GET_LENGTH(self);
11157    x ^= (Py_uhash_t) _Py_HashSecret.suffix;
11158
11159    if (x == -1)
11160        x = -2;
11161    _PyUnicode_HASH(self) = x;
11162    return x;
11163}
11164#undef HASH
11165
11166PyDoc_STRVAR(index__doc__,
11167             "S.index(sub[, start[, end]]) -> int\n\
11168\n\
11169Like S.find() but raise ValueError when the substring is not found.");
11170
11171static PyObject *
11172unicode_index(PyObject *self, PyObject *args)
11173{
11174    Py_ssize_t result;
11175    PyObject *substring;
11176    Py_ssize_t start;
11177    Py_ssize_t end;
11178
11179    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11180                                            &start, &end))
11181        return NULL;
11182
11183    if (PyUnicode_READY(self) == -1)
11184        return NULL;
11185    if (PyUnicode_READY(substring) == -1)
11186        return NULL;
11187
11188    result = any_find_slice(1, self, substring, start, end);
11189
11190    Py_DECREF(substring);
11191
11192    if (result == -2)
11193        return NULL;
11194
11195    if (result < 0) {
11196        PyErr_SetString(PyExc_ValueError, "substring not found");
11197        return NULL;
11198    }
11199
11200    return PyLong_FromSsize_t(result);
11201}
11202
11203PyDoc_STRVAR(islower__doc__,
11204             "S.islower() -> bool\n\
11205\n\
11206Return True if all cased characters in S are lowercase and there is\n\
11207at least one cased character in S, False otherwise.");
11208
11209static PyObject*
11210unicode_islower(PyObject *self)
11211{
11212    Py_ssize_t i, length;
11213    int kind;
11214    void *data;
11215    int cased;
11216
11217    if (PyUnicode_READY(self) == -1)
11218        return NULL;
11219    length = PyUnicode_GET_LENGTH(self);
11220    kind = PyUnicode_KIND(self);
11221    data = PyUnicode_DATA(self);
11222
11223    /* Shortcut for single character strings */
11224    if (length == 1)
11225        return PyBool_FromLong(
11226            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11227
11228    /* Special case for empty strings */
11229    if (length == 0)
11230        return PyBool_FromLong(0);
11231
11232    cased = 0;
11233    for (i = 0; i < length; i++) {
11234        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11235
11236        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11237            return PyBool_FromLong(0);
11238        else if (!cased && Py_UNICODE_ISLOWER(ch))
11239            cased = 1;
11240    }
11241    return PyBool_FromLong(cased);
11242}
11243
11244PyDoc_STRVAR(isupper__doc__,
11245             "S.isupper() -> bool\n\
11246\n\
11247Return True if all cased characters in S are uppercase and there is\n\
11248at least one cased character in S, False otherwise.");
11249
11250static PyObject*
11251unicode_isupper(PyObject *self)
11252{
11253    Py_ssize_t i, length;
11254    int kind;
11255    void *data;
11256    int cased;
11257
11258    if (PyUnicode_READY(self) == -1)
11259        return NULL;
11260    length = PyUnicode_GET_LENGTH(self);
11261    kind = PyUnicode_KIND(self);
11262    data = PyUnicode_DATA(self);
11263
11264    /* Shortcut for single character strings */
11265    if (length == 1)
11266        return PyBool_FromLong(
11267            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11268
11269    /* Special case for empty strings */
11270    if (length == 0)
11271        return PyBool_FromLong(0);
11272
11273    cased = 0;
11274    for (i = 0; i < length; i++) {
11275        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11276
11277        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11278            return PyBool_FromLong(0);
11279        else if (!cased && Py_UNICODE_ISUPPER(ch))
11280            cased = 1;
11281    }
11282    return PyBool_FromLong(cased);
11283}
11284
11285PyDoc_STRVAR(istitle__doc__,
11286             "S.istitle() -> bool\n\
11287\n\
11288Return True if S is a titlecased string and there is at least one\n\
11289character in S, i.e. upper- and titlecase characters may only\n\
11290follow uncased characters and lowercase characters only cased ones.\n\
11291Return False otherwise.");
11292
11293static PyObject*
11294unicode_istitle(PyObject *self)
11295{
11296    Py_ssize_t i, length;
11297    int kind;
11298    void *data;
11299    int cased, previous_is_cased;
11300
11301    if (PyUnicode_READY(self) == -1)
11302        return NULL;
11303    length = PyUnicode_GET_LENGTH(self);
11304    kind = PyUnicode_KIND(self);
11305    data = PyUnicode_DATA(self);
11306
11307    /* Shortcut for single character strings */
11308    if (length == 1) {
11309        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11310        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11311                               (Py_UNICODE_ISUPPER(ch) != 0));
11312    }
11313
11314    /* Special case for empty strings */
11315    if (length == 0)
11316        return PyBool_FromLong(0);
11317
11318    cased = 0;
11319    previous_is_cased = 0;
11320    for (i = 0; i < length; i++) {
11321        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11322
11323        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11324            if (previous_is_cased)
11325                return PyBool_FromLong(0);
11326            previous_is_cased = 1;
11327            cased = 1;
11328        }
11329        else if (Py_UNICODE_ISLOWER(ch)) {
11330            if (!previous_is_cased)
11331                return PyBool_FromLong(0);
11332            previous_is_cased = 1;
11333            cased = 1;
11334        }
11335        else
11336            previous_is_cased = 0;
11337    }
11338    return PyBool_FromLong(cased);
11339}
11340
11341PyDoc_STRVAR(isspace__doc__,
11342             "S.isspace() -> bool\n\
11343\n\
11344Return True if all characters in S are whitespace\n\
11345and there is at least one character in S, False otherwise.");
11346
11347static PyObject*
11348unicode_isspace(PyObject *self)
11349{
11350    Py_ssize_t i, length;
11351    int kind;
11352    void *data;
11353
11354    if (PyUnicode_READY(self) == -1)
11355        return NULL;
11356    length = PyUnicode_GET_LENGTH(self);
11357    kind = PyUnicode_KIND(self);
11358    data = PyUnicode_DATA(self);
11359
11360    /* Shortcut for single character strings */
11361    if (length == 1)
11362        return PyBool_FromLong(
11363            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11364
11365    /* Special case for empty strings */
11366    if (length == 0)
11367        return PyBool_FromLong(0);
11368
11369    for (i = 0; i < length; i++) {
11370        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11371        if (!Py_UNICODE_ISSPACE(ch))
11372            return PyBool_FromLong(0);
11373    }
11374    return PyBool_FromLong(1);
11375}
11376
11377PyDoc_STRVAR(isalpha__doc__,
11378             "S.isalpha() -> bool\n\
11379\n\
11380Return True if all characters in S are alphabetic\n\
11381and there is at least one character in S, False otherwise.");
11382
11383static PyObject*
11384unicode_isalpha(PyObject *self)
11385{
11386    Py_ssize_t i, length;
11387    int kind;
11388    void *data;
11389
11390    if (PyUnicode_READY(self) == -1)
11391        return NULL;
11392    length = PyUnicode_GET_LENGTH(self);
11393    kind = PyUnicode_KIND(self);
11394    data = PyUnicode_DATA(self);
11395
11396    /* Shortcut for single character strings */
11397    if (length == 1)
11398        return PyBool_FromLong(
11399            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11400
11401    /* Special case for empty strings */
11402    if (length == 0)
11403        return PyBool_FromLong(0);
11404
11405    for (i = 0; i < length; i++) {
11406        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11407            return PyBool_FromLong(0);
11408    }
11409    return PyBool_FromLong(1);
11410}
11411
11412PyDoc_STRVAR(isalnum__doc__,
11413             "S.isalnum() -> bool\n\
11414\n\
11415Return True if all characters in S are alphanumeric\n\
11416and there is at least one character in S, False otherwise.");
11417
11418static PyObject*
11419unicode_isalnum(PyObject *self)
11420{
11421    int kind;
11422    void *data;
11423    Py_ssize_t len, i;
11424
11425    if (PyUnicode_READY(self) == -1)
11426        return NULL;
11427
11428    kind = PyUnicode_KIND(self);
11429    data = PyUnicode_DATA(self);
11430    len = PyUnicode_GET_LENGTH(self);
11431
11432    /* Shortcut for single character strings */
11433    if (len == 1) {
11434        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11435        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11436    }
11437
11438    /* Special case for empty strings */
11439    if (len == 0)
11440        return PyBool_FromLong(0);
11441
11442    for (i = 0; i < len; i++) {
11443        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11444        if (!Py_UNICODE_ISALNUM(ch))
11445            return PyBool_FromLong(0);
11446    }
11447    return PyBool_FromLong(1);
11448}
11449
11450PyDoc_STRVAR(isdecimal__doc__,
11451             "S.isdecimal() -> bool\n\
11452\n\
11453Return True if there are only decimal characters in S,\n\
11454False otherwise.");
11455
11456static PyObject*
11457unicode_isdecimal(PyObject *self)
11458{
11459    Py_ssize_t i, length;
11460    int kind;
11461    void *data;
11462
11463    if (PyUnicode_READY(self) == -1)
11464        return NULL;
11465    length = PyUnicode_GET_LENGTH(self);
11466    kind = PyUnicode_KIND(self);
11467    data = PyUnicode_DATA(self);
11468
11469    /* Shortcut for single character strings */
11470    if (length == 1)
11471        return PyBool_FromLong(
11472            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11473
11474    /* Special case for empty strings */
11475    if (length == 0)
11476        return PyBool_FromLong(0);
11477
11478    for (i = 0; i < length; i++) {
11479        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11480            return PyBool_FromLong(0);
11481    }
11482    return PyBool_FromLong(1);
11483}
11484
11485PyDoc_STRVAR(isdigit__doc__,
11486             "S.isdigit() -> bool\n\
11487\n\
11488Return True if all characters in S are digits\n\
11489and there is at least one character in S, False otherwise.");
11490
11491static PyObject*
11492unicode_isdigit(PyObject *self)
11493{
11494    Py_ssize_t i, length;
11495    int kind;
11496    void *data;
11497
11498    if (PyUnicode_READY(self) == -1)
11499        return NULL;
11500    length = PyUnicode_GET_LENGTH(self);
11501    kind = PyUnicode_KIND(self);
11502    data = PyUnicode_DATA(self);
11503
11504    /* Shortcut for single character strings */
11505    if (length == 1) {
11506        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11507        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11508    }
11509
11510    /* Special case for empty strings */
11511    if (length == 0)
11512        return PyBool_FromLong(0);
11513
11514    for (i = 0; i < length; i++) {
11515        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11516            return PyBool_FromLong(0);
11517    }
11518    return PyBool_FromLong(1);
11519}
11520
11521PyDoc_STRVAR(isnumeric__doc__,
11522             "S.isnumeric() -> bool\n\
11523\n\
11524Return True if there are only numeric characters in S,\n\
11525False otherwise.");
11526
11527static PyObject*
11528unicode_isnumeric(PyObject *self)
11529{
11530    Py_ssize_t i, length;
11531    int kind;
11532    void *data;
11533
11534    if (PyUnicode_READY(self) == -1)
11535        return NULL;
11536    length = PyUnicode_GET_LENGTH(self);
11537    kind = PyUnicode_KIND(self);
11538    data = PyUnicode_DATA(self);
11539
11540    /* Shortcut for single character strings */
11541    if (length == 1)
11542        return PyBool_FromLong(
11543            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11544
11545    /* Special case for empty strings */
11546    if (length == 0)
11547        return PyBool_FromLong(0);
11548
11549    for (i = 0; i < length; i++) {
11550        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11551            return PyBool_FromLong(0);
11552    }
11553    return PyBool_FromLong(1);
11554}
11555
11556int
11557PyUnicode_IsIdentifier(PyObject *self)
11558{
11559    int kind;
11560    void *data;
11561    Py_ssize_t i;
11562    Py_UCS4 first;
11563
11564    if (PyUnicode_READY(self) == -1) {
11565        Py_FatalError("identifier not ready");
11566        return 0;
11567    }
11568
11569    /* Special case for empty strings */
11570    if (PyUnicode_GET_LENGTH(self) == 0)
11571        return 0;
11572    kind = PyUnicode_KIND(self);
11573    data = PyUnicode_DATA(self);
11574
11575    /* PEP 3131 says that the first character must be in
11576       XID_Start and subsequent characters in XID_Continue,
11577       and for the ASCII range, the 2.x rules apply (i.e
11578       start with letters and underscore, continue with
11579       letters, digits, underscore). However, given the current
11580       definition of XID_Start and XID_Continue, it is sufficient
11581       to check just for these, except that _ must be allowed
11582       as starting an identifier.  */
11583    first = PyUnicode_READ(kind, data, 0);
11584    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11585        return 0;
11586
11587    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11588        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11589            return 0;
11590    return 1;
11591}
11592
11593PyDoc_STRVAR(isidentifier__doc__,
11594             "S.isidentifier() -> bool\n\
11595\n\
11596Return True if S is a valid identifier according\n\
11597to the language definition.\n\
11598\n\
11599Use keyword.iskeyword() to test for reserved identifiers\n\
11600such as \"def\" and \"class\".\n");
11601
11602static PyObject*
11603unicode_isidentifier(PyObject *self)
11604{
11605    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11606}
11607
11608PyDoc_STRVAR(isprintable__doc__,
11609             "S.isprintable() -> bool\n\
11610\n\
11611Return True if all characters in S are considered\n\
11612printable in repr() or S is empty, False otherwise.");
11613
11614static PyObject*
11615unicode_isprintable(PyObject *self)
11616{
11617    Py_ssize_t i, length;
11618    int kind;
11619    void *data;
11620
11621    if (PyUnicode_READY(self) == -1)
11622        return NULL;
11623    length = PyUnicode_GET_LENGTH(self);
11624    kind = PyUnicode_KIND(self);
11625    data = PyUnicode_DATA(self);
11626
11627    /* Shortcut for single character strings */
11628    if (length == 1)
11629        return PyBool_FromLong(
11630            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11631
11632    for (i = 0; i < length; i++) {
11633        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11634            Py_RETURN_FALSE;
11635        }
11636    }
11637    Py_RETURN_TRUE;
11638}
11639
11640PyDoc_STRVAR(join__doc__,
11641             "S.join(iterable) -> str\n\
11642\n\
11643Return a string which is the concatenation of the strings in the\n\
11644iterable.  The separator between elements is S.");
11645
11646static PyObject*
11647unicode_join(PyObject *self, PyObject *data)
11648{
11649    return PyUnicode_Join(self, data);
11650}
11651
11652static Py_ssize_t
11653unicode_length(PyObject *self)
11654{
11655    if (PyUnicode_READY(self) == -1)
11656        return -1;
11657    return PyUnicode_GET_LENGTH(self);
11658}
11659
11660PyDoc_STRVAR(ljust__doc__,
11661             "S.ljust(width[, fillchar]) -> str\n\
11662\n\
11663Return S left-justified in a Unicode string of length width. Padding is\n\
11664done using the specified fill character (default is a space).");
11665
11666static PyObject *
11667unicode_ljust(PyObject *self, PyObject *args)
11668{
11669    Py_ssize_t width;
11670    Py_UCS4 fillchar = ' ';
11671
11672    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11673        return NULL;
11674
11675    if (PyUnicode_READY(self) == -1)
11676        return NULL;
11677
11678    if (PyUnicode_GET_LENGTH(self) >= width)
11679        return unicode_result_unchanged(self);
11680
11681    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11682}
11683
11684PyDoc_STRVAR(lower__doc__,
11685             "S.lower() -> str\n\
11686\n\
11687Return a copy of the string S converted to lowercase.");
11688
11689static PyObject*
11690unicode_lower(PyObject *self)
11691{
11692    if (PyUnicode_READY(self) == -1)
11693        return NULL;
11694    if (PyUnicode_IS_ASCII(self))
11695        return ascii_upper_or_lower(self, 1);
11696    return case_operation(self, do_lower);
11697}
11698
/* Strip-direction selectors passed as `striptype` to the strip helpers;
   they double as indexes into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: argument-parsing format string for each
   strip method ("|O:" followed by the method name). */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip type i: the format string with its 3-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
11707
11708/* externally visible for str.strip(unicode) */
11709PyObject *
11710_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11711{
11712    void *data;
11713    int kind;
11714    Py_ssize_t i, j, len;
11715    BLOOM_MASK sepmask;
11716    Py_ssize_t seplen;
11717
11718    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11719        return NULL;
11720
11721    kind = PyUnicode_KIND(self);
11722    data = PyUnicode_DATA(self);
11723    len = PyUnicode_GET_LENGTH(self);
11724    seplen = PyUnicode_GET_LENGTH(sepobj);
11725    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11726                              PyUnicode_DATA(sepobj),
11727                              seplen);
11728
11729    i = 0;
11730    if (striptype != RIGHTSTRIP) {
11731        while (i < len) {
11732            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11733            if (!BLOOM(sepmask, ch))
11734                break;
11735            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11736                break;
11737            i++;
11738        }
11739    }
11740
11741    j = len;
11742    if (striptype != LEFTSTRIP) {
11743        j--;
11744        while (j >= i) {
11745            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11746            if (!BLOOM(sepmask, ch))
11747                break;
11748            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11749                break;
11750            j--;
11751        }
11752
11753        j++;
11754    }
11755
11756    return PyUnicode_Substring(self, i, j);
11757}
11758
11759PyObject*
11760PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
11761{
11762    unsigned char *data;
11763    int kind;
11764    Py_ssize_t length;
11765
11766    if (PyUnicode_READY(self) == -1)
11767        return NULL;
11768
11769    length = PyUnicode_GET_LENGTH(self);
11770    end = Py_MIN(end, length);
11771
11772    if (start == 0 && end == length)
11773        return unicode_result_unchanged(self);
11774
11775    if (start < 0 || end < 0) {
11776        PyErr_SetString(PyExc_IndexError, "string index out of range");
11777        return NULL;
11778    }
11779    if (start >= length || end < start)
11780        _Py_RETURN_UNICODE_EMPTY();
11781
11782    length = end - start;
11783    if (PyUnicode_IS_ASCII(self)) {
11784        data = PyUnicode_1BYTE_DATA(self);
11785        return _PyUnicode_FromASCII((char*)(data + start), length);
11786    }
11787    else {
11788        kind = PyUnicode_KIND(self);
11789        data = PyUnicode_1BYTE_DATA(self);
11790        return PyUnicode_FromKindAndData(kind,
11791                                         data + kind * start,
11792                                         length);
11793    }
11794}
11795
11796static PyObject *
11797do_strip(PyObject *self, int striptype)
11798{
11799    Py_ssize_t len, i, j;
11800
11801    if (PyUnicode_READY(self) == -1)
11802        return NULL;
11803
11804    len = PyUnicode_GET_LENGTH(self);
11805
11806    if (PyUnicode_IS_ASCII(self)) {
11807        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
11808
11809        i = 0;
11810        if (striptype != RIGHTSTRIP) {
11811            while (i < len) {
11812                Py_UCS1 ch = data[i];
11813                if (!_Py_ascii_whitespace[ch])
11814                    break;
11815                i++;
11816            }
11817        }
11818
11819        j = len;
11820        if (striptype != LEFTSTRIP) {
11821            j--;
11822            while (j >= i) {
11823                Py_UCS1 ch = data[j];
11824                if (!_Py_ascii_whitespace[ch])
11825                    break;
11826                j--;
11827            }
11828            j++;
11829        }
11830    }
11831    else {
11832        int kind = PyUnicode_KIND(self);
11833        void *data = PyUnicode_DATA(self);
11834
11835        i = 0;
11836        if (striptype != RIGHTSTRIP) {
11837            while (i < len) {
11838                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11839                if (!Py_UNICODE_ISSPACE(ch))
11840                    break;
11841                i++;
11842            }
11843        }
11844
11845        j = len;
11846        if (striptype != LEFTSTRIP) {
11847            j--;
11848            while (j >= i) {
11849                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11850                if (!Py_UNICODE_ISSPACE(ch))
11851                    break;
11852                j--;
11853            }
11854            j++;
11855        }
11856    }
11857
11858    return PyUnicode_Substring(self, i, j);
11859}
11860
11861
11862static PyObject *
11863do_argstrip(PyObject *self, int striptype, PyObject *args)
11864{
11865    PyObject *sep = NULL;
11866
11867    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
11868        return NULL;
11869
11870    if (sep != NULL && sep != Py_None) {
11871        if (PyUnicode_Check(sep))
11872            return _PyUnicode_XStrip(self, striptype, sep);
11873        else {
11874            PyErr_Format(PyExc_TypeError,
11875                         "%s arg must be None or str",
11876                         STRIPNAME(striptype));
11877            return NULL;
11878        }
11879    }
11880
11881    return do_strip(self, striptype);
11882}
11883
11884
11885PyDoc_STRVAR(strip__doc__,
11886             "S.strip([chars]) -> str\n\
11887\n\
11888Return a copy of the string S with leading and trailing\n\
11889whitespace removed.\n\
11890If chars is given and not None, remove characters in chars instead.");
11891
11892static PyObject *
11893unicode_strip(PyObject *self, PyObject *args)
11894{
11895    if (PyTuple_GET_SIZE(args) == 0)
11896        return do_strip(self, BOTHSTRIP); /* Common case */
11897    else
11898        return do_argstrip(self, BOTHSTRIP, args);
11899}
11900
11901
11902PyDoc_STRVAR(lstrip__doc__,
11903             "S.lstrip([chars]) -> str\n\
11904\n\
11905Return a copy of the string S with leading whitespace removed.\n\
11906If chars is given and not None, remove characters in chars instead.");
11907
11908static PyObject *
11909unicode_lstrip(PyObject *self, PyObject *args)
11910{
11911    if (PyTuple_GET_SIZE(args) == 0)
11912        return do_strip(self, LEFTSTRIP); /* Common case */
11913    else
11914        return do_argstrip(self, LEFTSTRIP, args);
11915}
11916
11917
11918PyDoc_STRVAR(rstrip__doc__,
11919             "S.rstrip([chars]) -> str\n\
11920\n\
11921Return a copy of the string S with trailing whitespace removed.\n\
11922If chars is given and not None, remove characters in chars instead.");
11923
11924static PyObject *
11925unicode_rstrip(PyObject *self, PyObject *args)
11926{
11927    if (PyTuple_GET_SIZE(args) == 0)
11928        return do_strip(self, RIGHTSTRIP); /* Common case */
11929    else
11930        return do_argstrip(self, RIGHTSTRIP, args);
11931}
11932
11933
11934static PyObject*
11935unicode_repeat(PyObject *str, Py_ssize_t len)
11936{
11937    PyObject *u;
11938    Py_ssize_t nchars, n;
11939
11940    if (len < 1)
11941        _Py_RETURN_UNICODE_EMPTY();
11942
11943    /* no repeat, return original string */
11944    if (len == 1)
11945        return unicode_result_unchanged(str);
11946
11947    if (PyUnicode_READY(str) == -1)
11948        return NULL;
11949
11950    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
11951        PyErr_SetString(PyExc_OverflowError,
11952                        "repeated string is too long");
11953        return NULL;
11954    }
11955    nchars = len * PyUnicode_GET_LENGTH(str);
11956
11957    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
11958    if (!u)
11959        return NULL;
11960    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
11961
11962    if (PyUnicode_GET_LENGTH(str) == 1) {
11963        const int kind = PyUnicode_KIND(str);
11964        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
11965        if (kind == PyUnicode_1BYTE_KIND) {
11966            void *to = PyUnicode_DATA(u);
11967            memset(to, (unsigned char)fill_char, len);
11968        }
11969        else if (kind == PyUnicode_2BYTE_KIND) {
11970            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
11971            for (n = 0; n < len; ++n)
11972                ucs2[n] = fill_char;
11973        } else {
11974            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
11975            assert(kind == PyUnicode_4BYTE_KIND);
11976            for (n = 0; n < len; ++n)
11977                ucs4[n] = fill_char;
11978        }
11979    }
11980    else {
11981        /* number of characters copied this far */
11982        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
11983        const Py_ssize_t char_size = PyUnicode_KIND(str);
11984        char *to = (char *) PyUnicode_DATA(u);
11985        Py_MEMCPY(to, PyUnicode_DATA(str),
11986                  PyUnicode_GET_LENGTH(str) * char_size);
11987        while (done < nchars) {
11988            n = (done <= nchars-done) ? done : nchars-done;
11989            Py_MEMCPY(to + (done * char_size), to, n * char_size);
11990            done += n;
11991        }
11992    }
11993
11994    assert(_PyUnicode_CheckConsistency(u, 1));
11995    return u;
11996}
11997
11998PyObject *
11999PyUnicode_Replace(PyObject *obj,
12000                  PyObject *subobj,
12001                  PyObject *replobj,
12002                  Py_ssize_t maxcount)
12003{
12004    PyObject *self;
12005    PyObject *str1;
12006    PyObject *str2;
12007    PyObject *result;
12008
12009    self = PyUnicode_FromObject(obj);
12010    if (self == NULL)
12011        return NULL;
12012    str1 = PyUnicode_FromObject(subobj);
12013    if (str1 == NULL) {
12014        Py_DECREF(self);
12015        return NULL;
12016    }
12017    str2 = PyUnicode_FromObject(replobj);
12018    if (str2 == NULL) {
12019        Py_DECREF(self);
12020        Py_DECREF(str1);
12021        return NULL;
12022    }
12023    if (PyUnicode_READY(self) == -1 ||
12024        PyUnicode_READY(str1) == -1 ||
12025        PyUnicode_READY(str2) == -1)
12026        result = NULL;
12027    else
12028        result = replace(self, str1, str2, maxcount);
12029    Py_DECREF(self);
12030    Py_DECREF(str1);
12031    Py_DECREF(str2);
12032    return result;
12033}
12034
12035PyDoc_STRVAR(replace__doc__,
12036             "S.replace(old, new[, count]) -> str\n\
12037\n\
12038Return a copy of S with all occurrences of substring\n\
12039old replaced by new.  If the optional argument count is\n\
12040given, only the first count occurrences are replaced.");
12041
12042static PyObject*
12043unicode_replace(PyObject *self, PyObject *args)
12044{
12045    PyObject *str1;
12046    PyObject *str2;
12047    Py_ssize_t maxcount = -1;
12048    PyObject *result;
12049
12050    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12051        return NULL;
12052    if (PyUnicode_READY(self) == -1)
12053        return NULL;
12054    str1 = PyUnicode_FromObject(str1);
12055    if (str1 == NULL)
12056        return NULL;
12057    str2 = PyUnicode_FromObject(str2);
12058    if (str2 == NULL) {
12059        Py_DECREF(str1);
12060        return NULL;
12061    }
12062    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12063        result = NULL;
12064    else
12065        result = replace(self, str1, str2, maxcount);
12066
12067    Py_DECREF(str1);
12068    Py_DECREF(str2);
12069    return result;
12070}
12071
12072static PyObject *
12073unicode_repr(PyObject *unicode)
12074{
12075    PyObject *repr;
12076    Py_ssize_t isize;
12077    Py_ssize_t osize, squote, dquote, i, o;
12078    Py_UCS4 max, quote;
12079    int ikind, okind, unchanged;
12080    void *idata, *odata;
12081
12082    if (PyUnicode_READY(unicode) == -1)
12083        return NULL;
12084
12085    isize = PyUnicode_GET_LENGTH(unicode);
12086    idata = PyUnicode_DATA(unicode);
12087
12088    /* Compute length of output, quote characters, and
12089       maximum character */
12090    osize = 0;
12091    max = 127;
12092    squote = dquote = 0;
12093    ikind = PyUnicode_KIND(unicode);
12094    for (i = 0; i < isize; i++) {
12095        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12096        switch (ch) {
12097        case '\'': squote++; osize++; break;
12098        case '"':  dquote++; osize++; break;
12099        case '\\': case '\t': case '\r': case '\n':
12100            osize += 2; break;
12101        default:
12102            /* Fast-path ASCII */
12103            if (ch < ' ' || ch == 0x7f)
12104                osize += 4; /* \xHH */
12105            else if (ch < 0x7f)
12106                osize++;
12107            else if (Py_UNICODE_ISPRINTABLE(ch)) {
12108                osize++;
12109                max = ch > max ? ch : max;
12110            }
12111            else if (ch < 0x100)
12112                osize += 4; /* \xHH */
12113            else if (ch < 0x10000)
12114                osize += 6; /* \uHHHH */
12115            else
12116                osize += 10; /* \uHHHHHHHH */
12117        }
12118    }
12119
12120    quote = '\'';
12121    unchanged = (osize == isize);
12122    if (squote) {
12123        unchanged = 0;
12124        if (dquote)
12125            /* Both squote and dquote present. Use squote,
12126               and escape them */
12127            osize += squote;
12128        else
12129            quote = '"';
12130    }
12131    osize += 2;   /* quotes */
12132
12133    repr = PyUnicode_New(osize, max);
12134    if (repr == NULL)
12135        return NULL;
12136    okind = PyUnicode_KIND(repr);
12137    odata = PyUnicode_DATA(repr);
12138
12139    PyUnicode_WRITE(okind, odata, 0, quote);
12140    PyUnicode_WRITE(okind, odata, osize-1, quote);
12141    if (unchanged) {
12142        _PyUnicode_FastCopyCharacters(repr, 1,
12143                                      unicode, 0,
12144                                      isize);
12145    }
12146    else {
12147        for (i = 0, o = 1; i < isize; i++) {
12148            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12149
12150            /* Escape quotes and backslashes */
12151            if ((ch == quote) || (ch == '\\')) {
12152                PyUnicode_WRITE(okind, odata, o++, '\\');
12153                PyUnicode_WRITE(okind, odata, o++, ch);
12154                continue;
12155            }
12156
12157            /* Map special whitespace to '\t', \n', '\r' */
12158            if (ch == '\t') {
12159                PyUnicode_WRITE(okind, odata, o++, '\\');
12160                PyUnicode_WRITE(okind, odata, o++, 't');
12161            }
12162            else if (ch == '\n') {
12163                PyUnicode_WRITE(okind, odata, o++, '\\');
12164                PyUnicode_WRITE(okind, odata, o++, 'n');
12165            }
12166            else if (ch == '\r') {
12167                PyUnicode_WRITE(okind, odata, o++, '\\');
12168                PyUnicode_WRITE(okind, odata, o++, 'r');
12169            }
12170
12171            /* Map non-printable US ASCII to '\xhh' */
12172            else if (ch < ' ' || ch == 0x7F) {
12173                PyUnicode_WRITE(okind, odata, o++, '\\');
12174                PyUnicode_WRITE(okind, odata, o++, 'x');
12175                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12176                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12177            }
12178
12179            /* Copy ASCII characters as-is */
12180            else if (ch < 0x7F) {
12181                PyUnicode_WRITE(okind, odata, o++, ch);
12182            }
12183
12184            /* Non-ASCII characters */
12185            else {
12186                /* Map Unicode whitespace and control characters
12187                   (categories Z* and C* except ASCII space)
12188                */
12189                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12190                    PyUnicode_WRITE(okind, odata, o++, '\\');
12191                    /* Map 8-bit characters to '\xhh' */
12192                    if (ch <= 0xff) {
12193                        PyUnicode_WRITE(okind, odata, o++, 'x');
12194                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12195                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12196                    }
12197                    /* Map 16-bit characters to '\uxxxx' */
12198                    else if (ch <= 0xffff) {
12199                        PyUnicode_WRITE(okind, odata, o++, 'u');
12200                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12201                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12202                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12203                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12204                    }
12205                    /* Map 21-bit characters to '\U00xxxxxx' */
12206                    else {
12207                        PyUnicode_WRITE(okind, odata, o++, 'U');
12208                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12209                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12210                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12211                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12212                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12213                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12214                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12215                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12216                    }
12217                }
12218                /* Copy characters as-is */
12219                else {
12220                    PyUnicode_WRITE(okind, odata, o++, ch);
12221                }
12222            }
12223        }
12224    }
12225    /* Closing quote already added at the beginning */
12226    assert(_PyUnicode_CheckConsistency(repr, 1));
12227    return repr;
12228}
12229
12230PyDoc_STRVAR(rfind__doc__,
12231             "S.rfind(sub[, start[, end]]) -> int\n\
12232\n\
12233Return the highest index in S where substring sub is found,\n\
12234such that sub is contained within S[start:end].  Optional\n\
12235arguments start and end are interpreted as in slice notation.\n\
12236\n\
12237Return -1 on failure.");
12238
12239static PyObject *
12240unicode_rfind(PyObject *self, PyObject *args)
12241{
12242    PyObject *substring;
12243    Py_ssize_t start;
12244    Py_ssize_t end;
12245    Py_ssize_t result;
12246
12247    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12248                                            &start, &end))
12249        return NULL;
12250
12251    if (PyUnicode_READY(self) == -1) {
12252        Py_DECREF(substring);
12253        return NULL;
12254    }
12255    if (PyUnicode_READY(substring) == -1) {
12256        Py_DECREF(substring);
12257        return NULL;
12258    }
12259
12260    result = any_find_slice(-1, self, substring, start, end);
12261
12262    Py_DECREF(substring);
12263
12264    if (result == -2)
12265        return NULL;
12266
12267    return PyLong_FromSsize_t(result);
12268}
12269
12270PyDoc_STRVAR(rindex__doc__,
12271             "S.rindex(sub[, start[, end]]) -> int\n\
12272\n\
12273Like S.rfind() but raise ValueError when the substring is not found.");
12274
12275static PyObject *
12276unicode_rindex(PyObject *self, PyObject *args)
12277{
12278    PyObject *substring;
12279    Py_ssize_t start;
12280    Py_ssize_t end;
12281    Py_ssize_t result;
12282
12283    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12284                                            &start, &end))
12285        return NULL;
12286
12287    if (PyUnicode_READY(self) == -1) {
12288        Py_DECREF(substring);
12289        return NULL;
12290    }
12291    if (PyUnicode_READY(substring) == -1) {
12292        Py_DECREF(substring);
12293        return NULL;
12294    }
12295
12296    result = any_find_slice(-1, self, substring, start, end);
12297
12298    Py_DECREF(substring);
12299
12300    if (result == -2)
12301        return NULL;
12302
12303    if (result < 0) {
12304        PyErr_SetString(PyExc_ValueError, "substring not found");
12305        return NULL;
12306    }
12307
12308    return PyLong_FromSsize_t(result);
12309}
12310
12311PyDoc_STRVAR(rjust__doc__,
12312             "S.rjust(width[, fillchar]) -> str\n\
12313\n\
12314Return S right-justified in a string of length width. Padding is\n\
12315done using the specified fill character (default is a space).");
12316
12317static PyObject *
12318unicode_rjust(PyObject *self, PyObject *args)
12319{
12320    Py_ssize_t width;
12321    Py_UCS4 fillchar = ' ';
12322
12323    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12324        return NULL;
12325
12326    if (PyUnicode_READY(self) == -1)
12327        return NULL;
12328
12329    if (PyUnicode_GET_LENGTH(self) >= width)
12330        return unicode_result_unchanged(self);
12331
12332    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12333}
12334
12335PyObject *
12336PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12337{
12338    PyObject *result;
12339
12340    s = PyUnicode_FromObject(s);
12341    if (s == NULL)
12342        return NULL;
12343    if (sep != NULL) {
12344        sep = PyUnicode_FromObject(sep);
12345        if (sep == NULL) {
12346            Py_DECREF(s);
12347            return NULL;
12348        }
12349    }
12350
12351    result = split(s, sep, maxsplit);
12352
12353    Py_DECREF(s);
12354    Py_XDECREF(sep);
12355    return result;
12356}
12357
12358PyDoc_STRVAR(split__doc__,
12359             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12360\n\
12361Return a list of the words in S, using sep as the\n\
12362delimiter string.  If maxsplit is given, at most maxsplit\n\
12363splits are done. If sep is not specified or is None, any\n\
12364whitespace string is a separator and empty strings are\n\
12365removed from the result.");
12366
12367static PyObject*
12368unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12369{
12370    static char *kwlist[] = {"sep", "maxsplit", 0};
12371    PyObject *substring = Py_None;
12372    Py_ssize_t maxcount = -1;
12373
12374    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12375                                     kwlist, &substring, &maxcount))
12376        return NULL;
12377
12378    if (substring == Py_None)
12379        return split(self, NULL, maxcount);
12380    else if (PyUnicode_Check(substring))
12381        return split(self, substring, maxcount);
12382    else
12383        return PyUnicode_Split(self, substring, maxcount);
12384}
12385
12386PyObject *
12387PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12388{
12389    PyObject* str_obj;
12390    PyObject* sep_obj;
12391    PyObject* out;
12392    int kind1, kind2, kind;
12393    void *buf1 = NULL, *buf2 = NULL;
12394    Py_ssize_t len1, len2;
12395
12396    str_obj = PyUnicode_FromObject(str_in);
12397    if (!str_obj)
12398        return NULL;
12399    sep_obj = PyUnicode_FromObject(sep_in);
12400    if (!sep_obj) {
12401        Py_DECREF(str_obj);
12402        return NULL;
12403    }
12404    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12405        Py_DECREF(sep_obj);
12406        Py_DECREF(str_obj);
12407        return NULL;
12408    }
12409
12410    kind1 = PyUnicode_KIND(str_obj);
12411    kind2 = PyUnicode_KIND(sep_obj);
12412    kind = Py_MAX(kind1, kind2);
12413    buf1 = PyUnicode_DATA(str_obj);
12414    if (kind1 != kind)
12415        buf1 = _PyUnicode_AsKind(str_obj, kind);
12416    if (!buf1)
12417        goto onError;
12418    buf2 = PyUnicode_DATA(sep_obj);
12419    if (kind2 != kind)
12420        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12421    if (!buf2)
12422        goto onError;
12423    len1 = PyUnicode_GET_LENGTH(str_obj);
12424    len2 = PyUnicode_GET_LENGTH(sep_obj);
12425
12426    switch (PyUnicode_KIND(str_obj)) {
12427    case PyUnicode_1BYTE_KIND:
12428        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12429            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12430        else
12431            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12432        break;
12433    case PyUnicode_2BYTE_KIND:
12434        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12435        break;
12436    case PyUnicode_4BYTE_KIND:
12437        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12438        break;
12439    default:
12440        assert(0);
12441        out = 0;
12442    }
12443
12444    Py_DECREF(sep_obj);
12445    Py_DECREF(str_obj);
12446    if (kind1 != kind)
12447        PyMem_Free(buf1);
12448    if (kind2 != kind)
12449        PyMem_Free(buf2);
12450
12451    return out;
12452  onError:
12453    Py_DECREF(sep_obj);
12454    Py_DECREF(str_obj);
12455    if (kind1 != kind && buf1)
12456        PyMem_Free(buf1);
12457    if (kind2 != kind && buf2)
12458        PyMem_Free(buf2);
12459    return NULL;
12460}
12461
12462
12463PyObject *
12464PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12465{
12466    PyObject* str_obj;
12467    PyObject* sep_obj;
12468    PyObject* out;
12469    int kind1, kind2, kind;
12470    void *buf1 = NULL, *buf2 = NULL;
12471    Py_ssize_t len1, len2;
12472
12473    str_obj = PyUnicode_FromObject(str_in);
12474    if (!str_obj)
12475        return NULL;
12476    sep_obj = PyUnicode_FromObject(sep_in);
12477    if (!sep_obj) {
12478        Py_DECREF(str_obj);
12479        return NULL;
12480    }
12481
12482    kind1 = PyUnicode_KIND(str_in);
12483    kind2 = PyUnicode_KIND(sep_obj);
12484    kind = Py_MAX(kind1, kind2);
12485    buf1 = PyUnicode_DATA(str_in);
12486    if (kind1 != kind)
12487        buf1 = _PyUnicode_AsKind(str_in, kind);
12488    if (!buf1)
12489        goto onError;
12490    buf2 = PyUnicode_DATA(sep_obj);
12491    if (kind2 != kind)
12492        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12493    if (!buf2)
12494        goto onError;
12495    len1 = PyUnicode_GET_LENGTH(str_obj);
12496    len2 = PyUnicode_GET_LENGTH(sep_obj);
12497
12498    switch (PyUnicode_KIND(str_in)) {
12499    case PyUnicode_1BYTE_KIND:
12500        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12501            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12502        else
12503            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12504        break;
12505    case PyUnicode_2BYTE_KIND:
12506        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12507        break;
12508    case PyUnicode_4BYTE_KIND:
12509        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12510        break;
12511    default:
12512        assert(0);
12513        out = 0;
12514    }
12515
12516    Py_DECREF(sep_obj);
12517    Py_DECREF(str_obj);
12518    if (kind1 != kind)
12519        PyMem_Free(buf1);
12520    if (kind2 != kind)
12521        PyMem_Free(buf2);
12522
12523    return out;
12524  onError:
12525    Py_DECREF(sep_obj);
12526    Py_DECREF(str_obj);
12527    if (kind1 != kind && buf1)
12528        PyMem_Free(buf1);
12529    if (kind2 != kind && buf2)
12530        PyMem_Free(buf2);
12531    return NULL;
12532}
12533
12534PyDoc_STRVAR(partition__doc__,
12535             "S.partition(sep) -> (head, sep, tail)\n\
12536\n\
12537Search for the separator sep in S, and return the part before it,\n\
12538the separator itself, and the part after it.  If the separator is not\n\
12539found, return S and two empty strings.");
12540
12541static PyObject*
12542unicode_partition(PyObject *self, PyObject *separator)
12543{
12544    return PyUnicode_Partition(self, separator);
12545}
12546
12547PyDoc_STRVAR(rpartition__doc__,
12548             "S.rpartition(sep) -> (head, sep, tail)\n\
12549\n\
12550Search for the separator sep in S, starting at the end of S, and return\n\
12551the part before it, the separator itself, and the part after it.  If the\n\
12552separator is not found, return two empty strings and S.");
12553
12554static PyObject*
12555unicode_rpartition(PyObject *self, PyObject *separator)
12556{
12557    return PyUnicode_RPartition(self, separator);
12558}
12559
12560PyObject *
12561PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12562{
12563    PyObject *result;
12564
12565    s = PyUnicode_FromObject(s);
12566    if (s == NULL)
12567        return NULL;
12568    if (sep != NULL) {
12569        sep = PyUnicode_FromObject(sep);
12570        if (sep == NULL) {
12571            Py_DECREF(s);
12572            return NULL;
12573        }
12574    }
12575
12576    result = rsplit(s, sep, maxsplit);
12577
12578    Py_DECREF(s);
12579    Py_XDECREF(sep);
12580    return result;
12581}
12582
12583PyDoc_STRVAR(rsplit__doc__,
12584             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12585\n\
12586Return a list of the words in S, using sep as the\n\
12587delimiter string, starting at the end of the string and\n\
12588working to the front.  If maxsplit is given, at most maxsplit\n\
12589splits are done. If sep is not specified, any whitespace string\n\
12590is a separator.");
12591
12592static PyObject*
12593unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12594{
12595    static char *kwlist[] = {"sep", "maxsplit", 0};
12596    PyObject *substring = Py_None;
12597    Py_ssize_t maxcount = -1;
12598
12599    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12600                                     kwlist, &substring, &maxcount))
12601        return NULL;
12602
12603    if (substring == Py_None)
12604        return rsplit(self, NULL, maxcount);
12605    else if (PyUnicode_Check(substring))
12606        return rsplit(self, substring, maxcount);
12607    else
12608        return PyUnicode_RSplit(self, substring, maxcount);
12609}
12610
12611PyDoc_STRVAR(splitlines__doc__,
12612             "S.splitlines([keepends]) -> list of strings\n\
12613\n\
12614Return a list of the lines in S, breaking at line boundaries.\n\
12615Line breaks are not included in the resulting list unless keepends\n\
12616is given and true.");
12617
12618static PyObject*
12619unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12620{
12621    static char *kwlist[] = {"keepends", 0};
12622    int keepends = 0;
12623
12624    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12625                                     kwlist, &keepends))
12626        return NULL;
12627
12628    return PyUnicode_Splitlines(self, keepends);
12629}
12630
12631static
12632PyObject *unicode_str(PyObject *self)
12633{
12634    return unicode_result_unchanged(self);
12635}
12636
12637PyDoc_STRVAR(swapcase__doc__,
12638             "S.swapcase() -> str\n\
12639\n\
12640Return a copy of S with uppercase characters converted to lowercase\n\
12641and vice versa.");
12642
12643static PyObject*
12644unicode_swapcase(PyObject *self)
12645{
12646    if (PyUnicode_READY(self) == -1)
12647        return NULL;
12648    return case_operation(self, do_swapcase);
12649}
12650
12651PyDoc_STRVAR(maketrans__doc__,
12652             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
12653\n\
12654Return a translation table usable for str.translate().\n\
12655If there is only one argument, it must be a dictionary mapping Unicode\n\
12656ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
12657Character keys will be then converted to ordinals.\n\
12658If there are two arguments, they must be strings of equal length, and\n\
12659in the resulting dictionary, each character in x will be mapped to the\n\
12660character at the same position in y. If there is a third argument, it\n\
12661must be a string, whose characters will be mapped to None in the result.");
12662
12663static PyObject*
12664unicode_maketrans(PyObject *null, PyObject *args)
12665{
12666    PyObject *x, *y = NULL, *z = NULL;
12667    PyObject *new = NULL, *key, *value;
12668    Py_ssize_t i = 0;
12669    int res;
12670
12671    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
12672        return NULL;
12673    new = PyDict_New();
12674    if (!new)
12675        return NULL;
12676    if (y != NULL) {
12677        int x_kind, y_kind, z_kind;
12678        void *x_data, *y_data, *z_data;
12679
12680        /* x must be a string too, of equal length */
12681        if (!PyUnicode_Check(x)) {
12682            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12683                            "be a string if there is a second argument");
12684            goto err;
12685        }
12686        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12687            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12688                            "arguments must have equal length");
12689            goto err;
12690        }
12691        /* create entries for translating chars in x to those in y */
12692        x_kind = PyUnicode_KIND(x);
12693        y_kind = PyUnicode_KIND(y);
12694        x_data = PyUnicode_DATA(x);
12695        y_data = PyUnicode_DATA(y);
12696        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
12697            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
12698            if (!key)
12699                goto err;
12700            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
12701            if (!value) {
12702                Py_DECREF(key);
12703                goto err;
12704            }
12705            res = PyDict_SetItem(new, key, value);
12706            Py_DECREF(key);
12707            Py_DECREF(value);
12708            if (res < 0)
12709                goto err;
12710        }
12711        /* create entries for deleting chars in z */
12712        if (z != NULL) {
12713            z_kind = PyUnicode_KIND(z);
12714            z_data = PyUnicode_DATA(z);
12715            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
12716                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
12717                if (!key)
12718                    goto err;
12719                res = PyDict_SetItem(new, key, Py_None);
12720                Py_DECREF(key);
12721                if (res < 0)
12722                    goto err;
12723            }
12724        }
12725    } else {
12726        int kind;
12727        void *data;
12728
12729        /* x must be a dict */
12730        if (!PyDict_CheckExact(x)) {
12731            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
12732                            "to maketrans it must be a dict");
12733            goto err;
12734        }
12735        /* copy entries into the new dict, converting string keys to int keys */
12736        while (PyDict_Next(x, &i, &key, &value)) {
12737            if (PyUnicode_Check(key)) {
12738                /* convert string keys to integer keys */
12739                PyObject *newkey;
12740                if (PyUnicode_GET_LENGTH(key) != 1) {
12741                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
12742                                    "table must be of length 1");
12743                    goto err;
12744                }
12745                kind = PyUnicode_KIND(key);
12746                data = PyUnicode_DATA(key);
12747                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
12748                if (!newkey)
12749                    goto err;
12750                res = PyDict_SetItem(new, newkey, value);
12751                Py_DECREF(newkey);
12752                if (res < 0)
12753                    goto err;
12754            } else if (PyLong_Check(key)) {
12755                /* just keep integer keys */
12756                if (PyDict_SetItem(new, key, value) < 0)
12757                    goto err;
12758            } else {
12759                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
12760                                "be strings or integers");
12761                goto err;
12762            }
12763        }
12764    }
12765    return new;
12766  err:
12767    Py_DECREF(new);
12768    return NULL;
12769}
12770
12771PyDoc_STRVAR(translate__doc__,
12772             "S.translate(table) -> str\n\
12773\n\
12774Return a copy of the string S, where all characters have been mapped\n\
12775through the given translation table, which must be a mapping of\n\
12776Unicode ordinals to Unicode ordinals, strings, or None.\n\
12777Unmapped characters are left untouched. Characters mapped to None\n\
12778are deleted.");
12779
12780static PyObject*
12781unicode_translate(PyObject *self, PyObject *table)
12782{
12783    return _PyUnicode_TranslateCharmap(self, table, "ignore");
12784}
12785
12786PyDoc_STRVAR(upper__doc__,
12787             "S.upper() -> str\n\
12788\n\
12789Return a copy of S converted to uppercase.");
12790
12791static PyObject*
12792unicode_upper(PyObject *self)
12793{
12794    if (PyUnicode_READY(self) == -1)
12795        return NULL;
12796    if (PyUnicode_IS_ASCII(self))
12797        return ascii_upper_or_lower(self, 0);
12798    return case_operation(self, do_upper);
12799}
12800
12801PyDoc_STRVAR(zfill__doc__,
12802             "S.zfill(width) -> str\n\
12803\n\
12804Pad a numeric string S with zeros on the left, to fill a field\n\
12805of the specified width. The string S is never truncated.");
12806
12807static PyObject *
12808unicode_zfill(PyObject *self, PyObject *args)
12809{
12810    Py_ssize_t fill;
12811    PyObject *u;
12812    Py_ssize_t width;
12813    int kind;
12814    void *data;
12815    Py_UCS4 chr;
12816
12817    if (!PyArg_ParseTuple(args, "n:zfill", &width))
12818        return NULL;
12819
12820    if (PyUnicode_READY(self) == -1)
12821        return NULL;
12822
12823    if (PyUnicode_GET_LENGTH(self) >= width)
12824        return unicode_result_unchanged(self);
12825
12826    fill = width - PyUnicode_GET_LENGTH(self);
12827
12828    u = pad(self, fill, 0, '0');
12829
12830    if (u == NULL)
12831        return NULL;
12832
12833    kind = PyUnicode_KIND(u);
12834    data = PyUnicode_DATA(u);
12835    chr = PyUnicode_READ(kind, data, fill);
12836
12837    if (chr == '+' || chr == '-') {
12838        /* move sign to beginning of string */
12839        PyUnicode_WRITE(kind, data, 0, chr);
12840        PyUnicode_WRITE(kind, data, fill, '0');
12841    }
12842
12843    assert(_PyUnicode_CheckConsistency(u, 1));
12844    return u;
12845}
12846
#if 0
/* Compiled out.  Thin wrapper returning the result of
   PyUnicode_TransformDecimalAndSpaceToASCII(). */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);

    return converted;
}
#endif
12854
12855PyDoc_STRVAR(startswith__doc__,
12856             "S.startswith(prefix[, start[, end]]) -> bool\n\
12857\n\
12858Return True if S starts with the specified prefix, False otherwise.\n\
12859With optional start, test S beginning at that position.\n\
12860With optional end, stop comparing S at that position.\n\
12861prefix can also be a tuple of strings to try.");
12862
12863static PyObject *
12864unicode_startswith(PyObject *self,
12865                   PyObject *args)
12866{
12867    PyObject *subobj;
12868    PyObject *substring;
12869    Py_ssize_t start = 0;
12870    Py_ssize_t end = PY_SSIZE_T_MAX;
12871    int result;
12872
12873    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
12874        return NULL;
12875    if (PyTuple_Check(subobj)) {
12876        Py_ssize_t i;
12877        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12878            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
12879            if (substring == NULL)
12880                return NULL;
12881            result = tailmatch(self, substring, start, end, -1);
12882            Py_DECREF(substring);
12883            if (result == -1)
12884                return NULL;
12885            if (result) {
12886                Py_RETURN_TRUE;
12887            }
12888        }
12889        /* nothing matched */
12890        Py_RETURN_FALSE;
12891    }
12892    substring = PyUnicode_FromObject(subobj);
12893    if (substring == NULL) {
12894        if (PyErr_ExceptionMatches(PyExc_TypeError))
12895            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
12896                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12897        return NULL;
12898    }
12899    result = tailmatch(self, substring, start, end, -1);
12900    Py_DECREF(substring);
12901    if (result == -1)
12902        return NULL;
12903    return PyBool_FromLong(result);
12904}
12905
12906
12907PyDoc_STRVAR(endswith__doc__,
12908             "S.endswith(suffix[, start[, end]]) -> bool\n\
12909\n\
12910Return True if S ends with the specified suffix, False otherwise.\n\
12911With optional start, test S beginning at that position.\n\
12912With optional end, stop comparing S at that position.\n\
12913suffix can also be a tuple of strings to try.");
12914
12915static PyObject *
12916unicode_endswith(PyObject *self,
12917                 PyObject *args)
12918{
12919    PyObject *subobj;
12920    PyObject *substring;
12921    Py_ssize_t start = 0;
12922    Py_ssize_t end = PY_SSIZE_T_MAX;
12923    int result;
12924
12925    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
12926        return NULL;
12927    if (PyTuple_Check(subobj)) {
12928        Py_ssize_t i;
12929        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
12930            substring = PyUnicode_FromObject(
12931                PyTuple_GET_ITEM(subobj, i));
12932            if (substring == NULL)
12933                return NULL;
12934            result = tailmatch(self, substring, start, end, +1);
12935            Py_DECREF(substring);
12936            if (result == -1)
12937                return NULL;
12938            if (result) {
12939                Py_RETURN_TRUE;
12940            }
12941        }
12942        Py_RETURN_FALSE;
12943    }
12944    substring = PyUnicode_FromObject(subobj);
12945    if (substring == NULL) {
12946        if (PyErr_ExceptionMatches(PyExc_TypeError))
12947            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
12948                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
12949        return NULL;
12950    }
12951    result = tailmatch(self, substring, start, end, +1);
12952    Py_DECREF(substring);
12953    if (result == -1)
12954        return NULL;
12955    return PyBool_FromLong(result);
12956}
12957
12958Py_LOCAL_INLINE(void)
12959_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
12960{
12961    if (!writer->readonly)
12962        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
12963    else {
12964        /* Copy-on-write mode: set buffer size to 0 so
12965         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
12966         * next write. */
12967        writer->size = 0;
12968    }
12969    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
12970    writer->data = PyUnicode_DATA(writer->buffer);
12971    writer->kind = PyUnicode_KIND(writer->buffer);
12972}
12973
12974void
12975_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
12976{
12977    memset(writer, 0, sizeof(*writer));
12978#ifdef Py_DEBUG
12979    writer->kind = 5;    /* invalid kind */
12980#endif
12981    writer->min_char = 127;
12982}
12983
12984int
12985_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
12986                                 Py_ssize_t length, Py_UCS4 maxchar)
12987{
12988    Py_ssize_t newlen;
12989    PyObject *newbuffer;
12990
12991    assert(length > 0);
12992
12993    if (length > PY_SSIZE_T_MAX - writer->pos) {
12994        PyErr_NoMemory();
12995        return -1;
12996    }
12997    newlen = writer->pos + length;
12998
12999    maxchar = Py_MAX(maxchar, writer->min_char);
13000
13001    if (writer->buffer == NULL) {
13002        assert(!writer->readonly);
13003        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
13004            /* overallocate 25% to limit the number of resize */
13005            newlen += newlen / 4;
13006        }
13007        if (newlen < writer->min_length)
13008            newlen = writer->min_length;
13009
13010        writer->buffer = PyUnicode_New(newlen, maxchar);
13011        if (writer->buffer == NULL)
13012            return -1;
13013    }
13014    else if (newlen > writer->size) {
13015        if (writer->overallocate && newlen <= (PY_SSIZE_T_MAX - newlen / 4)) {
13016            /* overallocate 25% to limit the number of resize */
13017            newlen += newlen / 4;
13018        }
13019        if (newlen < writer->min_length)
13020            newlen = writer->min_length;
13021
13022        if (maxchar > writer->maxchar || writer->readonly) {
13023            /* resize + widen */
13024            newbuffer = PyUnicode_New(newlen, maxchar);
13025            if (newbuffer == NULL)
13026                return -1;
13027            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13028                                          writer->buffer, 0, writer->pos);
13029            Py_DECREF(writer->buffer);
13030            writer->readonly = 0;
13031        }
13032        else {
13033            newbuffer = resize_compact(writer->buffer, newlen);
13034            if (newbuffer == NULL)
13035                return -1;
13036        }
13037        writer->buffer = newbuffer;
13038    }
13039    else if (maxchar > writer->maxchar) {
13040        assert(!writer->readonly);
13041        newbuffer = PyUnicode_New(writer->size, maxchar);
13042        if (newbuffer == NULL)
13043            return -1;
13044        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13045                                      writer->buffer, 0, writer->pos);
13046        Py_DECREF(writer->buffer);
13047        writer->buffer = newbuffer;
13048    }
13049    _PyUnicodeWriter_Update(writer);
13050    return 0;
13051}
13052
13053Py_LOCAL_INLINE(int)
13054_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13055{
13056    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13057        return -1;
13058    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13059    writer->pos++;
13060    return 0;
13061}
13062
13063int
13064_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13065{
13066    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13067}
13068
13069int
13070_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13071{
13072    Py_UCS4 maxchar;
13073    Py_ssize_t len;
13074
13075    if (PyUnicode_READY(str) == -1)
13076        return -1;
13077    len = PyUnicode_GET_LENGTH(str);
13078    if (len == 0)
13079        return 0;
13080    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13081    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13082        if (writer->buffer == NULL && !writer->overallocate) {
13083            writer->readonly = 1;
13084            Py_INCREF(str);
13085            writer->buffer = str;
13086            _PyUnicodeWriter_Update(writer);
13087            writer->pos += len;
13088            return 0;
13089        }
13090        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13091            return -1;
13092    }
13093    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13094                                  str, 0, len);
13095    writer->pos += len;
13096    return 0;
13097}
13098
13099int
13100_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13101                                Py_ssize_t start, Py_ssize_t end)
13102{
13103    Py_UCS4 maxchar;
13104    Py_ssize_t len;
13105
13106    if (PyUnicode_READY(str) == -1)
13107        return -1;
13108
13109    assert(0 <= start);
13110    assert(end <= PyUnicode_GET_LENGTH(str));
13111    assert(start <= end);
13112
13113    if (end == 0)
13114        return 0;
13115
13116    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13117        return _PyUnicodeWriter_WriteStr(writer, str);
13118
13119    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13120        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13121    else
13122        maxchar = writer->maxchar;
13123    len = end - start;
13124
13125    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13126        return -1;
13127
13128    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13129                                  str, start, len);
13130    writer->pos += len;
13131    return 0;
13132}
13133
13134int
13135_PyUnicodeWriter_WriteCstr(_PyUnicodeWriter *writer, const char *str, Py_ssize_t len)
13136{
13137    Py_UCS4 maxchar;
13138
13139    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13140    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13141        return -1;
13142    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13143    writer->pos += len;
13144    return 0;
13145}
13146
13147PyObject *
13148_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13149{
13150    if (writer->pos == 0) {
13151        Py_XDECREF(writer->buffer);
13152        _Py_RETURN_UNICODE_EMPTY();
13153    }
13154    if (writer->readonly) {
13155        assert(PyUnicode_GET_LENGTH(writer->buffer) == writer->pos);
13156        return writer->buffer;
13157    }
13158    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13159        PyObject *newbuffer;
13160        newbuffer = resize_compact(writer->buffer, writer->pos);
13161        if (newbuffer == NULL) {
13162            Py_DECREF(writer->buffer);
13163            return NULL;
13164        }
13165        writer->buffer = newbuffer;
13166    }
13167    assert(_PyUnicode_CheckConsistency(writer->buffer, 1));
13168    return unicode_result_ready(writer->buffer);
13169}
13170
13171void
13172_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13173{
13174    Py_CLEAR(writer->buffer);
13175}
13176
13177#include "stringlib/unicode_format.h"
13178
13179PyDoc_STRVAR(format__doc__,
13180             "S.format(*args, **kwargs) -> str\n\
13181\n\
13182Return a formatted version of S, using substitutions from args and kwargs.\n\
13183The substitutions are identified by braces ('{' and '}').");
13184
13185PyDoc_STRVAR(format_map__doc__,
13186             "S.format_map(mapping) -> str\n\
13187\n\
13188Return a formatted version of S, using substitutions from mapping.\n\
13189The substitutions are identified by braces ('{' and '}').");
13190
13191static PyObject *
13192unicode__format__(PyObject* self, PyObject* args)
13193{
13194    PyObject *format_spec;
13195    _PyUnicodeWriter writer;
13196    int ret;
13197
13198    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13199        return NULL;
13200
13201    if (PyUnicode_READY(self) == -1)
13202        return NULL;
13203    _PyUnicodeWriter_Init(&writer);
13204    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13205                                          self, format_spec, 0,
13206                                          PyUnicode_GET_LENGTH(format_spec));
13207    if (ret == -1) {
13208        _PyUnicodeWriter_Dealloc(&writer);
13209        return NULL;
13210    }
13211    return _PyUnicodeWriter_Finish(&writer);
13212}
13213
13214PyDoc_STRVAR(p_format__doc__,
13215             "S.__format__(format_spec) -> str\n\
13216\n\
13217Return a formatted version of S as described by format_spec.");
13218
13219static PyObject *
13220unicode__sizeof__(PyObject *v)
13221{
13222    Py_ssize_t size;
13223
13224    /* If it's a compact object, account for base structure +
13225       character data. */
13226    if (PyUnicode_IS_COMPACT_ASCII(v))
13227        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13228    else if (PyUnicode_IS_COMPACT(v))
13229        size = sizeof(PyCompactUnicodeObject) +
13230            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13231    else {
13232        /* If it is a two-block object, account for base object, and
13233           for character block if present. */
13234        size = sizeof(PyUnicodeObject);
13235        if (_PyUnicode_DATA_ANY(v))
13236            size += (PyUnicode_GET_LENGTH(v) + 1) *
13237                PyUnicode_KIND(v);
13238    }
13239    /* If the wstr pointer is present, account for it unless it is shared
13240       with the data pointer. Check if the data is not shared. */
13241    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13242        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13243    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13244        size += PyUnicode_UTF8_LENGTH(v) + 1;
13245
13246    return PyLong_FromSsize_t(size);
13247}
13248
/* Docstring attached to the "__sizeof__" entry of unicode_methods. */
PyDoc_STRVAR(sizeof__doc__,
             "S.__sizeof__() -> size of S in memory, in bytes");
13251
13252static PyObject *
13253unicode_getnewargs(PyObject *v)
13254{
13255    PyObject *copy = _PyUnicode_Copy(v);
13256    if (!copy)
13257        return NULL;
13258    return Py_BuildValue("(N)", copy);
13259}
13260
/* Method table: each entry maps a Python-level method name to its C
   implementation, its calling-convention flags (METH_*) and, where one is
   given, its docstring.  The table is terminated by a {NULL, NULL}
   sentinel entry. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans is the only entry flagged METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    /* These methods are just used for debugging the implementation. */
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* Registered without a docstring. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
13317
13318static PyObject *
13319unicode_mod(PyObject *v, PyObject *w)
13320{
13321    if (!PyUnicode_Check(v))
13322        Py_RETURN_NOTIMPLEMENTED;
13323    return PyUnicode_Format(v, w);
13324}
13325
/* Number protocol slots.  Only nb_remainder is filled in (with
   unicode_mod); the slots listed before it are explicitly 0 and the slots
   after it are left to C's implicit zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
13332
/* Sequence protocol slots: length, concatenation, repetition, item access
   and containment are provided; the slice and item-assignment slots are 0.
   Slots after sq_contains are left to C's implicit zero-initialization. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
13343
13344static PyObject*
13345unicode_subscript(PyObject* self, PyObject* item)
13346{
13347    if (PyUnicode_READY(self) == -1)
13348        return NULL;
13349
13350    if (PyIndex_Check(item)) {
13351        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13352        if (i == -1 && PyErr_Occurred())
13353            return NULL;
13354        if (i < 0)
13355            i += PyUnicode_GET_LENGTH(self);
13356        return unicode_getitem(self, i);
13357    } else if (PySlice_Check(item)) {
13358        Py_ssize_t start, stop, step, slicelength, cur, i;
13359        PyObject *result;
13360        void *src_data, *dest_data;
13361        int src_kind, dest_kind;
13362        Py_UCS4 ch, max_char, kind_limit;
13363
13364        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13365                                 &start, &stop, &step, &slicelength) < 0) {
13366            return NULL;
13367        }
13368
13369        if (slicelength <= 0) {
13370            _Py_RETURN_UNICODE_EMPTY();
13371        } else if (start == 0 && step == 1 &&
13372                   slicelength == PyUnicode_GET_LENGTH(self)) {
13373            return unicode_result_unchanged(self);
13374        } else if (step == 1) {
13375            return PyUnicode_Substring(self,
13376                                       start, start + slicelength);
13377        }
13378        /* General case */
13379        src_kind = PyUnicode_KIND(self);
13380        src_data = PyUnicode_DATA(self);
13381        if (!PyUnicode_IS_ASCII(self)) {
13382            kind_limit = kind_maxchar_limit(src_kind);
13383            max_char = 0;
13384            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13385                ch = PyUnicode_READ(src_kind, src_data, cur);
13386                if (ch > max_char) {
13387                    max_char = ch;
13388                    if (max_char >= kind_limit)
13389                        break;
13390                }
13391            }
13392        }
13393        else
13394            max_char = 127;
13395        result = PyUnicode_New(slicelength, max_char);
13396        if (result == NULL)
13397            return NULL;
13398        dest_kind = PyUnicode_KIND(result);
13399        dest_data = PyUnicode_DATA(result);
13400
13401        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13402            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13403            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13404        }
13405        assert(_PyUnicode_CheckConsistency(result, 1));
13406        return result;
13407    } else {
13408        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13409        return NULL;
13410    }
13411}
13412
/* Mapping protocol slots: length and subscripting (unicode_subscript) are
   provided; the subscript-assignment slot is 0. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
13418
13419
13420/* Helpers for PyUnicode_Format() */
13421
/* State shared by the %-formatting helpers while one format string is being
   processed. */
struct unicode_formatter_t {
    PyObject *args;             /* argument tuple, or the single value to
                                   format (see arglen) */
    int args_owned;             /* non-zero when this struct holds its own
                                   reference to args (set after a "%(key)"
                                   mapping lookup) */
    Py_ssize_t arglen, argidx;  /* arglen: negative when args is a single
                                   object rather than a tuple to index;
                                   argidx: index of the next argument to
                                   hand out */
    PyObject *dict;             /* mapping used for "%(key)" lookups; NULL
                                   makes such lookups fail with TypeError */

    enum PyUnicode_Kind fmtkind;    /* kind used to read fmtdata */
    Py_ssize_t fmtcnt, fmtpos;  /* fmtcnt: countdown of format characters
                                   left, negative once the parser has run
                                   off the end; fmtpos: index of the next
                                   character to read */
    void *fmtdata;              /* character data of the format string */
    PyObject *fmtstr;           /* the format string object (used to slice
                                   out mapping keys) */

    _PyUnicodeWriter writer;    /* output accumulator */
};
13435
/* Parsed description of a single "%..." conversion specifier. */
struct unicode_format_arg_t {
    Py_UCS4 ch;         /* character being examined; once parsing is done,
                           the conversion type (e.g. 's', 'd', 'f') */
    int flags;          /* bitmask of F_LJUST, F_SIGN, F_BLANK, F_ALT and
                           F_ZERO */
    Py_ssize_t width;   /* minimum field width; the helpers test for -1 to
                           mean "not specified" */
    int prec;           /* precision; the helpers test for -1 to mean "not
                           specified" */
    int sign;           /* set to 1 by the numeric conversions so that the
                           output stage applies sign handling */
};
13443
13444static PyObject *
13445unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13446{
13447    Py_ssize_t argidx = ctx->argidx;
13448
13449    if (argidx < ctx->arglen) {
13450        ctx->argidx++;
13451        if (ctx->arglen < 0)
13452            return ctx->args;
13453        else
13454            return PyTuple_GetItem(ctx->args, argidx);
13455    }
13456    PyErr_SetString(PyExc_TypeError,
13457                    "not enough arguments for format string");
13458    return NULL;
13459}
13460
13461/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13462
13463/* Format a float into the writer if the writer is not NULL, or into *p_output
13464   otherwise.
13465
13466   Return 0 on success, raise an exception and return -1 on error. */
13467static int
13468formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13469            PyObject **p_output,
13470            _PyUnicodeWriter *writer)
13471{
13472    char *p;
13473    double x;
13474    Py_ssize_t len;
13475    int prec;
13476    int dtoa_flags;
13477
13478    x = PyFloat_AsDouble(v);
13479    if (x == -1.0 && PyErr_Occurred())
13480        return -1;
13481
13482    prec = arg->prec;
13483    if (prec < 0)
13484        prec = 6;
13485
13486    if (arg->flags & F_ALT)
13487        dtoa_flags = Py_DTSF_ALT;
13488    else
13489        dtoa_flags = 0;
13490    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13491    if (p == NULL)
13492        return -1;
13493    len = strlen(p);
13494    if (writer) {
13495        if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1) {
13496            PyMem_Free(p);
13497            return -1;
13498        }
13499        unicode_write_cstr(writer->buffer, writer->pos, p, len);
13500        writer->pos += len;
13501    }
13502    else
13503        *p_output = _PyUnicode_FromASCII(p, len);
13504    PyMem_Free(p);
13505    return 0;
13506}
13507
13508/* formatlong() emulates the format codes d, u, o, x and X, and
13509 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13510 * Python's regular ints.
13511 * Return value:  a new PyUnicodeObject*, or NULL if error.
13512 *     The output string is of the form
13513 *         "-"? ("0x" | "0X")? digit+
13514 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13515 *         set in flags.  The case of hex digits will be correct,
13516 *     There will be at least prec digits, zero-filled on the left if
13517 *         necessary to get that many.
13518 * val          object to be converted
13519 * flags        bitmask of format flags; only F_ALT is looked at
13520 * prec         minimum number of digits; 0-fill on left if needed
13521 * type         a character in [duoxX]; u acts the same as d
13522 *
13523 * CAUTION:  o, x and X conversions on regular ints can never
13524 * produce a '-' sign, but can for Python's unbounded ints.
13525 */
13526static PyObject*
13527formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13528{
13529    PyObject *result = NULL;
13530    char *buf;
13531    Py_ssize_t i;
13532    int sign;           /* 1 if '-', else 0 */
13533    int len;            /* number of characters */
13534    Py_ssize_t llen;
13535    int numdigits;      /* len == numnondigits + numdigits */
13536    int numnondigits = 0;
13537    int prec = arg->prec;
13538    int type = arg->ch;
13539
13540    /* Avoid exceeding SSIZE_T_MAX */
13541    if (prec > INT_MAX-3) {
13542        PyErr_SetString(PyExc_OverflowError,
13543                        "precision too large");
13544        return NULL;
13545    }
13546
13547    assert(PyLong_Check(val));
13548
13549    switch (type) {
13550    default:
13551        assert(!"'type' not in [diuoxX]");
13552    case 'd':
13553    case 'i':
13554    case 'u':
13555        /* Special-case boolean: we want 0/1 */
13556        if (PyBool_Check(val))
13557            result = PyNumber_ToBase(val, 10);
13558        else
13559            result = Py_TYPE(val)->tp_str(val);
13560        break;
13561    case 'o':
13562        numnondigits = 2;
13563        result = PyNumber_ToBase(val, 8);
13564        break;
13565    case 'x':
13566    case 'X':
13567        numnondigits = 2;
13568        result = PyNumber_ToBase(val, 16);
13569        break;
13570    }
13571    if (!result)
13572        return NULL;
13573
13574    assert(unicode_modifiable(result));
13575    assert(PyUnicode_IS_READY(result));
13576    assert(PyUnicode_IS_ASCII(result));
13577
13578    /* To modify the string in-place, there can only be one reference. */
13579    if (Py_REFCNT(result) != 1) {
13580        PyErr_BadInternalCall();
13581        return NULL;
13582    }
13583    buf = PyUnicode_DATA(result);
13584    llen = PyUnicode_GET_LENGTH(result);
13585    if (llen > INT_MAX) {
13586        PyErr_SetString(PyExc_ValueError,
13587                        "string too large in _PyBytes_FormatLong");
13588        return NULL;
13589    }
13590    len = (int)llen;
13591    sign = buf[0] == '-';
13592    numnondigits += sign;
13593    numdigits = len - numnondigits;
13594    assert(numdigits > 0);
13595
13596    /* Get rid of base marker unless F_ALT */
13597    if (((arg->flags & F_ALT) == 0 &&
13598        (type == 'o' || type == 'x' || type == 'X'))) {
13599        assert(buf[sign] == '0');
13600        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13601               buf[sign+1] == 'o');
13602        numnondigits -= 2;
13603        buf += 2;
13604        len -= 2;
13605        if (sign)
13606            buf[0] = '-';
13607        assert(len == numnondigits + numdigits);
13608        assert(numdigits > 0);
13609    }
13610
13611    /* Fill with leading zeroes to meet minimum width. */
13612    if (prec > numdigits) {
13613        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13614                                numnondigits + prec);
13615        char *b1;
13616        if (!r1) {
13617            Py_DECREF(result);
13618            return NULL;
13619        }
13620        b1 = PyBytes_AS_STRING(r1);
13621        for (i = 0; i < numnondigits; ++i)
13622            *b1++ = *buf++;
13623        for (i = 0; i < prec - numdigits; i++)
13624            *b1++ = '0';
13625        for (i = 0; i < numdigits; i++)
13626            *b1++ = *buf++;
13627        *b1 = '\0';
13628        Py_DECREF(result);
13629        result = r1;
13630        buf = PyBytes_AS_STRING(result);
13631        len = numnondigits + prec;
13632    }
13633
13634    /* Fix up case for hex conversions. */
13635    if (type == 'X') {
13636        /* Need to convert all lower case letters to upper case.
13637           and need to convert 0x to 0X (and -0x to -0X). */
13638        for (i = 0; i < len; i++)
13639            if (buf[i] >= 'a' && buf[i] <= 'x')
13640                buf[i] -= 'a'-'A';
13641    }
13642    if (!PyUnicode_Check(result)
13643        || buf != PyUnicode_DATA(result)) {
13644        PyObject *unicode;
13645        unicode = _PyUnicode_FromASCII(buf, len);
13646        Py_DECREF(result);
13647        result = unicode;
13648    }
13649    else if (len != PyUnicode_GET_LENGTH(result)) {
13650        if (PyUnicode_Resize(&result, len) < 0)
13651            Py_CLEAR(result);
13652    }
13653    return result;
13654}
13655
13656/* Format an integer.
13657 * Return 1 if the number has been formatted into the writer,
13658 *        0 if the number has been formatted into *p_output
13659 *       -1 and raise an exception on error */
13660static int
13661mainformatlong(PyObject *v,
13662               struct unicode_format_arg_t *arg,
13663               PyObject **p_output,
13664               _PyUnicodeWriter *writer)
13665{
13666    PyObject *iobj, *res;
13667    char type = (char)arg->ch;
13668
13669    if (!PyNumber_Check(v))
13670        goto wrongtype;
13671
13672    if (!PyLong_Check(v)) {
13673        iobj = PyNumber_Long(v);
13674        if (iobj == NULL) {
13675            if (PyErr_ExceptionMatches(PyExc_TypeError))
13676                goto wrongtype;
13677            return -1;
13678        }
13679        assert(PyLong_Check(iobj));
13680    }
13681    else {
13682        iobj = v;
13683        Py_INCREF(iobj);
13684    }
13685
13686    if (PyLong_CheckExact(v)
13687        && arg->width == -1 && arg->prec == -1
13688        && !(arg->flags & (F_SIGN | F_BLANK))
13689        && type != 'X')
13690    {
13691        /* Fast path */
13692        int alternate = arg->flags & F_ALT;
13693        int base;
13694
13695        switch(type)
13696        {
13697            default:
13698                assert(0 && "'type' not in [diuoxX]");
13699            case 'd':
13700            case 'i':
13701            case 'u':
13702                base = 10;
13703                break;
13704            case 'o':
13705                base = 8;
13706                break;
13707            case 'x':
13708            case 'X':
13709                base = 16;
13710                break;
13711        }
13712
13713        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
13714            Py_DECREF(iobj);
13715            return -1;
13716        }
13717        Py_DECREF(iobj);
13718        return 1;
13719    }
13720
13721    res = formatlong(iobj, arg);
13722    Py_DECREF(iobj);
13723    if (res == NULL)
13724        return -1;
13725    *p_output = res;
13726    return 0;
13727
13728wrongtype:
13729    PyErr_Format(PyExc_TypeError,
13730            "%%%c format: a number is required, "
13731            "not %.200s",
13732            type, Py_TYPE(v)->tp_name);
13733    return -1;
13734}
13735
13736static Py_UCS4
13737formatchar(PyObject *v)
13738{
13739    /* presume that the buffer is at least 3 characters long */
13740    if (PyUnicode_Check(v)) {
13741        if (PyUnicode_GET_LENGTH(v) == 1) {
13742            return PyUnicode_READ_CHAR(v, 0);
13743        }
13744        goto onError;
13745    }
13746    else {
13747        /* Integer input truncated to a character */
13748        long x;
13749        x = PyLong_AsLong(v);
13750        if (x == -1 && PyErr_Occurred())
13751            goto onError;
13752
13753        if (x < 0 || x > MAX_UNICODE) {
13754            PyErr_SetString(PyExc_OverflowError,
13755                            "%c arg not in range(0x110000)");
13756            return (Py_UCS4) -1;
13757        }
13758
13759        return (Py_UCS4) x;
13760    }
13761
13762  onError:
13763    PyErr_SetString(PyExc_TypeError,
13764                    "%c requires int or char");
13765    return (Py_UCS4) -1;
13766}
13767
/* Parse options of an argument: flags, width, precision.
   Handle also "%(name)" syntax.

   The parser starts from the character already stored in arg->ch
   (NOTE(review): assumes the caller loaded arg->ch from ctx->fmtpos before
   calling -- confirm against the caller) and consumes format characters
   through ctx->fmtpos / ctx->fmtcnt.  The parsed options are stored in
   arg->flags, arg->width and arg->prec; on success arg->ch holds the
   character following the options, i.e. the conversion type.

   Return 0 on success.
   Raise an exception and return -1 on error. */
static int
unicode_format_arg_parse(struct unicode_formatter_t *ctx,
                         struct unicode_format_arg_t *arg)
{
/* Read the format character at the current position without advancing. */
#define FORMAT_READ(ctx) \
        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)

    PyObject *v;

    if (arg->ch == '(') {
        /* Get argument value from a dictionary. Example: "%(name)s". */
        Py_ssize_t keystart;
        Py_ssize_t keylen;
        PyObject *key;
        int pcount = 1;     /* depth of currently open parentheses */

        if (ctx->dict == NULL) {
            PyErr_SetString(PyExc_TypeError,
                            "format requires a mapping");
            return -1;
        }
        ++ctx->fmtpos;
        --ctx->fmtcnt;
        keystart = ctx->fmtpos;
        /* Skip over balanced parentheses */
        while (pcount > 0 && --ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            if (arg->ch == ')')
                --pcount;
            else if (arg->ch == '(')
                ++pcount;
            ctx->fmtpos++;
        }
        /* fmtpos is now one past the closing ')'; exclude it from the key. */
        keylen = ctx->fmtpos - keystart - 1;
        if (ctx->fmtcnt < 0 || pcount > 0) {
            PyErr_SetString(PyExc_ValueError,
                            "incomplete format key");
            return -1;
        }
        key = PyUnicode_Substring(ctx->fmtstr,
                                  keystart, keystart + keylen);
        if (key == NULL)
            return -1;
        /* Drop the value of a previous mapping lookup before replacing it. */
        if (ctx->args_owned) {
            Py_DECREF(ctx->args);
            ctx->args_owned = 0;
        }
        ctx->args = PyObject_GetItem(ctx->dict, key);
        Py_DECREF(key);
        if (ctx->args == NULL)
            return -1;
        ctx->args_owned = 1;
        /* A negative arglen marks args as a single value; with argidx at -2
           exactly one argument fetch succeeds before the arguments count as
           exhausted. */
        ctx->arglen = -1;
        ctx->argidx = -2;
    }

    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
    while (--ctx->fmtcnt >= 0) {
        arg->ch = FORMAT_READ(ctx);
        ctx->fmtpos++;
        switch (arg->ch) {
        case '-': arg->flags |= F_LJUST; continue;
        case '+': arg->flags |= F_SIGN; continue;
        case ' ': arg->flags |= F_BLANK; continue;
        case '#': arg->flags |= F_ALT; continue;
        case '0': arg->flags |= F_ZERO; continue;
        }
        /* Not a flag character: leave it in arg->ch for the next stage. */
        break;
    }

    /* Parse width. Example: "%10s" => width=10 */
    if (arg->ch == '*') {
        /* "*": the width is taken from the next argument. */
        v = unicode_format_getnextarg(ctx);
        if (v == NULL)
            return -1;
        if (!PyLong_Check(v)) {
            PyErr_SetString(PyExc_TypeError,
                            "* wants int");
            return -1;
        }
        arg->width = PyLong_AsSsize_t(v);
        if (arg->width == -1 && PyErr_Occurred())
            return -1;
        /* A negative width means left-justify within its absolute value.
           NOTE(review): the negation would overflow if the value were
           PY_SSIZE_T_MIN -- confirm whether that can reach here. */
        if (arg->width < 0) {
            arg->flags |= F_LJUST;
            arg->width = -arg->width;
        }
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
    }
    else if (arg->ch >= '0' && arg->ch <= '9') {
        arg->width = arg->ch - '0';
        while (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
            if (arg->ch < '0' || arg->ch > '9')
                break;
            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
               mixing signed and unsigned comparison. Since arg->ch is between
               '0' and '9', casting to int is safe. */
            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
                PyErr_SetString(PyExc_ValueError,
                                "width too big");
                return -1;
            }
            arg->width = arg->width*10 + (arg->ch - '0');
        }
    }

    /* Parse precision. Example: "%.3f" => prec=3 */
    if (arg->ch == '.') {
        /* A '.' with no digits after it means a precision of 0. */
        arg->prec = 0;
        if (--ctx->fmtcnt >= 0) {
            arg->ch = FORMAT_READ(ctx);
            ctx->fmtpos++;
        }
        if (arg->ch == '*') {
            /* "*": the precision is taken from the next argument. */
            v = unicode_format_getnextarg(ctx);
            if (v == NULL)
                return -1;
            if (!PyLong_Check(v)) {
                PyErr_SetString(PyExc_TypeError,
                                "* wants int");
                return -1;
            }
            arg->prec = _PyLong_AsInt(v);
            if (arg->prec == -1 && PyErr_Occurred())
                return -1;
            /* A negative precision is clamped to 0. */
            if (arg->prec < 0)
                arg->prec = 0;
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
        else if (arg->ch >= '0' && arg->ch <= '9') {
            arg->prec = arg->ch - '0';
            while (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
                if (arg->ch < '0' || arg->ch > '9')
                    break;
                /* Same overflow guard as for the width, against INT_MAX. */
                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
                    PyErr_SetString(PyExc_ValueError,
                                    "precision too big");
                    return -1;
                }
                arg->prec = arg->prec*10 + (arg->ch - '0');
            }
        }
    }

    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
    if (ctx->fmtcnt >= 0) {
        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
            if (--ctx->fmtcnt >= 0) {
                arg->ch = FORMAT_READ(ctx);
                ctx->fmtpos++;
            }
        }
    }
    /* A negative counter means the format string ended before a conversion
       character was read. */
    if (ctx->fmtcnt < 0) {
        PyErr_SetString(PyExc_ValueError,
                        "incomplete format");
        return -1;
    }
    return 0;

#undef FORMAT_READ
}
13946
13947/* Format one argument. Supported conversion specifiers:
13948
13949   - "s", "r", "a": any type
13950   - "i", "d", "u", "o", "x", "X": int
13951   - "e", "E", "f", "F", "g", "G": float
13952   - "c": int or str (1 character)
13953
13954   When possible, the output is written directly into the Unicode writer
13955   (ctx->writer). A string is created when padding is required.
13956
13957   Return 0 if the argument has been formatted into *p_str,
13958          1 if the argument has been written into ctx->writer,
13959         -1 on error. */
13960static int
13961unicode_format_arg_format(struct unicode_formatter_t *ctx,
13962                          struct unicode_format_arg_t *arg,
13963                          PyObject **p_str)
13964{
13965    PyObject *v;
13966    _PyUnicodeWriter *writer = &ctx->writer;
13967
13968    if (ctx->fmtcnt == 0)
13969        ctx->writer.overallocate = 0;
13970
13971    if (arg->ch == '%') {
13972        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
13973            return -1;
13974        return 1;
13975    }
13976
13977    v = unicode_format_getnextarg(ctx);
13978    if (v == NULL)
13979        return -1;
13980
13981
13982    switch (arg->ch) {
13983    case 's':
13984    case 'r':
13985    case 'a':
13986        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
13987            /* Fast path */
13988            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
13989                return -1;
13990            return 1;
13991        }
13992
13993        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
13994            *p_str = v;
13995            Py_INCREF(*p_str);
13996        }
13997        else {
13998            if (arg->ch == 's')
13999                *p_str = PyObject_Str(v);
14000            else if (arg->ch == 'r')
14001                *p_str = PyObject_Repr(v);
14002            else
14003                *p_str = PyObject_ASCII(v);
14004        }
14005        break;
14006
14007    case 'i':
14008    case 'd':
14009    case 'u':
14010    case 'o':
14011    case 'x':
14012    case 'X':
14013    {
14014        int ret = mainformatlong(v, arg, p_str, writer);
14015        if (ret != 0)
14016            return ret;
14017        arg->sign = 1;
14018        break;
14019    }
14020
14021    case 'e':
14022    case 'E':
14023    case 'f':
14024    case 'F':
14025    case 'g':
14026    case 'G':
14027        if (arg->width == -1 && arg->prec == -1
14028            && !(arg->flags & (F_SIGN | F_BLANK)))
14029        {
14030            /* Fast path */
14031            if (formatfloat(v, arg, NULL, writer) == -1)
14032                return -1;
14033            return 1;
14034        }
14035
14036        arg->sign = 1;
14037        if (formatfloat(v, arg, p_str, NULL) == -1)
14038            return -1;
14039        break;
14040
14041    case 'c':
14042    {
14043        Py_UCS4 ch = formatchar(v);
14044        if (ch == (Py_UCS4) -1)
14045            return -1;
14046        if (arg->width == -1 && arg->prec == -1) {
14047            /* Fast path */
14048            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14049                return -1;
14050            return 1;
14051        }
14052        *p_str = PyUnicode_FromOrdinal(ch);
14053        break;
14054    }
14055
14056    default:
14057        PyErr_Format(PyExc_ValueError,
14058                     "unsupported format character '%c' (0x%x) "
14059                     "at index %zd",
14060                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14061                     (int)arg->ch,
14062                     ctx->fmtpos - 1);
14063        return -1;
14064    }
14065    if (*p_str == NULL)
14066        return -1;
14067    assert (PyUnicode_Check(*p_str));
14068    return 0;
14069}
14070
/* Helper of unicode_format_arg(): write the already formatted string `str`
   into ctx->writer, applying the width, precision, sign and padding rules
   stored in `arg`.

   `arg->width` and `arg->sign` are modified in place while the output is
   laid out, so `arg` must not be reused afterwards.

   Return 0 on success.  Return -1 if readying `str`, writing it, or
   preparing the writer fails. */
14071static int
14072unicode_format_arg_output(struct unicode_formatter_t *ctx,
14073                          struct unicode_format_arg_t *arg,
14074                          PyObject *str)
14075{
14076    Py_ssize_t len;
14077    enum PyUnicode_Kind kind;
14078    void *pbuf;
14079    Py_ssize_t pindex;
14080    Py_UCS4 signchar;
14081    Py_ssize_t buflen;
14082    Py_UCS4 maxchar;
14083    Py_ssize_t sublen;
14084    _PyUnicodeWriter *writer = &ctx->writer;
14085    Py_UCS4 fill;
14086
    /* Left padding uses '0' only when arg->sign is set and F_ZERO was
       requested; in every other case it uses spaces. */
14087    fill = ' ';
14088    if (arg->sign && arg->flags & F_ZERO)
14089        fill = '0';
14090
14091    if (PyUnicode_READY(str) == -1)
14092        return -1;
14093
14094    len = PyUnicode_GET_LENGTH(str);
    /* Nothing to pad (width unset or not larger than len), nothing to
       truncate (precision unset or not smaller than len) and no forced
       sign/blank: str can be written unchanged. */
14095    if ((arg->width == -1 || arg->width <= len)
14096        && (arg->prec == -1 || arg->prec >= len)
14097        && !(arg->flags & (F_SIGN | F_BLANK)))
14098    {
14099        /* Fast path */
14100        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14101            return -1;
14102        return 0;
14103    }
14104
14105    /* Truncate the string for "s", "r" and "a" formats
14106       if the precision is set */
14107    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14108        if (arg->prec >= 0 && len > arg->prec)
14109            len = arg->prec;
14110    }
14111
14112    /* Adjust sign and width */
14113    kind = PyUnicode_KIND(str);
14114    pbuf = PyUnicode_DATA(str);
14115    pindex = 0;
14116    signchar = '\0';
14117    if (arg->sign) {
        /* A leading '-' or '+' is removed from the characters to copy
           (len and pindex are adjusted) and remembered in signchar, so it
           can be written separately from the padding. */
14118        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14119        if (ch == '-' || ch == '+') {
14120            signchar = ch;
14121            len--;
14122            pindex++;
14123        }
14124        else if (arg->flags & F_SIGN)
14125            signchar = '+';
14126        else if (arg->flags & F_BLANK)
14127            signchar = ' ';
14128        else
            /* No sign character will be written at all. */
14129            arg->sign = 0;
14130    }
14131    if (arg->width < len)
14132        arg->width = len;
14133
14134    /* Prepare the writer */
14135    maxchar = writer->maxchar;
    /* The fill character only ends up in the output when left padding
       will actually be written. */
14136    if (!(arg->flags & F_LJUST)) {
14137        if (arg->sign) {
14138            if ((arg->width-1) > len)
14139                maxchar = Py_MAX(maxchar, fill);
14140        }
14141        else {
14142            if (arg->width > len)
14143                maxchar = Py_MAX(maxchar, fill);
14144        }
14145    }
14146    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14147        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14148        maxchar = Py_MAX(maxchar, strmaxchar);
14149    }
14150
    /* len does not count the sign character: when the remaining characters
       alone fill the width, one extra slot is needed for the sign. */
14151    buflen = arg->width;
14152    if (arg->sign && len == arg->width)
14153        buflen++;
14154    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14155        return -1;
14156
14157    /* Write the sign if needed */
14158    if (arg->sign) {
        /* With a non-space fill the sign goes before the padding; with
           space padding it is written later, after the padding. */
14159        if (fill != ' ') {
14160            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14161            writer->pos += 1;
14162        }
        /* The sign consumes one column of the requested width. */
14163        if (arg->width > len)
14164            arg->width--;
14165    }
14166
14167    /* Write the numeric prefix for "x", "X" and "o" formats
14168       if the alternate form is used.
14169       For example, write "0x" for the "%#x" format. */
14170    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14171        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14172        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14173        if (fill != ' ') {
14174            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14175            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14176            writer->pos += 2;
14177            pindex += 2;
14178        }
        /* The two prefix characters are accounted for separately from the
           characters copied below, whichever branch writes them. */
14179        arg->width -= 2;
14180        if (arg->width < 0)
14181            arg->width = 0;
14182        len -= 2;
14183    }
14184
14185    /* Pad left with the fill character if needed */
14186    if (arg->width > len && !(arg->flags & F_LJUST)) {
14187        sublen = arg->width - len;
14188        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14189        writer->pos += sublen;
        /* Padding is done: this disables the right padding below. */
14190        arg->width = len;
14191    }
14192
14193    /* If padding with spaces: write sign if needed and/or numeric prefix if
14194       the alternate form is used */
14195    if (fill == ' ') {
14196        if (arg->sign) {
14197            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14198            writer->pos += 1;
14199        }
14200        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14201            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14202            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14203            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14204            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14205            writer->pos += 2;
14206            pindex += 2;
14207        }
14208    }
14209
14210    /* Write characters */
14211    if (len) {
14212        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14213                                      str, pindex, len);
14214        writer->pos += len;
14215    }
14216
14217    /* Pad right with the fill character if needed */
    /* Note: right padding always uses spaces, whatever `fill` is. */
14218    if (arg->width > len) {
14219        sublen = arg->width - len;
14220        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14221        writer->pos += sublen;
14222    }
14223    return 0;
14224}
14225
14226/* Helper of PyUnicode_Format(): format one arg.
14227   Return 0 on success, raise an exception and return -1 on error. */
14228static int
14229unicode_format_arg(struct unicode_formatter_t *ctx)
14230{
14231    struct unicode_format_arg_t arg;
14232    PyObject *str;
14233    int ret;
14234
14235    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14236    arg.flags = 0;
14237    arg.width = -1;
14238    arg.prec = -1;
14239    arg.sign = 0;
14240    str = NULL;
14241
14242    ret = unicode_format_arg_parse(ctx, &arg);
14243    if (ret == -1)
14244        return -1;
14245
14246    ret = unicode_format_arg_format(ctx, &arg, &str);
14247    if (ret == -1)
14248        return -1;
14249
14250    if (ret != 1) {
14251        ret = unicode_format_arg_output(ctx, &arg, str);
14252        Py_DECREF(str);
14253        if (ret == -1)
14254            return -1;
14255    }
14256
14257    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14258        PyErr_SetString(PyExc_TypeError,
14259                        "not all arguments converted during string formatting");
14260        return -1;
14261    }
14262    return 0;
14263}
14264
14265PyObject *
14266PyUnicode_Format(PyObject *format, PyObject *args)
14267{
14268    struct unicode_formatter_t ctx;
14269
14270    if (format == NULL || args == NULL) {
14271        PyErr_BadInternalCall();
14272        return NULL;
14273    }
14274
14275    ctx.fmtstr = PyUnicode_FromObject(format);
14276    if (ctx.fmtstr == NULL)
14277        return NULL;
14278    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14279        Py_DECREF(ctx.fmtstr);
14280        return NULL;
14281    }
14282    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14283    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14284    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14285    ctx.fmtpos = 0;
14286
14287    _PyUnicodeWriter_Init(&ctx.writer);
14288    ctx.writer.min_length = ctx.fmtcnt + 100;
14289    ctx.writer.overallocate = 1;
14290
14291    if (PyTuple_Check(args)) {
14292        ctx.arglen = PyTuple_Size(args);
14293        ctx.argidx = 0;
14294    }
14295    else {
14296        ctx.arglen = -1;
14297        ctx.argidx = -2;
14298    }
14299    ctx.args_owned = 0;
14300    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14301        ctx.dict = args;
14302    else
14303        ctx.dict = NULL;
14304    ctx.args = args;
14305
14306    while (--ctx.fmtcnt >= 0) {
14307        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14308            Py_ssize_t nonfmtpos;
14309
14310            nonfmtpos = ctx.fmtpos++;
14311            while (ctx.fmtcnt >= 0 &&
14312                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14313                ctx.fmtpos++;
14314                ctx.fmtcnt--;
14315            }
14316            if (ctx.fmtcnt < 0) {
14317                ctx.fmtpos--;
14318                ctx.writer.overallocate = 0;
14319            }
14320
14321            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14322                                                nonfmtpos, ctx.fmtpos) < 0)
14323                goto onError;
14324        }
14325        else {
14326            ctx.fmtpos++;
14327            if (unicode_format_arg(&ctx) == -1)
14328                goto onError;
14329        }
14330    }
14331
14332    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14333        PyErr_SetString(PyExc_TypeError,
14334                        "not all arguments converted during string formatting");
14335        goto onError;
14336    }
14337
14338    if (ctx.args_owned) {
14339        Py_DECREF(ctx.args);
14340    }
14341    Py_DECREF(ctx.fmtstr);
14342    return _PyUnicodeWriter_Finish(&ctx.writer);
14343
14344  onError:
14345    Py_DECREF(ctx.fmtstr);
14346    _PyUnicodeWriter_Dealloc(&ctx.writer);
14347    if (ctx.args_owned) {
14348        Py_DECREF(ctx.args);
14349    }
14350    return NULL;
14351}
14352
14353static PyObject *
14354unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14355
14356static PyObject *
14357unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14358{
14359    PyObject *x = NULL;
14360    static char *kwlist[] = {"object", "encoding", "errors", 0};
14361    char *encoding = NULL;
14362    char *errors = NULL;
14363
14364    if (type != &PyUnicode_Type)
14365        return unicode_subtype_new(type, args, kwds);
14366    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14367                                     kwlist, &x, &encoding, &errors))
14368        return NULL;
14369    if (x == NULL)
14370        _Py_RETURN_UNICODE_EMPTY();
14371    if (encoding == NULL && errors == NULL)
14372        return PyObject_Str(x);
14373    else
14374        return PyUnicode_FromEncodedObject(x, encoding, errors);
14375}
14376
14377static PyObject *
14378unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14379{
14380    PyObject *unicode, *self;
14381    Py_ssize_t length, char_size;
14382    int share_wstr, share_utf8;
14383    unsigned int kind;
14384    void *data;
14385
14386    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14387
14388    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14389    if (unicode == NULL)
14390        return NULL;
14391    assert(_PyUnicode_CHECK(unicode));
14392    if (PyUnicode_READY(unicode) == -1) {
14393        Py_DECREF(unicode);
14394        return NULL;
14395    }
14396
14397    self = type->tp_alloc(type, 0);
14398    if (self == NULL) {
14399        Py_DECREF(unicode);
14400        return NULL;
14401    }
14402    kind = PyUnicode_KIND(unicode);
14403    length = PyUnicode_GET_LENGTH(unicode);
14404
14405    _PyUnicode_LENGTH(self) = length;
14406#ifdef Py_DEBUG
14407    _PyUnicode_HASH(self) = -1;
14408#else
14409    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14410#endif
14411    _PyUnicode_STATE(self).interned = 0;
14412    _PyUnicode_STATE(self).kind = kind;
14413    _PyUnicode_STATE(self).compact = 0;
14414    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14415    _PyUnicode_STATE(self).ready = 1;
14416    _PyUnicode_WSTR(self) = NULL;
14417    _PyUnicode_UTF8_LENGTH(self) = 0;
14418    _PyUnicode_UTF8(self) = NULL;
14419    _PyUnicode_WSTR_LENGTH(self) = 0;
14420    _PyUnicode_DATA_ANY(self) = NULL;
14421
14422    share_utf8 = 0;
14423    share_wstr = 0;
14424    if (kind == PyUnicode_1BYTE_KIND) {
14425        char_size = 1;
14426        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14427            share_utf8 = 1;
14428    }
14429    else if (kind == PyUnicode_2BYTE_KIND) {
14430        char_size = 2;
14431        if (sizeof(wchar_t) == 2)
14432            share_wstr = 1;
14433    }
14434    else {
14435        assert(kind == PyUnicode_4BYTE_KIND);
14436        char_size = 4;
14437        if (sizeof(wchar_t) == 4)
14438            share_wstr = 1;
14439    }
14440
14441    /* Ensure we won't overflow the length. */
14442    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14443        PyErr_NoMemory();
14444        goto onError;
14445    }
14446    data = PyObject_MALLOC((length + 1) * char_size);
14447    if (data == NULL) {
14448        PyErr_NoMemory();
14449        goto onError;
14450    }
14451
14452    _PyUnicode_DATA_ANY(self) = data;
14453    if (share_utf8) {
14454        _PyUnicode_UTF8_LENGTH(self) = length;
14455        _PyUnicode_UTF8(self) = data;
14456    }
14457    if (share_wstr) {
14458        _PyUnicode_WSTR_LENGTH(self) = length;
14459        _PyUnicode_WSTR(self) = (wchar_t *)data;
14460    }
14461
14462    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14463              kind * (length + 1));
14464    assert(_PyUnicode_CheckConsistency(self, 1));
14465#ifdef Py_DEBUG
14466    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14467#endif
14468    Py_DECREF(unicode);
14469    return self;
14470
14471onError:
14472    Py_DECREF(unicode);
14473    Py_DECREF(self);
14474    return NULL;
14475}
14476
/* Docstring of the str type (used as tp_doc of PyUnicode_Type).  It
   describes the two call forms accepted by unicode_new(). */
14477PyDoc_STRVAR(unicode_doc,
14478"str(object='') -> str\n\
14479str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14480\n\
14481Create a new string object from the given object. If encoding or\n\
14482errors is specified, then the object must expose a data buffer\n\
14483that will be decoded using the given encoding and error handler.\n\
14484Otherwise, returns the result of object.__str__() (if defined)\n\
14485or repr(object).\n\
14486encoding defaults to sys.getdefaultencoding().\n\
14487errors defaults to 'strict'.");
14488
/* Forward declaration: unicode_iter() is defined after the iterator type
   but is needed for the tp_iter slot below. */
14489static PyObject *unicode_iter(PyObject *seq);
14490
/* Type object of str.  Slots left at 0 are not set here. */
14491PyTypeObject PyUnicode_Type = {
14492    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14493    "str",              /* tp_name */
14494    sizeof(PyUnicodeObject),        /* tp_basicsize */
14495    0,                  /* tp_itemsize */
14496    /* Slots */
14497    (destructor)unicode_dealloc,    /* tp_dealloc */
14498    0,                  /* tp_print */
14499    0,                  /* tp_getattr */
14500    0,                  /* tp_setattr */
14501    0,                  /* tp_reserved */
14502    unicode_repr,           /* tp_repr */
14503    &unicode_as_number,         /* tp_as_number */
14504    &unicode_as_sequence,       /* tp_as_sequence */
14505    &unicode_as_mapping,        /* tp_as_mapping */
14506    (hashfunc) unicode_hash,        /* tp_hash */
14507    0,                  /* tp_call */
14508    (reprfunc) unicode_str,     /* tp_str */
14509    PyObject_GenericGetAttr,        /* tp_getattro */
14510    0,                  /* tp_setattro */
14511    0,                  /* tp_as_buffer */
14512    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14513    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14514    unicode_doc,            /* tp_doc */
14515    0,                  /* tp_traverse */
14516    0,                  /* tp_clear */
14517    PyUnicode_RichCompare,      /* tp_richcompare */
14518    0,                  /* tp_weaklistoffset */
14519    unicode_iter,           /* tp_iter */
14520    0,                  /* tp_iternext */
14521    unicode_methods,            /* tp_methods */
14522    0,                  /* tp_members */
14523    0,                  /* tp_getset */
14524    &PyBaseObject_Type,         /* tp_base */
14525    0,                  /* tp_dict */
14526    0,                  /* tp_descr_get */
14527    0,                  /* tp_descr_set */
14528    0,                  /* tp_dictoffset */
14529    0,                  /* tp_init */
14530    0,                  /* tp_alloc */
14531    unicode_new,            /* tp_new */
14532    PyObject_Del,           /* tp_free */
14533};
14534
14535/* Initialize the Unicode implementation */
14536
14537int _PyUnicode_Init(void)
14538{
14539    /* XXX - move this array to unicodectype.c ? */
14540    Py_UCS2 linebreak[] = {
14541        0x000A, /* LINE FEED */
14542        0x000D, /* CARRIAGE RETURN */
14543        0x001C, /* FILE SEPARATOR */
14544        0x001D, /* GROUP SEPARATOR */
14545        0x001E, /* RECORD SEPARATOR */
14546        0x0085, /* NEXT LINE */
14547        0x2028, /* LINE SEPARATOR */
14548        0x2029, /* PARAGRAPH SEPARATOR */
14549    };
14550
14551    /* Init the implementation */
14552    _Py_INCREF_UNICODE_EMPTY();
14553    if (!unicode_empty)
14554        Py_FatalError("Can't create empty string");
14555    Py_DECREF(unicode_empty);
14556
14557    if (PyType_Ready(&PyUnicode_Type) < 0)
14558        Py_FatalError("Can't initialize 'unicode'");
14559
14560    /* initialize the linebreak bloom filter */
14561    bloom_linebreak = make_bloom_mask(
14562        PyUnicode_2BYTE_KIND, linebreak,
14563        Py_ARRAY_LENGTH(linebreak));
14564
14565    PyType_Ready(&EncodingMapType);
14566
14567    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14568        Py_FatalError("Can't initialize field name iterator type");
14569
14570    if (PyType_Ready(&PyFormatterIter_Type) < 0)
14571        Py_FatalError("Can't initialize formatter iter type");
14572
14573#ifdef HAVE_MBCS
14574    winver.dwOSVersionInfoSize = sizeof(winver);
14575    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
14576        PyErr_SetFromWindowsErr(0);
14577        return -1;
14578    }
14579#endif
14580    return 0;
14581}
14582
14583/* Finalize the Unicode implementation */
14584
/* This function does nothing and always returns 0.
   It is still called from _PyUnicode_Fini(). */
14585int
14586PyUnicode_ClearFreeList(void)
14587{
14588    return 0;
14589}
14590
14591void
14592_PyUnicode_Fini(void)
14593{
14594    int i;
14595
14596    Py_CLEAR(unicode_empty);
14597
14598    for (i = 0; i < 256; i++)
14599        Py_CLEAR(unicode_latin1[i]);
14600    _PyUnicode_ClearStaticStrings();
14601    (void)PyUnicode_ClearFreeList();
14602}
14603
14604void
14605PyUnicode_InternInPlace(PyObject **p)
14606{
14607    register PyObject *s = *p;
14608    PyObject *t;
14609#ifdef Py_DEBUG
14610    assert(s != NULL);
14611    assert(_PyUnicode_CHECK(s));
14612#else
14613    if (s == NULL || !PyUnicode_Check(s))
14614        return;
14615#endif
14616    /* If it's a subclass, we don't really know what putting
14617       it in the interned dict might do. */
14618    if (!PyUnicode_CheckExact(s))
14619        return;
14620    if (PyUnicode_CHECK_INTERNED(s))
14621        return;
14622    if (interned == NULL) {
14623        interned = PyDict_New();
14624        if (interned == NULL) {
14625            PyErr_Clear(); /* Don't leave an exception */
14626            return;
14627        }
14628    }
14629    /* It might be that the GetItem call fails even
14630       though the key is present in the dictionary,
14631       namely when this happens during a stack overflow. */
14632    Py_ALLOW_RECURSION
14633    t = PyDict_GetItem(interned, s);
14634    Py_END_ALLOW_RECURSION
14635
14636    if (t) {
14637        Py_INCREF(t);
14638        Py_DECREF(*p);
14639        *p = t;
14640        return;
14641    }
14642
14643    PyThreadState_GET()->recursion_critical = 1;
14644    if (PyDict_SetItem(interned, s, s) < 0) {
14645        PyErr_Clear();
14646        PyThreadState_GET()->recursion_critical = 0;
14647        return;
14648    }
14649    PyThreadState_GET()->recursion_critical = 0;
14650    /* The two references in interned are not counted by refcnt.
14651       The deallocator will take care of this */
14652    Py_REFCNT(s) -= 2;
14653    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
14654}
14655
14656void
14657PyUnicode_InternImmortal(PyObject **p)
14658{
14659    PyUnicode_InternInPlace(p);
14660    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
14661        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
14662        Py_INCREF(*p);
14663    }
14664}
14665
14666PyObject *
14667PyUnicode_InternFromString(const char *cp)
14668{
14669    PyObject *s = PyUnicode_FromString(cp);
14670    if (s == NULL)
14671        return NULL;
14672    PyUnicode_InternInPlace(&s);
14673    return s;
14674}
14675
14676void
14677_Py_ReleaseInternedUnicodeStrings(void)
14678{
14679    PyObject *keys;
14680    PyObject *s;
14681    Py_ssize_t i, n;
14682    Py_ssize_t immortal_size = 0, mortal_size = 0;
14683
14684    if (interned == NULL || !PyDict_Check(interned))
14685        return;
14686    keys = PyDict_Keys(interned);
14687    if (keys == NULL || !PyList_Check(keys)) {
14688        PyErr_Clear();
14689        return;
14690    }
14691
14692    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
14693       detector, interned unicode strings are not forcibly deallocated;
14694       rather, we give them their stolen references back, and then clear
14695       and DECREF the interned dict. */
14696
14697    n = PyList_GET_SIZE(keys);
14698    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
14699            n);
14700    for (i = 0; i < n; i++) {
14701        s = PyList_GET_ITEM(keys, i);
14702        if (PyUnicode_READY(s) == -1) {
14703            assert(0 && "could not ready string");
14704            fprintf(stderr, "could not ready string\n");
14705        }
14706        switch (PyUnicode_CHECK_INTERNED(s)) {
14707        case SSTATE_NOT_INTERNED:
14708            /* XXX Shouldn't happen */
14709            break;
14710        case SSTATE_INTERNED_IMMORTAL:
14711            Py_REFCNT(s) += 1;
14712            immortal_size += PyUnicode_GET_LENGTH(s);
14713            break;
14714        case SSTATE_INTERNED_MORTAL:
14715            Py_REFCNT(s) += 2;
14716            mortal_size += PyUnicode_GET_LENGTH(s);
14717            break;
14718        default:
14719            Py_FatalError("Inconsistent interned string state.");
14720        }
14721        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
14722    }
14723    fprintf(stderr, "total size of all interned strings: "
14724            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
14725            "mortal/immortal\n", mortal_size, immortal_size);
14726    Py_DECREF(keys);
14727    PyDict_Clear(interned);
14728    Py_CLEAR(interned);
14729}
14730
14731
14732/********************* Unicode Iterator **************************/
14733
/* State of a str iterator (type PyUnicodeIter_Type). */
14734typedef struct {
14735    PyObject_HEAD
    /* Index of the next character to return from it_seq. */
14736    Py_ssize_t it_index;
14737    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
14738} unicodeiterobject;
14739
14740static void
14741unicodeiter_dealloc(unicodeiterobject *it)
14742{
14743    _PyObject_GC_UNTRACK(it);
14744    Py_XDECREF(it->it_seq);
14745    PyObject_GC_Del(it);
14746}
14747
14748static int
14749unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
14750{
14751    Py_VISIT(it->it_seq);
14752    return 0;
14753}
14754
14755static PyObject *
14756unicodeiter_next(unicodeiterobject *it)
14757{
14758    PyObject *seq, *item;
14759
14760    assert(it != NULL);
14761    seq = it->it_seq;
14762    if (seq == NULL)
14763        return NULL;
14764    assert(_PyUnicode_CHECK(seq));
14765
14766    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
14767        int kind = PyUnicode_KIND(seq);
14768        void *data = PyUnicode_DATA(seq);
14769        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
14770        item = PyUnicode_FromOrdinal(chr);
14771        if (item != NULL)
14772            ++it->it_index;
14773        return item;
14774    }
14775
14776    Py_DECREF(seq);
14777    it->it_seq = NULL;
14778    return NULL;
14779}
14780
14781static PyObject *
14782unicodeiter_len(unicodeiterobject *it)
14783{
14784    Py_ssize_t len = 0;
14785    if (it->it_seq)
14786        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
14787    return PyLong_FromSsize_t(len);
14788}
14789
14790PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
14791
14792static PyObject *
14793unicodeiter_reduce(unicodeiterobject *it)
14794{
14795    if (it->it_seq != NULL) {
14796        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
14797                             it->it_seq, it->it_index);
14798    } else {
14799        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
14800        if (u == NULL)
14801            return NULL;
14802        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
14803    }
14804}
14805
14806PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
14807
14808static PyObject *
14809unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
14810{
14811    Py_ssize_t index = PyLong_AsSsize_t(state);
14812    if (index == -1 && PyErr_Occurred())
14813        return NULL;
14814    if (index < 0)
14815        index = 0;
14816    it->it_index = index;
14817    Py_RETURN_NONE;
14818}
14819
14820PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
14821
/* Methods of the str iterator: length hint plus pickling support
   (__reduce__ / __setstate__). */
14822static PyMethodDef unicodeiter_methods[] = {
14823    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
14824     length_hint_doc},
14825    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
14826     reduce_doc},
14827    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
14828     setstate_doc},
14829    {NULL,      NULL}       /* sentinel */
14830};
14831
/* Type object of the str iterator ("str_iterator").  Instances are tracked
   by the garbage collector (Py_TPFLAGS_HAVE_GC with tp_traverse set). */
14832PyTypeObject PyUnicodeIter_Type = {
14833    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14834    "str_iterator",         /* tp_name */
14835    sizeof(unicodeiterobject),      /* tp_basicsize */
14836    0,                  /* tp_itemsize */
14837    /* methods */
14838    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
14839    0,                  /* tp_print */
14840    0,                  /* tp_getattr */
14841    0,                  /* tp_setattr */
14842    0,                  /* tp_reserved */
14843    0,                  /* tp_repr */
14844    0,                  /* tp_as_number */
14845    0,                  /* tp_as_sequence */
14846    0,                  /* tp_as_mapping */
14847    0,                  /* tp_hash */
14848    0,                  /* tp_call */
14849    0,                  /* tp_str */
14850    PyObject_GenericGetAttr,        /* tp_getattro */
14851    0,                  /* tp_setattro */
14852    0,                  /* tp_as_buffer */
14853    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
14854    0,                  /* tp_doc */
14855    (traverseproc)unicodeiter_traverse, /* tp_traverse */
14856    0,                  /* tp_clear */
14857    0,                  /* tp_richcompare */
14858    0,                  /* tp_weaklistoffset */
14859    PyObject_SelfIter,          /* tp_iter */
14860    (iternextfunc)unicodeiter_next,     /* tp_iternext */
14861    unicodeiter_methods,            /* tp_methods */
14862    0,                  /* tp_members */
14863};
14864
14865static PyObject *
14866unicode_iter(PyObject *seq)
14867{
14868    unicodeiterobject *it;
14869
14870    if (!PyUnicode_Check(seq)) {
14871        PyErr_BadInternalCall();
14872        return NULL;
14873    }
14874    if (PyUnicode_READY(seq) == -1)
14875        return NULL;
14876    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
14877    if (it == NULL)
14878        return NULL;
14879    it->it_index = 0;
14880    Py_INCREF(seq);
14881    it->it_seq = seq;
14882    _PyObject_GC_TRACK(it);
14883    return (PyObject *)it;
14884}
14885
14886
14887size_t
14888Py_UNICODE_strlen(const Py_UNICODE *u)
14889{
14890    int res = 0;
14891    while(*u++)
14892        res++;
14893    return res;
14894}
14895
14896Py_UNICODE*
14897Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
14898{
14899    Py_UNICODE *u = s1;
14900    while ((*u++ = *s2++));
14901    return s1;
14902}
14903
14904Py_UNICODE*
14905Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14906{
14907    Py_UNICODE *u = s1;
14908    while ((*u++ = *s2++))
14909        if (n-- == 0)
14910            break;
14911    return s1;
14912}
14913
14914Py_UNICODE*
14915Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
14916{
14917    Py_UNICODE *u1 = s1;
14918    u1 += Py_UNICODE_strlen(u1);
14919    Py_UNICODE_strcpy(u1, s2);
14920    return s1;
14921}
14922
14923int
14924Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
14925{
14926    while (*s1 && *s2 && *s1 == *s2)
14927        s1++, s2++;
14928    if (*s1 && *s2)
14929        return (*s1 < *s2) ? -1 : +1;
14930    if (*s1)
14931        return 1;
14932    if (*s2)
14933        return -1;
14934    return 0;
14935}
14936
14937int
14938Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
14939{
14940    register Py_UNICODE u1, u2;
14941    for (; n != 0; n--) {
14942        u1 = *s1;
14943        u2 = *s2;
14944        if (u1 != u2)
14945            return (u1 < u2) ? -1 : +1;
14946        if (u1 == '\0')
14947            return 0;
14948        s1++;
14949        s2++;
14950    }
14951    return 0;
14952}
14953
14954Py_UNICODE*
14955Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
14956{
14957    const Py_UNICODE *p;
14958    for (p = s; *p; p++)
14959        if (*p == c)
14960            return (Py_UNICODE*)p;
14961    return NULL;
14962}
14963
14964Py_UNICODE*
14965Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
14966{
14967    const Py_UNICODE *p;
14968    p = s + Py_UNICODE_strlen(s);
14969    while (p != s) {
14970        p--;
14971        if (*p == c)
14972            return (Py_UNICODE*)p;
14973    }
14974    return NULL;
14975}
14976
14977Py_UNICODE*
14978PyUnicode_AsUnicodeCopy(PyObject *unicode)
14979{
14980    Py_UNICODE *u, *copy;
14981    Py_ssize_t len, size;
14982
14983    if (!PyUnicode_Check(unicode)) {
14984        PyErr_BadArgument();
14985        return NULL;
14986    }
14987    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
14988    if (u == NULL)
14989        return NULL;
14990    /* Ensure we won't overflow the size. */
14991    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
14992        PyErr_NoMemory();
14993        return NULL;
14994    }
14995    size = len + 1; /* copy the null character */
14996    size *= sizeof(Py_UNICODE);
14997    copy = PyMem_Malloc(size);
14998    if (copy == NULL) {
14999        PyErr_NoMemory();
15000        return NULL;
15001    }
15002    memcpy(copy, u, size);
15003    return copy;
15004}
15005
15006/* A _string module, to export formatter_parser and formatter_field_name_split
15007   to the string.Formatter class implemented in Python. */
15008
/* Method table of the _string module.  Both functions take exactly one
   argument (METH_O). */
15009static PyMethodDef _string_methods[] = {
15010    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15011     METH_O, PyDoc_STR("split the argument as a field name")},
15012    {"formatter_parser", (PyCFunction) formatter_parser,
15013     METH_O, PyDoc_STR("parse the argument as a format string")},
15014    {NULL, NULL}
15015};
15016
/* Definition of the _string module, passed to PyModule_Create() by
   PyInit__string().  Field names below follow the documented PyModuleDef
   layout; the trailing NULL entries leave the remaining optional slots
   unset. */
15017static struct PyModuleDef _string_module = {
15018    PyModuleDef_HEAD_INIT,
15019    "_string",                          /* m_name */
15020    PyDoc_STR("string helper module"),  /* m_doc */
15021    0,                                  /* m_size */
15022    _string_methods,                    /* m_methods */
15023    NULL,
15024    NULL,
15025    NULL,
15026    NULL
15027};
15028
15029PyMODINIT_FUNC
15030PyInit__string(void)
15031{
15032    return PyModule_Create(&_string_module);
15033}
15034
15035
15036#ifdef __cplusplus
15037}
15038#endif
15039