unicodeobject.c revision f6d1f1fa8a503f218a2103ba1e6768c6cfdb7c50
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com>.
5
6Major speed upgrades to the method implementations at the Reykjavik
7NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
8
9Copyright (c) Corporation for National Research Initiatives.
10
11--------------------------------------------------------------------
12The original string type implementation is:
13
14  Copyright (c) 1999 by Secret Labs AB
15  Copyright (c) 1999 by Fredrik Lundh
16
17By obtaining, using, and/or copying this software and/or its
18associated documentation, you agree that you have read, understood,
19and will comply with the following terms and conditions:
20
21Permission to use, copy, modify, and distribute this software and its
22associated documentation for any purpose and without fee is hereby
23granted, provided that the above copyright notice appears in all
24copies, and that both that copyright notice and this permission notice
25appear in supporting documentation, and that the name of Secret Labs
26AB or the author not be used in advertising or publicity pertaining to
27distribution of the software without specific, written prior
28permission.
29
30SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
31THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
32FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
33ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
34WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
35ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
36OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
37--------------------------------------------------------------------
38
39*/
40
41#define PY_SSIZE_T_CLEAN
42#include "Python.h"
43#include "ucnhash.h"
44#include "bytes_methods.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/*[clinic input]
51class str "PyUnicodeObject *" "&PyUnicode_Type"
52[clinic start generated code]*/
53/*[clinic end generated code: output=da39a3ee5e6b4b0d input=604e916854800fa8]*/
54
55/* --- Globals ------------------------------------------------------------
56
57NOTE: In the interpreter's initialization phase, some globals are currently
58      initialized dynamically as needed. In the process Unicode objects may
59      be created before the Unicode type is ready.
60
61*/
62
63
64#ifdef __cplusplus
65extern "C" {
66#endif
67
/* Maximum code point of Unicode 6.0: 0x10ffff (1,114,111) */
#define MAX_UNICODE 0x10ffff

/* Object check used by the accessor macros below.  Debug builds run
   _PyUnicode_CheckConsistency() without the content scan (second argument
   0); other builds only test the object's type with PyUnicode_Check(). */
#ifdef Py_DEBUG
#  define _PyUnicode_CHECK(op) _PyUnicode_CheckConsistency(op, 0)
#else
#  define _PyUnicode_CHECK(op) PyUnicode_Check(op)
#endif
76
/* Field accessors.  The macros whose expansion contains no assert() are
   plain lvalue casts and perform no validation; the others assert
   _PyUnicode_CHECK(op) (and, where shown, PyUnicode_IS_READY(op)) first. */

/* Unchecked access to the utf8 field of a PyCompactUnicodeObject. */
#define _PyUnicode_UTF8(op)                             \
    (((PyCompactUnicodeObject*)(op))->utf8)
/* Checked UTF-8 pointer: for a compact ASCII string this is the memory
   directly following the PyASCIIObject header, otherwise the utf8 field. */
#define PyUnicode_UTF8(op)                              \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((char*)((PyASCIIObject*)(op) + 1)) :          \
         _PyUnicode_UTF8(op))
/* Unchecked access to the utf8_length field. */
#define _PyUnicode_UTF8_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->utf8_length)
/* Checked UTF-8 length: for a compact ASCII string this is the length
   field of the PyASCIIObject, otherwise the utf8_length field. */
#define PyUnicode_UTF8_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(PyUnicode_IS_READY(op)),                    \
     PyUnicode_IS_COMPACT_ASCII(op) ?                   \
         ((PyASCIIObject*)(op))->length :               \
         _PyUnicode_UTF8_LENGTH(op))
/* Unchecked access to the wstr pointer (a PyASCIIObject field). */
#define _PyUnicode_WSTR(op)                             \
    (((PyASCIIObject*)(op))->wstr)
/* Unchecked access to wstr_length (a PyCompactUnicodeObject field). */
#define _PyUnicode_WSTR_LENGTH(op)                      \
    (((PyCompactUnicodeObject*)(op))->wstr_length)
/* Unchecked access to the length field. */
#define _PyUnicode_LENGTH(op)                           \
    (((PyASCIIObject *)(op))->length)
/* Unchecked access to the state bit-field structure. */
#define _PyUnicode_STATE(op)                            \
    (((PyASCIIObject *)(op))->state)
/* Unchecked access to the hash field. */
#define _PyUnicode_HASH(op)                             \
    (((PyASCIIObject *)(op))->hash)
/* state.kind after the object check; unlike PyUnicode_UTF8() this does not
   assert that the string is ready. */
#define _PyUnicode_KIND(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->state.kind)
/* length after the object check; does not assert that the string is
   ready. */
#define _PyUnicode_GET_LENGTH(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     ((PyASCIIObject *)(op))->length)
/* Unchecked access to the data.any pointer of a PyUnicodeObject. */
#define _PyUnicode_DATA_ANY(op)                         \
    (((PyUnicodeObject*)(op))->data.any)
111
/* Replace any previously defined PyUnicode_READY with a version that uses
   this file's _PyUnicode_CHECK().  Evaluates to 0 when the string is
   already ready, otherwise to the result of _PyUnicode_Ready(op). */
#undef PyUnicode_READY
#define PyUnicode_READY(op)                             \
    (assert(_PyUnicode_CHECK(op)),                      \
     (PyUnicode_IS_READY(op) ?                          \
      0 :                                               \
      _PyUnicode_Ready(op)))
118
/* True if the UTF-8 pointer of op is the same memory as its character data.
   Must not be used on compact ASCII strings (asserted). */
#define _PyUnicode_SHARE_UTF8(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     assert(!PyUnicode_IS_COMPACT_ASCII(op)),           \
     (_PyUnicode_UTF8(op) == PyUnicode_DATA(op)))
/* True if the wstr pointer of op is the same memory as its character data.
   The comparison uses the macro argument itself; it must not depend on the
   caller happening to have a variable called "unicode" in scope. */
#define _PyUnicode_SHARE_WSTR(op)                       \
    (assert(_PyUnicode_CHECK(op)),                      \
     (_PyUnicode_WSTR(op) == PyUnicode_DATA(op)))
126
/* true if the Unicode object has an allocated UTF-8 memory block
   (not shared with other data): the string is not compact ASCII, the utf8
   pointer is set, and it does not point at the character data. */
#define _PyUnicode_HAS_UTF8_MEMORY(op)                  \
    ((!PyUnicode_IS_COMPACT_ASCII(op)                   \
      && _PyUnicode_UTF8(op)                            \
      && _PyUnicode_UTF8(op) != PyUnicode_DATA(op)))

/* true if the Unicode object has an allocated wstr memory block
   (not shared with other data): the wstr pointer is set and either the
   string is not ready yet or wstr does not point at the character data. */
#define _PyUnicode_HAS_WSTR_MEMORY(op)                  \
    ((_PyUnicode_WSTR(op) &&                            \
      (!PyUnicode_IS_READY(op) ||                       \
       _PyUnicode_WSTR(op) != PyUnicode_DATA(op))))
140
/* Copy the characters in [begin, end) into the buffer "to", casting each
   one from from_type to to_type.

   from_type and to_type must be valid type names.  begin and end are
   pointers to the source characters ("from_type *"), and to is the
   destination pointer ("to_type *").  The bulk of the range is handled four
   characters per iteration; the remaining 0-3 characters are copied one at
   a time. */
#define _PyUnicode_CONVERT_BYTES(from_type, to_type, begin, end, to) \
    do {                                                \
        to_type *_to = (to_type *)(to);                \
        const from_type *_iter = (from_type *)(begin);  \
        const from_type *_end = (from_type *)(end);     \
        Py_ssize_t n = (_end) - (_iter);                \
        const from_type *_unrolled_end =                \
            _iter + _Py_SIZE_ROUND_DOWN(n, 4);          \
        for (; _iter < (_unrolled_end);                 \
             _iter += 4, _to += 4) {                    \
            _to[0] = (to_type) _iter[0];                \
            _to[1] = (to_type) _iter[1];                \
            _to[2] = (to_type) _iter[2];                \
            _to[3] = (to_type) _iter[3];                \
        }                                               \
        for (; _iter < (_end); ++_iter, ++_to)          \
            *_to = (to_type) *_iter;                    \
    } while (0)
164
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
static PyObject *interned = NULL;

/* The empty Unicode object is shared to improve performance.  It starts
   out as NULL; _Py_INCREF_UNICODE_EMPTY() creates it on first use. */
static PyObject *unicode_empty = NULL;
177
/* Take a new reference to the shared empty string, creating it with
   PyUnicode_New(0, 0) if it does not exist yet.  On creation the extra
   Py_INCREF leaves one reference for the unicode_empty global and one for
   the caller.  If the allocation fails, unicode_empty stays NULL and no
   reference is taken, so callers must check unicode_empty afterwards. */
#define _Py_INCREF_UNICODE_EMPTY()                      \
    do {                                                \
        if (unicode_empty != NULL)                      \
            Py_INCREF(unicode_empty);                   \
        else {                                          \
            unicode_empty = PyUnicode_New(0, 0);        \
            if (unicode_empty != NULL) {                \
                Py_INCREF(unicode_empty);               \
                assert(_PyUnicode_CheckConsistency(unicode_empty, 1)); \
            }                                           \
        }                                               \
    } while (0)

/* Return a new reference to the shared empty string from the enclosing
   function.  Returns NULL when the empty string could not be created
   (unicode_empty is still NULL in that case). */
#define _Py_RETURN_UNICODE_EMPTY()                      \
    do {                                                \
        _Py_INCREF_UNICODE_EMPTY();                     \
        return unicode_empty;                           \
    } while (0)
196
/* Forward declaration; the definition is not part of this section. */
Py_LOCAL_INLINE(int)
_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch);

/* List of static strings.
   NOTE(review): presumably the head of a chain of _Py_Identifier records
   that is walked at finalization -- confirm against the code using it. */
static _Py_Identifier *static_strings = NULL;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point (0..255); an entry
   is NULL until a string for that character has been stored in it. */
static PyObject *unicode_latin1[256] = {NULL};
207
/* Fast detection of the most frequent whitespace characters.

   Lookup table with one entry per 7-bit code point (128 entries, eight per
   row): the entry is 1 for the characters named in the comments below and
   0 for everything else. */
const unsigned char _Py_ascii_whitespace[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x0009: * CHARACTER TABULATION */
/*     case 0x000A: * LINE FEED */
/*     case 0x000B: * LINE TABULATION */
/*     case 0x000C: * FORM FEED */
/*     case 0x000D: * CARRIAGE RETURN */
    0, 1, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*     case 0x001C: * FILE SEPARATOR */
/*     case 0x001D: * GROUP SEPARATOR */
/*     case 0x001E: * RECORD SEPARATOR */
/*     case 0x001F: * UNIT SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 1,
/*     case 0x0020: * SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
238
/* forward declarations of file-local helpers that are used before their
   definitions */
static PyUnicodeObject *_PyUnicode_New(Py_ssize_t length);
static PyObject* get_latin1_char(unsigned char ch);
static int unicode_modifiable(PyObject *unicode);


/* Constructors from raw character buffers of each width; s points to size
   characters. */
static PyObject *
_PyUnicode_FromUCS1(const Py_UCS1 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS2(const Py_UCS2 *s, Py_ssize_t size);
static PyObject *
_PyUnicode_FromUCS4(const Py_UCS4 *s, Py_ssize_t size);

/* Encoder error-handling helpers.
   NOTE(review): exact contracts (reference ownership, meaning of *newpos)
   are defined with the implementations, which are not in this section. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       PyObject *unicode, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

static void
raise_encode_exception(PyObject **exceptionObject,
                       const char *encoding,
                       PyObject *unicode,
                       Py_ssize_t startpos, Py_ssize_t endpos,
                       const char *reason);
264
/* Fast detection of line break characters in the 7-bit range.

   Lookup table with one entry per code point below 128 (eight per row):
   the entry is 1 for the characters named in the comments below and 0 for
   everything else.  The table is only ever read, so it is declared const
   to keep it in read-only storage and to make accidental writes a compile
   error. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
292
293/* The max unicode value is always 0x10FFFF while using the PEP-393 API.
294   This function is kept for backward compatibility with the old API. */
295Py_UNICODE
296PyUnicode_GetMax(void)
297{
298#ifdef Py_UNICODE_WIDE
299    return 0x10FFFF;
300#else
301    /* This is actually an illegal character, so it should
302       not be passed to unichr. */
303    return 0xFFFF;
304#endif
305}
306
#ifdef Py_DEBUG
/* Debug-build validator for the internal layout of a string object.

   op must be a str (asserted).  Every violated invariant triggers an
   assert(); when all checks pass the function returns 1, which lets
   callers wrap the call itself in assert().

   check_content: when non-zero (and the string is not in the not-ready
   wchar_t form) the characters are also scanned to verify that the
   narrowest possible kind is used and that the data is null-terminated. */
int
_PyUnicode_CheckConsistency(PyObject *op, int check_content)
{
    PyASCIIObject *ascii;
    unsigned int kind;

    assert(PyUnicode_Check(op));

    ascii = (PyASCIIObject *)op;
    kind = ascii->state.kind;

    /* Compact ASCII: only the PyASCIIObject header exists, so none of the
       PyCompactUnicodeObject fields may be inspected. */
    if (ascii->state.ascii == 1 && ascii->state.compact == 1) {
        assert(kind == PyUnicode_1BYTE_KIND);
        assert(ascii->state.ready == 1);
    }
    else {
        PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
        void *data;

        if (ascii->state.compact == 1) {
            /* Compact non-ASCII: characters follow the compact header and
               the UTF-8 buffer never aliases them. */
            data = compact + 1;
            assert(kind == PyUnicode_1BYTE_KIND
                   || kind == PyUnicode_2BYTE_KIND
                   || kind == PyUnicode_4BYTE_KIND);
            assert(ascii->state.ascii == 0);
            assert(ascii->state.ready == 1);
            assert (compact->utf8 != data);
        }
        else {
            PyUnicodeObject *unicode = (PyUnicodeObject *)op;

            data = unicode->data.any;
            if (kind == PyUnicode_WCHAR_KIND) {
                /* Not-ready legacy string: only the wstr buffer exists. */
                assert(ascii->length == 0);
                assert(ascii->hash == -1);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ascii == 0);
                assert(ascii->state.ready == 0);
                assert(ascii->state.interned == SSTATE_NOT_INTERNED);
                assert(ascii->wstr != NULL);
                assert(data == NULL);
                assert(compact->utf8 == NULL);
            }
            else {
                /* Ready legacy string: has a data buffer; for ASCII content
                   the UTF-8 pointer aliases that buffer. */
                assert(kind == PyUnicode_1BYTE_KIND
                       || kind == PyUnicode_2BYTE_KIND
                       || kind == PyUnicode_4BYTE_KIND);
                assert(ascii->state.compact == 0);
                assert(ascii->state.ready == 1);
                assert(data != NULL);
                if (ascii->state.ascii) {
                    assert (compact->utf8 == data);
                    assert (compact->utf8_length == ascii->length);
                }
                else
                    assert (compact->utf8 != data);
            }
        }
        /* wstr must alias the character data exactly when the kind has the
           same width as wchar_t. */
        if (kind != PyUnicode_WCHAR_KIND) {
            if (
#if SIZEOF_WCHAR_T == 2
                kind == PyUnicode_2BYTE_KIND
#else
                kind == PyUnicode_4BYTE_KIND
#endif
               )
            {
                assert(ascii->wstr == data);
                assert(compact->wstr_length == ascii->length);
            } else
                assert(ascii->wstr != data);
        }

        /* A missing cache must have a zero cached length. */
        if (compact->utf8 == NULL)
            assert(compact->utf8_length == 0);
        if (ascii->wstr == NULL)
            assert(compact->wstr_length == 0);
    }
    /* check that the best kind is used */
    if (check_content && kind != PyUnicode_WCHAR_KIND)
    {
        Py_ssize_t i;
        Py_UCS4 maxchar = 0;
        void *data;
        Py_UCS4 ch;

        /* Find the largest character actually stored. */
        data = PyUnicode_DATA(ascii);
        for (i=0; i < ascii->length; i++)
        {
            ch = PyUnicode_READ(kind, data, i);
            if (ch > maxchar)
                maxchar = ch;
        }
        /* It must fall in the range that requires exactly this kind. */
        if (kind == PyUnicode_1BYTE_KIND) {
            if (ascii->state.ascii == 0) {
                assert(maxchar >= 128);
                assert(maxchar <= 255);
            }
            else
                assert(maxchar < 128);
        }
        else if (kind == PyUnicode_2BYTE_KIND) {
            assert(maxchar >= 0x100);
            assert(maxchar <= 0xFFFF);
        }
        else {
            assert(maxchar >= 0x10000);
            assert(maxchar <= MAX_UNICODE);
        }
        /* The character after the last one must be the null terminator. */
        assert(PyUnicode_READ(kind, data, ascii->length) == 0);
    }
    return 1;
}
#endif
422
423static PyObject*
424unicode_result_wchar(PyObject *unicode)
425{
426#ifndef Py_DEBUG
427    Py_ssize_t len;
428
429    len = _PyUnicode_WSTR_LENGTH(unicode);
430    if (len == 0) {
431        Py_DECREF(unicode);
432        _Py_RETURN_UNICODE_EMPTY();
433    }
434
435    if (len == 1) {
436        wchar_t ch = _PyUnicode_WSTR(unicode)[0];
437        if ((Py_UCS4)ch < 256) {
438            PyObject *latin1_char = get_latin1_char((unsigned char)ch);
439            Py_DECREF(unicode);
440            return latin1_char;
441        }
442    }
443
444    if (_PyUnicode_Ready(unicode) < 0) {
445        Py_DECREF(unicode);
446        return NULL;
447    }
448#else
449    assert(Py_REFCNT(unicode) == 1);
450
451    /* don't make the result ready in debug mode to ensure that the caller
452       makes the string ready before using it */
453    assert(_PyUnicode_CheckConsistency(unicode, 1));
454#endif
455    return unicode;
456}
457
458static PyObject*
459unicode_result_ready(PyObject *unicode)
460{
461    Py_ssize_t length;
462
463    length = PyUnicode_GET_LENGTH(unicode);
464    if (length == 0) {
465        if (unicode != unicode_empty) {
466            Py_DECREF(unicode);
467            _Py_RETURN_UNICODE_EMPTY();
468        }
469        return unicode_empty;
470    }
471
472    if (length == 1) {
473        void *data = PyUnicode_DATA(unicode);
474        int kind = PyUnicode_KIND(unicode);
475        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
476        if (ch < 256) {
477            PyObject *latin1_char = unicode_latin1[ch];
478            if (latin1_char != NULL) {
479                if (unicode != latin1_char) {
480                    Py_INCREF(latin1_char);
481                    Py_DECREF(unicode);
482                }
483                return latin1_char;
484            }
485            else {
486                assert(_PyUnicode_CheckConsistency(unicode, 1));
487                Py_INCREF(unicode);
488                unicode_latin1[ch] = unicode;
489                return unicode;
490            }
491        }
492    }
493
494    assert(_PyUnicode_CheckConsistency(unicode, 1));
495    return unicode;
496}
497
498static PyObject*
499unicode_result(PyObject *unicode)
500{
501    assert(_PyUnicode_CHECK(unicode));
502    if (PyUnicode_IS_READY(unicode))
503        return unicode_result_ready(unicode);
504    else
505        return unicode_result_wchar(unicode);
506}
507
508static PyObject*
509unicode_result_unchanged(PyObject *unicode)
510{
511    if (PyUnicode_CheckExact(unicode)) {
512        if (PyUnicode_READY(unicode) == -1)
513            return NULL;
514        Py_INCREF(unicode);
515        return unicode;
516    }
517    else
518        /* Subtype -- return genuine unicode string with the same value. */
519        return _PyUnicode_Copy(unicode);
520}
521
/* Windows version information, only present when HAVE_MBCS is defined.
   NOTE(review): presumably filled in once during initialization and read
   by the MBCS codec code -- confirm against the users of winver. */
#ifdef HAVE_MBCS
static OSVERSIONINFOEX winver;
#endif
525
/* --- Bloom Filters ----------------------------------------------------- */

/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low bits of
   each unicode character (ch & (BLOOM_WIDTH - 1), i.e. 5, 6 or 7 bits
   depending on BLOOM_WIDTH) as the bit index. */

/* the linebreak mask is set up by Unicode_Init below */

/* Number of bits used in a mask: the largest of 128, 64 and 32 that fits
   in LONG_BIT. */
#if LONG_BIT >= 128
#define BLOOM_WIDTH 128
#elif LONG_BIT >= 64
#define BLOOM_WIDTH 64
#elif LONG_BIT >= 32
#define BLOOM_WIDTH 32
#else
#error "LONG_BIT is smaller than 32"
#endif

/* Integer type holding one mask. */
#define BLOOM_MASK unsigned long

/* Starts with every bit set, so that until it is set up no character is
   rejected by the BLOOM() test. */
static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
547
/* Non-zero if the bit selected by the low bits of ch is set in mask, i.e.
   if ch *may* be one of the characters the mask was built from.  Zero means
   ch is definitely not among them.  Both arguments are parenthesized so
   that compound expressions such as "a | b" can be passed safely. */
#define BLOOM(mask, ch)     ((mask) & (1UL << ((ch) & (BLOOM_WIDTH - 1))))

/* Line break test: characters below 128 are looked up directly in
   ascii_linebreak[]; for the others the bloom mask filters out most
   non-matches before the exact Py_UNICODE_ISLINEBREAK() test runs. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
553
554Py_LOCAL_INLINE(BLOOM_MASK)
555make_bloom_mask(int kind, void* ptr, Py_ssize_t len)
556{
557#define BLOOM_UPDATE(TYPE, MASK, PTR, LEN)             \
558    do {                                               \
559        TYPE *data = (TYPE *)PTR;                      \
560        TYPE *end = data + LEN;                        \
561        Py_UCS4 ch;                                    \
562        for (; data != end; data++) {                  \
563            ch = *data;                                \
564            MASK |= (1UL << (ch & (BLOOM_WIDTH - 1))); \
565        }                                              \
566        break;                                         \
567    } while (0)
568
569    /* calculate simple bloom-style bitmask for a given unicode string */
570
571    BLOOM_MASK mask;
572
573    mask = 0;
574    switch (kind) {
575    case PyUnicode_1BYTE_KIND:
576        BLOOM_UPDATE(Py_UCS1, mask, ptr, len);
577        break;
578    case PyUnicode_2BYTE_KIND:
579        BLOOM_UPDATE(Py_UCS2, mask, ptr, len);
580        break;
581    case PyUnicode_4BYTE_KIND:
582        BLOOM_UPDATE(Py_UCS4, mask, ptr, len);
583        break;
584    default:
585        assert(0);
586    }
587    return mask;
588
589#undef BLOOM_UPDATE
590}
591
592/* Compilation of templated routines */
593
594#include "stringlib/asciilib.h"
595#include "stringlib/fastsearch.h"
596#include "stringlib/partition.h"
597#include "stringlib/split.h"
598#include "stringlib/count.h"
599#include "stringlib/find.h"
600#include "stringlib/find_max_char.h"
601#include "stringlib/localeutil.h"
602#include "stringlib/undef.h"
603
604#include "stringlib/ucs1lib.h"
605#include "stringlib/fastsearch.h"
606#include "stringlib/partition.h"
607#include "stringlib/split.h"
608#include "stringlib/count.h"
609#include "stringlib/find.h"
610#include "stringlib/replace.h"
611#include "stringlib/find_max_char.h"
612#include "stringlib/localeutil.h"
613#include "stringlib/undef.h"
614
615#include "stringlib/ucs2lib.h"
616#include "stringlib/fastsearch.h"
617#include "stringlib/partition.h"
618#include "stringlib/split.h"
619#include "stringlib/count.h"
620#include "stringlib/find.h"
621#include "stringlib/replace.h"
622#include "stringlib/find_max_char.h"
623#include "stringlib/localeutil.h"
624#include "stringlib/undef.h"
625
626#include "stringlib/ucs4lib.h"
627#include "stringlib/fastsearch.h"
628#include "stringlib/partition.h"
629#include "stringlib/split.h"
630#include "stringlib/count.h"
631#include "stringlib/find.h"
632#include "stringlib/replace.h"
633#include "stringlib/find_max_char.h"
634#include "stringlib/localeutil.h"
635#include "stringlib/undef.h"
636
637#include "stringlib/unicodedefs.h"
638#include "stringlib/fastsearch.h"
639#include "stringlib/count.h"
640#include "stringlib/find.h"
641#include "stringlib/undef.h"
642
643/* --- Unicode Object ----------------------------------------------------- */
644
/* Forward declaration; fixfct is a callback that receives a string object
   and returns a Py_UCS4.
   NOTE(review): the meaning of that return value is defined with the
   implementation, which is not part of this section. */
static PyObject *
fixup(PyObject *self, Py_UCS4 (*fixfct)(PyObject *s));
647
648Py_LOCAL_INLINE(Py_ssize_t) findchar(void *s, int kind,
649                                     Py_ssize_t size, Py_UCS4 ch,
650                                     int direction)
651{
652    int mode = (direction == 1) ? FAST_SEARCH : FAST_RSEARCH;
653
654    switch (kind) {
655    case PyUnicode_1BYTE_KIND:
656        {
657            Py_UCS1 ch1 = (Py_UCS1) ch;
658            if (ch1 == ch)
659                return ucs1lib_fastsearch((Py_UCS1 *) s, size, &ch1, 1, 0, mode);
660            else
661                return -1;
662        }
663    case PyUnicode_2BYTE_KIND:
664        {
665            Py_UCS2 ch2 = (Py_UCS2) ch;
666            if (ch2 == ch)
667                return ucs2lib_fastsearch((Py_UCS2 *) s, size, &ch2, 1, 0, mode);
668            else
669                return -1;
670        }
671    case PyUnicode_4BYTE_KIND:
672        return ucs4lib_fastsearch((Py_UCS4 *) s, size, &ch, 1, 0, mode);
673    default:
674        assert(0);
675        return -1;
676    }
677}
678
#ifdef Py_DEBUG
/* Fill the newly added part of a Unicode string with 0xff bytes so that
   bugs caused by reading uninitialized characters are detected earlier.

   _PyUnicode_CheckConsistency(str, 1) detects invalid characters, at least for
   ASCII and UCS-4 strings. U+00FF is invalid in ASCII and U+FFFFFFFF is an
   invalid character in Unicode 6.0.

   Only the characters from old_length up to the current length are
   overwritten; nothing happens if the string did not grow. */
static void
unicode_fill_invalid(PyObject *unicode, Py_ssize_t old_length)
{
    /* The kind value is used as the width of one character in bytes. */
    const int char_width = PyUnicode_KIND(unicode);
    Py_UCS1 *base = PyUnicode_1BYTE_DATA(unicode);
    const Py_ssize_t new_length = _PyUnicode_LENGTH(unicode);

    if (new_length > old_length) {
        memset(base + old_length * char_width, 0xff,
               (new_length - old_length) * char_width);
    }
}
#endif
697
698static PyObject*
699resize_compact(PyObject *unicode, Py_ssize_t length)
700{
701    Py_ssize_t char_size;
702    Py_ssize_t struct_size;
703    Py_ssize_t new_size;
704    int share_wstr;
705    PyObject *new_unicode;
706#ifdef Py_DEBUG
707    Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
708#endif
709
710    assert(unicode_modifiable(unicode));
711    assert(PyUnicode_IS_READY(unicode));
712    assert(PyUnicode_IS_COMPACT(unicode));
713
714    char_size = PyUnicode_KIND(unicode);
715    if (PyUnicode_IS_ASCII(unicode))
716        struct_size = sizeof(PyASCIIObject);
717    else
718        struct_size = sizeof(PyCompactUnicodeObject);
719    share_wstr = _PyUnicode_SHARE_WSTR(unicode);
720
721    if (length > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1)) {
722        PyErr_NoMemory();
723        return NULL;
724    }
725    new_size = (struct_size + (length + 1) * char_size);
726
727    _Py_DEC_REFTOTAL;
728    _Py_ForgetReference(unicode);
729
730    new_unicode = (PyObject *)PyObject_REALLOC((char *)unicode, new_size);
731    if (new_unicode == NULL) {
732        _Py_NewReference(unicode);
733        PyErr_NoMemory();
734        return NULL;
735    }
736    unicode = new_unicode;
737    _Py_NewReference(unicode);
738
739    _PyUnicode_LENGTH(unicode) = length;
740    if (share_wstr) {
741        _PyUnicode_WSTR(unicode) = PyUnicode_DATA(unicode);
742        if (!PyUnicode_IS_ASCII(unicode))
743            _PyUnicode_WSTR_LENGTH(unicode) = length;
744    }
745    else if (_PyUnicode_HAS_WSTR_MEMORY(unicode)) {
746        PyObject_DEL(_PyUnicode_WSTR(unicode));
747        _PyUnicode_WSTR(unicode) = NULL;
748    }
749#ifdef Py_DEBUG
750    unicode_fill_invalid(unicode, old_length);
751#endif
752    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
753                    length, 0);
754    assert(_PyUnicode_CheckConsistency(unicode, 0));
755    return unicode;
756}
757
/* Resize a non-compact string with a reference count of 1 (both asserted)
   to `length` characters without replacing the object itself.

   For a ready string the character data buffer is reallocated, and the
   wstr/utf8 pointers are updated when they alias it.  A wstr buffer that
   is separate from the data (always the case for a string that is not
   ready) is reallocated as well.

   Returns 0 on success, -1 with MemoryError set on failure. */
static int
resize_inplace(PyObject *unicode, Py_ssize_t length)
{
    wchar_t *wstr;
    Py_ssize_t new_size;
    assert(!PyUnicode_IS_COMPACT(unicode));
    assert(Py_REFCNT(unicode) == 1);

    if (PyUnicode_IS_READY(unicode)) {
        Py_ssize_t char_size;
        int share_wstr, share_utf8;
        void *data;
#ifdef Py_DEBUG
        Py_ssize_t old_length = _PyUnicode_LENGTH(unicode);
#endif

        data = _PyUnicode_DATA_ANY(unicode);
        char_size = PyUnicode_KIND(unicode);
        /* Record the aliasing before the data pointer changes. */
        share_wstr = _PyUnicode_SHARE_WSTR(unicode);
        share_utf8 = _PyUnicode_SHARE_UTF8(unicode);

        /* Guard (length + 1) * char_size against overflow. */
        if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
            PyErr_NoMemory();
            return -1;
        }
        new_size = (length + 1) * char_size;

        /* A separately allocated UTF-8 cache is discarded. */
        if (!share_utf8 && _PyUnicode_HAS_UTF8_MEMORY(unicode))
        {
            PyObject_DEL(_PyUnicode_UTF8(unicode));
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }

        /* On failure the object still holds the old, untouched buffer. */
        data = (PyObject *)PyObject_REALLOC(data, new_size);
        if (data == NULL) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = data;
        /* Re-point the aliases at the reallocated buffer. */
        if (share_wstr) {
            _PyUnicode_WSTR(unicode) = data;
            _PyUnicode_WSTR_LENGTH(unicode) = length;
        }
        if (share_utf8) {
            _PyUnicode_UTF8(unicode) = data;
            _PyUnicode_UTF8_LENGTH(unicode) = length;
        }
        _PyUnicode_LENGTH(unicode) = length;
        /* Null-terminate the character data at the new length. */
        PyUnicode_WRITE(PyUnicode_KIND(unicode), data, length, 0);
#ifdef Py_DEBUG
        unicode_fill_invalid(unicode, old_length);
#endif
        /* Done unless a separate wstr buffer also has to be resized. */
        if (share_wstr || _PyUnicode_WSTR(unicode) == NULL) {
            assert(_PyUnicode_CheckConsistency(unicode, 0));
            return 0;
        }
    }
    assert(_PyUnicode_WSTR(unicode) != NULL);

    /* check for integer overflow */
    if (length > PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
        PyErr_NoMemory();
        return -1;
    }
    new_size = sizeof(wchar_t) * (length + 1);
    wstr =  _PyUnicode_WSTR(unicode);
    /* On failure the object still holds the old wstr buffer. */
    wstr = PyObject_REALLOC(wstr, new_size);
    if (!wstr) {
        PyErr_NoMemory();
        return -1;
    }
    _PyUnicode_WSTR(unicode) = wstr;
    _PyUnicode_WSTR(unicode)[length] = 0;
    _PyUnicode_WSTR_LENGTH(unicode) = length;
    assert(_PyUnicode_CheckConsistency(unicode, 0));
    return 0;
}
836
837static PyObject*
838resize_copy(PyObject *unicode, Py_ssize_t length)
839{
840    Py_ssize_t copy_length;
841    if (_PyUnicode_KIND(unicode) != PyUnicode_WCHAR_KIND) {
842        PyObject *copy;
843
844        if (PyUnicode_READY(unicode) == -1)
845            return NULL;
846
847        copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
848        if (copy == NULL)
849            return NULL;
850
851        copy_length = Py_MIN(length, PyUnicode_GET_LENGTH(unicode));
852        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, copy_length);
853        return copy;
854    }
855    else {
856        PyObject *w;
857
858        w = (PyObject*)_PyUnicode_New(length);
859        if (w == NULL)
860            return NULL;
861        copy_length = _PyUnicode_WSTR_LENGTH(unicode);
862        copy_length = Py_MIN(copy_length, length);
863        Py_MEMCPY(_PyUnicode_WSTR(w), _PyUnicode_WSTR(unicode),
864                  copy_length * sizeof(wchar_t));
865        return w;
866    }
867}
868
869/* We allocate one more byte to make sure the string is
870   Ux0000 terminated; some code (e.g. new_identifier)
871   relies on that.
872
873   XXX This allocator could further be enhanced by assuring that the
874   free list never reduces its size below 1.
875
876*/
877
878static PyUnicodeObject *
879_PyUnicode_New(Py_ssize_t length)
880{
881    PyUnicodeObject *unicode;
882    size_t new_size;
883
884    /* Optimization for empty strings */
885    if (length == 0 && unicode_empty != NULL) {
886        Py_INCREF(unicode_empty);
887        return (PyUnicodeObject*)unicode_empty;
888    }
889
890    /* Ensure we won't overflow the size. */
891    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
892        return (PyUnicodeObject *)PyErr_NoMemory();
893    }
894    if (length < 0) {
895        PyErr_SetString(PyExc_SystemError,
896                        "Negative size passed to _PyUnicode_New");
897        return NULL;
898    }
899
900    unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
901    if (unicode == NULL)
902        return NULL;
903    new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
904
905    _PyUnicode_WSTR_LENGTH(unicode) = length;
906    _PyUnicode_HASH(unicode) = -1;
907    _PyUnicode_STATE(unicode).interned = 0;
908    _PyUnicode_STATE(unicode).kind = 0;
909    _PyUnicode_STATE(unicode).compact = 0;
910    _PyUnicode_STATE(unicode).ready = 0;
911    _PyUnicode_STATE(unicode).ascii = 0;
912    _PyUnicode_DATA_ANY(unicode) = NULL;
913    _PyUnicode_LENGTH(unicode) = 0;
914    _PyUnicode_UTF8(unicode) = NULL;
915    _PyUnicode_UTF8_LENGTH(unicode) = 0;
916
917    _PyUnicode_WSTR(unicode) = (Py_UNICODE*) PyObject_MALLOC(new_size);
918    if (!_PyUnicode_WSTR(unicode)) {
919        Py_DECREF(unicode);
920        PyErr_NoMemory();
921        return NULL;
922    }
923
924    /* Initialize the first element to guard against cases where
925     * the caller fails before initializing str -- unicode_resize()
926     * reads str[0], and the Keep-Alive optimization can keep memory
927     * allocated for str alive across a call to unicode_dealloc(unicode).
928     * We don't want unicode_resize to read uninitialized memory in
929     * that case.
930     */
931    _PyUnicode_WSTR(unicode)[0] = 0;
932    _PyUnicode_WSTR(unicode)[length] = 0;
933
934    assert(_PyUnicode_CheckConsistency((PyObject *)unicode, 0));
935    return unicode;
936}
937
938static const char*
939unicode_kind_name(PyObject *unicode)
940{
941    /* don't check consistency: unicode_kind_name() is called from
942       _PyUnicode_Dump() */
943    if (!PyUnicode_IS_COMPACT(unicode))
944    {
945        if (!PyUnicode_IS_READY(unicode))
946            return "wstr";
947        switch (PyUnicode_KIND(unicode))
948        {
949        case PyUnicode_1BYTE_KIND:
950            if (PyUnicode_IS_ASCII(unicode))
951                return "legacy ascii";
952            else
953                return "legacy latin1";
954        case PyUnicode_2BYTE_KIND:
955            return "legacy UCS2";
956        case PyUnicode_4BYTE_KIND:
957            return "legacy UCS4";
958        default:
959            return "<legacy invalid kind>";
960        }
961    }
962    assert(PyUnicode_IS_READY(unicode));
963    switch (PyUnicode_KIND(unicode)) {
964    case PyUnicode_1BYTE_KIND:
965        if (PyUnicode_IS_ASCII(unicode))
966            return "ascii";
967        else
968            return "latin1";
969    case PyUnicode_2BYTE_KIND:
970        return "UCS2";
971    case PyUnicode_4BYTE_KIND:
972        return "UCS4";
973    default:
974        return "<invalid compact kind>";
975    }
976}
977
978#ifdef Py_DEBUG
979/* Functions wrapping macros for use in debugger */
/* Debugger helper: callable wrapper around the PyUnicode_UTF8() macro. */
char *
_PyUnicode_utf8(void *unicode)
{
    char *utf8 = PyUnicode_UTF8(unicode);
    return utf8;
}
983
/* Debugger helper: callable wrapper around the _PyUnicode_COMPACT_DATA()
   macro. */
void *
_PyUnicode_compact_data(void *unicode)
{
    void *data = _PyUnicode_COMPACT_DATA(unicode);
    return data;
}
987void *_PyUnicode_data(void *unicode){
988    printf("obj %p\n", unicode);
989    printf("compact %d\n", PyUnicode_IS_COMPACT(unicode));
990    printf("compact ascii %d\n", PyUnicode_IS_COMPACT_ASCII(unicode));
991    printf("ascii op %p\n", ((void*)((PyASCIIObject*)(unicode) + 1)));
992    printf("compact op %p\n", ((void*)((PyCompactUnicodeObject*)(unicode) + 1)));
993    printf("compact data %p\n", _PyUnicode_COMPACT_DATA(unicode));
994    return PyUnicode_DATA(unicode);
995}
996
997void
998_PyUnicode_Dump(PyObject *op)
999{
1000    PyASCIIObject *ascii = (PyASCIIObject *)op;
1001    PyCompactUnicodeObject *compact = (PyCompactUnicodeObject *)op;
1002    PyUnicodeObject *unicode = (PyUnicodeObject *)op;
1003    void *data;
1004
1005    if (ascii->state.compact)
1006    {
1007        if (ascii->state.ascii)
1008            data = (ascii + 1);
1009        else
1010            data = (compact + 1);
1011    }
1012    else
1013        data = unicode->data.any;
1014    printf("%s: len=%" PY_FORMAT_SIZE_T "u, ",
1015           unicode_kind_name(op), ascii->length);
1016
1017    if (ascii->wstr == data)
1018        printf("shared ");
1019    printf("wstr=%p", ascii->wstr);
1020
1021    if (!(ascii->state.ascii == 1 && ascii->state.compact == 1)) {
1022        printf(" (%" PY_FORMAT_SIZE_T "u), ", compact->wstr_length);
1023        if (!ascii->state.compact && compact->utf8 == unicode->data.any)
1024            printf("shared ");
1025        printf("utf8=%p (%" PY_FORMAT_SIZE_T "u)",
1026               compact->utf8, compact->utf8_length);
1027    }
1028    printf(", data=%p\n", data);
1029}
1030#endif
1031
1032PyObject *
1033PyUnicode_New(Py_ssize_t size, Py_UCS4 maxchar)
1034{
1035    PyObject *obj;
1036    PyCompactUnicodeObject *unicode;
1037    void *data;
1038    enum PyUnicode_Kind kind;
1039    int is_sharing, is_ascii;
1040    Py_ssize_t char_size;
1041    Py_ssize_t struct_size;
1042
1043    /* Optimization for empty strings */
1044    if (size == 0 && unicode_empty != NULL) {
1045        Py_INCREF(unicode_empty);
1046        return unicode_empty;
1047    }
1048
1049    is_ascii = 0;
1050    is_sharing = 0;
1051    struct_size = sizeof(PyCompactUnicodeObject);
1052    if (maxchar < 128) {
1053        kind = PyUnicode_1BYTE_KIND;
1054        char_size = 1;
1055        is_ascii = 1;
1056        struct_size = sizeof(PyASCIIObject);
1057    }
1058    else if (maxchar < 256) {
1059        kind = PyUnicode_1BYTE_KIND;
1060        char_size = 1;
1061    }
1062    else if (maxchar < 65536) {
1063        kind = PyUnicode_2BYTE_KIND;
1064        char_size = 2;
1065        if (sizeof(wchar_t) == 2)
1066            is_sharing = 1;
1067    }
1068    else {
1069        if (maxchar > MAX_UNICODE) {
1070            PyErr_SetString(PyExc_SystemError,
1071                            "invalid maximum character passed to PyUnicode_New");
1072            return NULL;
1073        }
1074        kind = PyUnicode_4BYTE_KIND;
1075        char_size = 4;
1076        if (sizeof(wchar_t) == 4)
1077            is_sharing = 1;
1078    }
1079
1080    /* Ensure we won't overflow the size. */
1081    if (size < 0) {
1082        PyErr_SetString(PyExc_SystemError,
1083                        "Negative size passed to PyUnicode_New");
1084        return NULL;
1085    }
1086    if (size > ((PY_SSIZE_T_MAX - struct_size) / char_size - 1))
1087        return PyErr_NoMemory();
1088
1089    /* Duplicated allocation code from _PyObject_New() instead of a call to
1090     * PyObject_New() so we are able to allocate space for the object and
1091     * it's data buffer.
1092     */
1093    obj = (PyObject *) PyObject_MALLOC(struct_size + (size + 1) * char_size);
1094    if (obj == NULL)
1095        return PyErr_NoMemory();
1096    obj = PyObject_INIT(obj, &PyUnicode_Type);
1097    if (obj == NULL)
1098        return NULL;
1099
1100    unicode = (PyCompactUnicodeObject *)obj;
1101    if (is_ascii)
1102        data = ((PyASCIIObject*)obj) + 1;
1103    else
1104        data = unicode + 1;
1105    _PyUnicode_LENGTH(unicode) = size;
1106    _PyUnicode_HASH(unicode) = -1;
1107    _PyUnicode_STATE(unicode).interned = 0;
1108    _PyUnicode_STATE(unicode).kind = kind;
1109    _PyUnicode_STATE(unicode).compact = 1;
1110    _PyUnicode_STATE(unicode).ready = 1;
1111    _PyUnicode_STATE(unicode).ascii = is_ascii;
1112    if (is_ascii) {
1113        ((char*)data)[size] = 0;
1114        _PyUnicode_WSTR(unicode) = NULL;
1115    }
1116    else if (kind == PyUnicode_1BYTE_KIND) {
1117        ((char*)data)[size] = 0;
1118        _PyUnicode_WSTR(unicode) = NULL;
1119        _PyUnicode_WSTR_LENGTH(unicode) = 0;
1120        unicode->utf8 = NULL;
1121        unicode->utf8_length = 0;
1122    }
1123    else {
1124        unicode->utf8 = NULL;
1125        unicode->utf8_length = 0;
1126        if (kind == PyUnicode_2BYTE_KIND)
1127            ((Py_UCS2*)data)[size] = 0;
1128        else /* kind == PyUnicode_4BYTE_KIND */
1129            ((Py_UCS4*)data)[size] = 0;
1130        if (is_sharing) {
1131            _PyUnicode_WSTR_LENGTH(unicode) = size;
1132            _PyUnicode_WSTR(unicode) = (wchar_t *)data;
1133        }
1134        else {
1135            _PyUnicode_WSTR_LENGTH(unicode) = 0;
1136            _PyUnicode_WSTR(unicode) = NULL;
1137        }
1138    }
1139#ifdef Py_DEBUG
1140    unicode_fill_invalid((PyObject*)unicode, 0);
1141#endif
1142    assert(_PyUnicode_CheckConsistency((PyObject*)unicode, 0));
1143    return obj;
1144}
1145
1146#if SIZEOF_WCHAR_T == 2
1147/* Helper function to convert a 16-bits wchar_t representation to UCS4, this
1148   will decode surrogate pairs, the other conversions are implemented as macros
1149   for efficiency.
1150
1151   This function assumes that unicode can hold one more code point than wstr
1152   characters for a terminating null character. */
1153static void
1154unicode_convert_wchar_to_ucs4(const wchar_t *begin, const wchar_t *end,
1155                              PyObject *unicode)
1156{
1157    const wchar_t *iter;
1158    Py_UCS4 *ucs4_out;
1159
1160    assert(unicode != NULL);
1161    assert(_PyUnicode_CHECK(unicode));
1162    assert(_PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1163    ucs4_out = PyUnicode_4BYTE_DATA(unicode);
1164
1165    for (iter = begin; iter < end; ) {
1166        assert(ucs4_out < (PyUnicode_4BYTE_DATA(unicode) +
1167                           _PyUnicode_GET_LENGTH(unicode)));
1168        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1169            && (iter+1) < end
1170            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1171        {
1172            *ucs4_out++ = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1173            iter += 2;
1174        }
1175        else {
1176            *ucs4_out++ = *iter;
1177            iter++;
1178        }
1179    }
1180    assert(ucs4_out == (PyUnicode_4BYTE_DATA(unicode) +
1181                        _PyUnicode_GET_LENGTH(unicode)));
1182
1183}
1184#endif
1185
1186static int
1187unicode_check_modifiable(PyObject *unicode)
1188{
1189    if (!unicode_modifiable(unicode)) {
1190        PyErr_SetString(PyExc_SystemError,
1191                        "Cannot modify a string currently used");
1192        return -1;
1193    }
1194    return 0;
1195}
1196
1197static int
1198_copy_characters(PyObject *to, Py_ssize_t to_start,
1199                 PyObject *from, Py_ssize_t from_start,
1200                 Py_ssize_t how_many, int check_maxchar)
1201{
1202    unsigned int from_kind, to_kind;
1203    void *from_data, *to_data;
1204
1205    assert(0 <= how_many);
1206    assert(0 <= from_start);
1207    assert(0 <= to_start);
1208    assert(PyUnicode_Check(from));
1209    assert(PyUnicode_IS_READY(from));
1210    assert(from_start + how_many <= PyUnicode_GET_LENGTH(from));
1211
1212    assert(PyUnicode_Check(to));
1213    assert(PyUnicode_IS_READY(to));
1214    assert(to_start + how_many <= PyUnicode_GET_LENGTH(to));
1215
1216    if (how_many == 0)
1217        return 0;
1218
1219    from_kind = PyUnicode_KIND(from);
1220    from_data = PyUnicode_DATA(from);
1221    to_kind = PyUnicode_KIND(to);
1222    to_data = PyUnicode_DATA(to);
1223
1224#ifdef Py_DEBUG
1225    if (!check_maxchar
1226        && PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to))
1227    {
1228        const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1229        Py_UCS4 ch;
1230        Py_ssize_t i;
1231        for (i=0; i < how_many; i++) {
1232            ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1233            assert(ch <= to_maxchar);
1234        }
1235    }
1236#endif
1237
1238    if (from_kind == to_kind) {
1239        if (check_maxchar
1240            && !PyUnicode_IS_ASCII(from) && PyUnicode_IS_ASCII(to))
1241        {
1242            /* Writing Latin-1 characters into an ASCII string requires to
1243               check that all written characters are pure ASCII */
1244            Py_UCS4 max_char;
1245            max_char = ucs1lib_find_max_char(from_data,
1246                                             (Py_UCS1*)from_data + how_many);
1247            if (max_char >= 128)
1248                return -1;
1249        }
1250        Py_MEMCPY((char*)to_data + to_kind * to_start,
1251                  (char*)from_data + from_kind * from_start,
1252                  to_kind * how_many);
1253    }
1254    else if (from_kind == PyUnicode_1BYTE_KIND
1255             && to_kind == PyUnicode_2BYTE_KIND)
1256    {
1257        _PyUnicode_CONVERT_BYTES(
1258            Py_UCS1, Py_UCS2,
1259            PyUnicode_1BYTE_DATA(from) + from_start,
1260            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1261            PyUnicode_2BYTE_DATA(to) + to_start
1262            );
1263    }
1264    else if (from_kind == PyUnicode_1BYTE_KIND
1265             && to_kind == PyUnicode_4BYTE_KIND)
1266    {
1267        _PyUnicode_CONVERT_BYTES(
1268            Py_UCS1, Py_UCS4,
1269            PyUnicode_1BYTE_DATA(from) + from_start,
1270            PyUnicode_1BYTE_DATA(from) + from_start + how_many,
1271            PyUnicode_4BYTE_DATA(to) + to_start
1272            );
1273    }
1274    else if (from_kind == PyUnicode_2BYTE_KIND
1275             && to_kind == PyUnicode_4BYTE_KIND)
1276    {
1277        _PyUnicode_CONVERT_BYTES(
1278            Py_UCS2, Py_UCS4,
1279            PyUnicode_2BYTE_DATA(from) + from_start,
1280            PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1281            PyUnicode_4BYTE_DATA(to) + to_start
1282            );
1283    }
1284    else {
1285        assert (PyUnicode_MAX_CHAR_VALUE(from) > PyUnicode_MAX_CHAR_VALUE(to));
1286
1287        if (!check_maxchar) {
1288            if (from_kind == PyUnicode_2BYTE_KIND
1289                && to_kind == PyUnicode_1BYTE_KIND)
1290            {
1291                _PyUnicode_CONVERT_BYTES(
1292                    Py_UCS2, Py_UCS1,
1293                    PyUnicode_2BYTE_DATA(from) + from_start,
1294                    PyUnicode_2BYTE_DATA(from) + from_start + how_many,
1295                    PyUnicode_1BYTE_DATA(to) + to_start
1296                    );
1297            }
1298            else if (from_kind == PyUnicode_4BYTE_KIND
1299                     && to_kind == PyUnicode_1BYTE_KIND)
1300            {
1301                _PyUnicode_CONVERT_BYTES(
1302                    Py_UCS4, Py_UCS1,
1303                    PyUnicode_4BYTE_DATA(from) + from_start,
1304                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1305                    PyUnicode_1BYTE_DATA(to) + to_start
1306                    );
1307            }
1308            else if (from_kind == PyUnicode_4BYTE_KIND
1309                     && to_kind == PyUnicode_2BYTE_KIND)
1310            {
1311                _PyUnicode_CONVERT_BYTES(
1312                    Py_UCS4, Py_UCS2,
1313                    PyUnicode_4BYTE_DATA(from) + from_start,
1314                    PyUnicode_4BYTE_DATA(from) + from_start + how_many,
1315                    PyUnicode_2BYTE_DATA(to) + to_start
1316                    );
1317            }
1318            else {
1319                assert(0);
1320                return -1;
1321            }
1322        }
1323        else {
1324            const Py_UCS4 to_maxchar = PyUnicode_MAX_CHAR_VALUE(to);
1325            Py_UCS4 ch;
1326            Py_ssize_t i;
1327
1328            for (i=0; i < how_many; i++) {
1329                ch = PyUnicode_READ(from_kind, from_data, from_start + i);
1330                if (ch > to_maxchar)
1331                    return -1;
1332                PyUnicode_WRITE(to_kind, to_data, to_start + i, ch);
1333            }
1334        }
1335    }
1336    return 0;
1337}
1338
1339void
1340_PyUnicode_FastCopyCharacters(
1341    PyObject *to, Py_ssize_t to_start,
1342    PyObject *from, Py_ssize_t from_start, Py_ssize_t how_many)
1343{
1344    (void)_copy_characters(to, to_start, from, from_start, how_many, 0);
1345}
1346
1347Py_ssize_t
1348PyUnicode_CopyCharacters(PyObject *to, Py_ssize_t to_start,
1349                         PyObject *from, Py_ssize_t from_start,
1350                         Py_ssize_t how_many)
1351{
1352    int err;
1353
1354    if (!PyUnicode_Check(from) || !PyUnicode_Check(to)) {
1355        PyErr_BadInternalCall();
1356        return -1;
1357    }
1358
1359    if (PyUnicode_READY(from) == -1)
1360        return -1;
1361    if (PyUnicode_READY(to) == -1)
1362        return -1;
1363
1364    if (from_start < 0) {
1365        PyErr_SetString(PyExc_IndexError, "string index out of range");
1366        return -1;
1367    }
1368    if (to_start < 0) {
1369        PyErr_SetString(PyExc_IndexError, "string index out of range");
1370        return -1;
1371    }
1372    how_many = Py_MIN(PyUnicode_GET_LENGTH(from), how_many);
1373    if (to_start + how_many > PyUnicode_GET_LENGTH(to)) {
1374        PyErr_Format(PyExc_SystemError,
1375                     "Cannot write %zi characters at %zi "
1376                     "in a string of %zi characters",
1377                     how_many, to_start, PyUnicode_GET_LENGTH(to));
1378        return -1;
1379    }
1380
1381    if (how_many == 0)
1382        return 0;
1383
1384    if (unicode_check_modifiable(to))
1385        return -1;
1386
1387    err = _copy_characters(to, to_start, from, from_start, how_many, 1);
1388    if (err) {
1389        PyErr_Format(PyExc_SystemError,
1390                     "Cannot copy %s characters "
1391                     "into a string of %s characters",
1392                     unicode_kind_name(from),
1393                     unicode_kind_name(to));
1394        return -1;
1395    }
1396    return how_many;
1397}
1398
1399/* Find the maximum code point and count the number of surrogate pairs so a
1400   correct string length can be computed before converting a string to UCS4.
1401   This function counts single surrogates as a character and not as a pair.
1402
1403   Return 0 on success, or -1 on error. */
1404static int
1405find_maxchar_surrogates(const wchar_t *begin, const wchar_t *end,
1406                        Py_UCS4 *maxchar, Py_ssize_t *num_surrogates)
1407{
1408    const wchar_t *iter;
1409    Py_UCS4 ch;
1410
1411    assert(num_surrogates != NULL && maxchar != NULL);
1412    *num_surrogates = 0;
1413    *maxchar = 0;
1414
1415    for (iter = begin; iter < end; ) {
1416#if SIZEOF_WCHAR_T == 2
1417        if (Py_UNICODE_IS_HIGH_SURROGATE(iter[0])
1418            && (iter+1) < end
1419            && Py_UNICODE_IS_LOW_SURROGATE(iter[1]))
1420        {
1421            ch = Py_UNICODE_JOIN_SURROGATES(iter[0], iter[1]);
1422            ++(*num_surrogates);
1423            iter += 2;
1424        }
1425        else
1426#endif
1427        {
1428            ch = *iter;
1429            iter++;
1430        }
1431        if (ch > *maxchar) {
1432            *maxchar = ch;
1433            if (*maxchar > MAX_UNICODE) {
1434                PyErr_Format(PyExc_ValueError,
1435                             "character U+%x is not in range [U+0000; U+10ffff]",
1436                             ch);
1437                return -1;
1438            }
1439        }
1440    }
1441    return 0;
1442}
1443
/* Make a legacy wstr-only string "ready" by building its canonical
   representation from the wchar_t buffer.

   The buffer is scanned for its largest code point, then converted to the
   narrowest kind able to hold it: 1 byte if maxchar < 256 (flagged ASCII if
   maxchar < 128), 2 bytes if maxchar < 65536, 4 bytes otherwise.  When the
   width of wchar_t matches the selected kind, the wstr buffer itself becomes
   the data buffer; otherwise a new buffer is allocated and the wstr buffer
   is freed.

   Return 0 on success.  Return -1 with an exception set if the buffer holds
   an invalid code point (see find_maxchar_surrogates()) or if memory runs
   out. */
int
_PyUnicode_Ready(PyObject *unicode)
{
    wchar_t *end;
    Py_UCS4 maxchar = 0;
    Py_ssize_t num_surrogates;
#if SIZEOF_WCHAR_T == 2
    Py_ssize_t length_wo_surrogates;
#endif

    /* _PyUnicode_Ready() is only intended for old-style API usage where
       strings were created using _PyObject_New() and where no canonical
       representation (the str field) has been set yet aka strings
       which are not yet ready. */
    assert(_PyUnicode_CHECK(unicode));
    assert(_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND);
    assert(_PyUnicode_WSTR(unicode) != NULL);
    assert(_PyUnicode_DATA_ANY(unicode) == NULL);
    assert(_PyUnicode_UTF8(unicode) == NULL);
    /* Actually, it should neither be interned nor be anything else: */
    assert(_PyUnicode_STATE(unicode).interned == SSTATE_NOT_INTERNED);

    /* Determine the largest code point and the number of surrogate pairs
       to select the target kind. */
    end = _PyUnicode_WSTR(unicode) + _PyUnicode_WSTR_LENGTH(unicode);
    if (find_maxchar_surrogates(_PyUnicode_WSTR(unicode), end,
                                &maxchar, &num_surrogates) == -1)
        return -1;

    if (maxchar < 256) {
        /* 1-byte kind: always needs a new buffer (length + 1 bytes for the
           trailing null); the wstr buffer is released afterwards. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(_PyUnicode_WSTR_LENGTH(unicode) + 1);
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, unsigned char,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_1BYTE_DATA(unicode));
        PyUnicode_1BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_1BYTE_KIND;
        if (maxchar < 128) {
            /* ASCII: the data buffer doubles as the UTF-8 representation */
            _PyUnicode_STATE(unicode).ascii = 1;
            _PyUnicode_UTF8(unicode) = _PyUnicode_DATA_ANY(unicode);
            _PyUnicode_UTF8_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        }
        else {
            _PyUnicode_STATE(unicode).ascii = 0;
            _PyUnicode_UTF8(unicode) = NULL;
            _PyUnicode_UTF8_LENGTH(unicode) = 0;
        }
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
    }
    /* In this case we might have to convert down from 4-byte native
       wchar_t to 2-byte unicode. */
    else if (maxchar < 65536) {
        assert(num_surrogates == 0 &&
               "FindMaxCharAndNumSurrogatePairs() messed up");

#if SIZEOF_WCHAR_T == 2
        /* We can share representations and are done. */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
#else
        /* sizeof(wchar_t) == 4 */
        /* Narrow into a new 2-byte buffer, then release the wstr buffer. */
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(
            2 * (_PyUnicode_WSTR_LENGTH(unicode) + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_CONVERT_BYTES(wchar_t, Py_UCS2,
                                _PyUnicode_WSTR(unicode), end,
                                PyUnicode_2BYTE_DATA(unicode));
        PyUnicode_2BYTE_DATA(unicode)[_PyUnicode_WSTR_LENGTH(unicode)] = '\0';
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_STATE(unicode).kind = PyUnicode_2BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#endif
    }
    /* maxchar exceeds 16 bits, we need 4 bytes for unicode characters */
    else {
#if SIZEOF_WCHAR_T == 2
        /* in case the native representation is 2-bytes, we need to allocate a
           new normalized 4-byte version. */
        /* each surrogate pair collapses into a single code point */
        length_wo_surrogates = _PyUnicode_WSTR_LENGTH(unicode) - num_surrogates;
        /* guard the 4 * (length + 1) allocation size against overflow */
        if (length_wo_surrogates > PY_SSIZE_T_MAX / 4 - 1) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_DATA_ANY(unicode) = PyObject_MALLOC(4 * (length_wo_surrogates + 1));
        if (!_PyUnicode_DATA_ANY(unicode)) {
            PyErr_NoMemory();
            return -1;
        }
        _PyUnicode_LENGTH(unicode) = length_wo_surrogates;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        /* unicode_convert_wchar_to_ucs4() requires a ready string */
        _PyUnicode_STATE(unicode).ready = 1;
        unicode_convert_wchar_to_ucs4(_PyUnicode_WSTR(unicode), end, unicode);
        PyObject_FREE(_PyUnicode_WSTR(unicode));
        _PyUnicode_WSTR(unicode) = NULL;
        _PyUnicode_WSTR_LENGTH(unicode) = 0;
#else
        assert(num_surrogates == 0);

        /* wchar_t is already 4 bytes wide: share the wstr buffer as data */
        _PyUnicode_DATA_ANY(unicode) = _PyUnicode_WSTR(unicode);
        _PyUnicode_LENGTH(unicode) = _PyUnicode_WSTR_LENGTH(unicode);
        _PyUnicode_UTF8(unicode) = NULL;
        _PyUnicode_UTF8_LENGTH(unicode) = 0;
        _PyUnicode_STATE(unicode).kind = PyUnicode_4BYTE_KIND;
#endif
        /* terminate the 4-byte data in both configurations */
        PyUnicode_4BYTE_DATA(unicode)[_PyUnicode_LENGTH(unicode)] = '\0';
    }
    _PyUnicode_STATE(unicode).ready = 1;
    assert(_PyUnicode_CheckConsistency(unicode, 1));
    return 0;
}
1572
1573static void
1574unicode_dealloc(PyObject *unicode)
1575{
1576    switch (PyUnicode_CHECK_INTERNED(unicode)) {
1577    case SSTATE_NOT_INTERNED:
1578        break;
1579
1580    case SSTATE_INTERNED_MORTAL:
1581        /* revive dead object temporarily for DelItem */
1582        Py_REFCNT(unicode) = 3;
1583        if (PyDict_DelItem(interned, unicode) != 0)
1584            Py_FatalError(
1585                "deletion of interned string failed");
1586        break;
1587
1588    case SSTATE_INTERNED_IMMORTAL:
1589        Py_FatalError("Immortal interned string died.");
1590
1591    default:
1592        Py_FatalError("Inconsistent interned string state.");
1593    }
1594
1595    if (_PyUnicode_HAS_WSTR_MEMORY(unicode))
1596        PyObject_DEL(_PyUnicode_WSTR(unicode));
1597    if (_PyUnicode_HAS_UTF8_MEMORY(unicode))
1598        PyObject_DEL(_PyUnicode_UTF8(unicode));
1599    if (!PyUnicode_IS_COMPACT(unicode) && _PyUnicode_DATA_ANY(unicode))
1600        PyObject_DEL(_PyUnicode_DATA_ANY(unicode));
1601
1602    Py_TYPE(unicode)->tp_free(unicode);
1603}
1604
1605#ifdef Py_DEBUG
1606static int
1607unicode_is_singleton(PyObject *unicode)
1608{
1609    PyASCIIObject *ascii = (PyASCIIObject *)unicode;
1610    if (unicode == unicode_empty)
1611        return 1;
1612    if (ascii->state.kind != PyUnicode_WCHAR_KIND && ascii->length == 1)
1613    {
1614        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, 0);
1615        if (ch < 256 && unicode_latin1[ch] == unicode)
1616            return 1;
1617    }
1618    return 0;
1619}
1620#endif
1621
1622static int
1623unicode_modifiable(PyObject *unicode)
1624{
1625    assert(_PyUnicode_CHECK(unicode));
1626    if (Py_REFCNT(unicode) != 1)
1627        return 0;
1628    if (_PyUnicode_HASH(unicode) != -1)
1629        return 0;
1630    if (PyUnicode_CHECK_INTERNED(unicode))
1631        return 0;
1632    if (!PyUnicode_CheckExact(unicode))
1633        return 0;
1634#ifdef Py_DEBUG
1635    /* singleton refcount is greater than 1 */
1636    assert(!unicode_is_singleton(unicode));
1637#endif
1638    return 1;
1639}
1640
1641static int
1642unicode_resize(PyObject **p_unicode, Py_ssize_t length)
1643{
1644    PyObject *unicode;
1645    Py_ssize_t old_length;
1646
1647    assert(p_unicode != NULL);
1648    unicode = *p_unicode;
1649
1650    assert(unicode != NULL);
1651    assert(PyUnicode_Check(unicode));
1652    assert(0 <= length);
1653
1654    if (_PyUnicode_KIND(unicode) == PyUnicode_WCHAR_KIND)
1655        old_length = PyUnicode_WSTR_LENGTH(unicode);
1656    else
1657        old_length = PyUnicode_GET_LENGTH(unicode);
1658    if (old_length == length)
1659        return 0;
1660
1661    if (length == 0) {
1662        _Py_INCREF_UNICODE_EMPTY();
1663        if (!unicode_empty)
1664            return -1;
1665        Py_DECREF(*p_unicode);
1666        *p_unicode = unicode_empty;
1667        return 0;
1668    }
1669
1670    if (!unicode_modifiable(unicode)) {
1671        PyObject *copy = resize_copy(unicode, length);
1672        if (copy == NULL)
1673            return -1;
1674        Py_DECREF(*p_unicode);
1675        *p_unicode = copy;
1676        return 0;
1677    }
1678
1679    if (PyUnicode_IS_COMPACT(unicode)) {
1680        PyObject *new_unicode = resize_compact(unicode, length);
1681        if (new_unicode == NULL)
1682            return -1;
1683        *p_unicode = new_unicode;
1684        return 0;
1685    }
1686    return resize_inplace(unicode, length);
1687}
1688
1689int
1690PyUnicode_Resize(PyObject **p_unicode, Py_ssize_t length)
1691{
1692    PyObject *unicode;
1693    if (p_unicode == NULL) {
1694        PyErr_BadInternalCall();
1695        return -1;
1696    }
1697    unicode = *p_unicode;
1698    if (unicode == NULL || !PyUnicode_Check(unicode) || length < 0)
1699    {
1700        PyErr_BadInternalCall();
1701        return -1;
1702    }
1703    return unicode_resize(p_unicode, length);
1704}
1705
1706/* Copy a ASCII or latin1 char* string into a Python Unicode string.
1707
1708   WARNING: The function doesn't copy the terminating null character and
1709   doesn't check the maximum character (may write a latin1 character in an
1710   ASCII string). */
1711static void
1712unicode_write_cstr(PyObject *unicode, Py_ssize_t index,
1713                   const char *str, Py_ssize_t len)
1714{
1715    enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
1716    void *data = PyUnicode_DATA(unicode);
1717    const char *end = str + len;
1718
1719    switch (kind) {
1720    case PyUnicode_1BYTE_KIND: {
1721        assert(index + len <= PyUnicode_GET_LENGTH(unicode));
1722#ifdef Py_DEBUG
1723        if (PyUnicode_IS_ASCII(unicode)) {
1724            Py_UCS4 maxchar = ucs1lib_find_max_char(
1725                (const Py_UCS1*)str,
1726                (const Py_UCS1*)str + len);
1727            assert(maxchar < 128);
1728        }
1729#endif
1730        memcpy((char *) data + index, str, len);
1731        break;
1732    }
1733    case PyUnicode_2BYTE_KIND: {
1734        Py_UCS2 *start = (Py_UCS2 *)data + index;
1735        Py_UCS2 *ucs2 = start;
1736        assert(index <= PyUnicode_GET_LENGTH(unicode));
1737
1738        for (; str < end; ++ucs2, ++str)
1739            *ucs2 = (Py_UCS2)*str;
1740
1741        assert((ucs2 - start) <= PyUnicode_GET_LENGTH(unicode));
1742        break;
1743    }
1744    default: {
1745        Py_UCS4 *start = (Py_UCS4 *)data + index;
1746        Py_UCS4 *ucs4 = start;
1747        assert(kind == PyUnicode_4BYTE_KIND);
1748        assert(index <= PyUnicode_GET_LENGTH(unicode));
1749
1750        for (; str < end; ++ucs4, ++str)
1751            *ucs4 = (Py_UCS4)*str;
1752
1753        assert((ucs4 - start) <= PyUnicode_GET_LENGTH(unicode));
1754    }
1755    }
1756}
1757
1758static PyObject*
1759get_latin1_char(unsigned char ch)
1760{
1761    PyObject *unicode = unicode_latin1[ch];
1762    if (!unicode) {
1763        unicode = PyUnicode_New(1, ch);
1764        if (!unicode)
1765            return NULL;
1766        PyUnicode_1BYTE_DATA(unicode)[0] = ch;
1767        assert(_PyUnicode_CheckConsistency(unicode, 1));
1768        unicode_latin1[ch] = unicode;
1769    }
1770    Py_INCREF(unicode);
1771    return unicode;
1772}
1773
1774static PyObject*
1775unicode_char(Py_UCS4 ch)
1776{
1777    PyObject *unicode;
1778
1779    assert(ch <= MAX_UNICODE);
1780
1781    if (ch < 256)
1782        return get_latin1_char(ch);
1783
1784    unicode = PyUnicode_New(1, ch);
1785    if (unicode == NULL)
1786        return NULL;
1787    switch (PyUnicode_KIND(unicode)) {
1788    case PyUnicode_1BYTE_KIND:
1789        PyUnicode_1BYTE_DATA(unicode)[0] = (Py_UCS1)ch;
1790        break;
1791    case PyUnicode_2BYTE_KIND:
1792        PyUnicode_2BYTE_DATA(unicode)[0] = (Py_UCS2)ch;
1793        break;
1794    default:
1795        assert(PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND);
1796        PyUnicode_4BYTE_DATA(unicode)[0] = ch;
1797    }
1798    assert(_PyUnicode_CheckConsistency(unicode, 1));
1799    return unicode;
1800}
1801
1802PyObject *
1803PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
1804{
1805    PyObject *unicode;
1806    Py_UCS4 maxchar = 0;
1807    Py_ssize_t num_surrogates;
1808
1809    if (u == NULL)
1810        return (PyObject*)_PyUnicode_New(size);
1811
1812    /* If the Unicode data is known at construction time, we can apply
1813       some optimizations which share commonly used objects. */
1814
1815    /* Optimization for empty strings */
1816    if (size == 0)
1817        _Py_RETURN_UNICODE_EMPTY();
1818
1819    /* Single character Unicode objects in the Latin-1 range are
1820       shared when using this constructor */
1821    if (size == 1 && (Py_UCS4)*u < 256)
1822        return get_latin1_char((unsigned char)*u);
1823
1824    /* If not empty and not single character, copy the Unicode data
1825       into the new object */
1826    if (find_maxchar_surrogates(u, u + size,
1827                                &maxchar, &num_surrogates) == -1)
1828        return NULL;
1829
1830    unicode = PyUnicode_New(size - num_surrogates, maxchar);
1831    if (!unicode)
1832        return NULL;
1833
1834    switch (PyUnicode_KIND(unicode)) {
1835    case PyUnicode_1BYTE_KIND:
1836        _PyUnicode_CONVERT_BYTES(Py_UNICODE, unsigned char,
1837                                u, u + size, PyUnicode_1BYTE_DATA(unicode));
1838        break;
1839    case PyUnicode_2BYTE_KIND:
1840#if Py_UNICODE_SIZE == 2
1841        Py_MEMCPY(PyUnicode_2BYTE_DATA(unicode), u, size * 2);
1842#else
1843        _PyUnicode_CONVERT_BYTES(Py_UNICODE, Py_UCS2,
1844                                u, u + size, PyUnicode_2BYTE_DATA(unicode));
1845#endif
1846        break;
1847    case PyUnicode_4BYTE_KIND:
1848#if SIZEOF_WCHAR_T == 2
1849        /* This is the only case which has to process surrogates, thus
1850           a simple copy loop is not enough and we need a function. */
1851        unicode_convert_wchar_to_ucs4(u, u + size, unicode);
1852#else
1853        assert(num_surrogates == 0);
1854        Py_MEMCPY(PyUnicode_4BYTE_DATA(unicode), u, size * 4);
1855#endif
1856        break;
1857    default:
1858        assert(0 && "Impossible state");
1859    }
1860
1861    return unicode_result(unicode);
1862}
1863
1864PyObject *
1865PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
1866{
1867    if (size < 0) {
1868        PyErr_SetString(PyExc_SystemError,
1869                        "Negative size passed to PyUnicode_FromStringAndSize");
1870        return NULL;
1871    }
1872    if (u != NULL)
1873        return PyUnicode_DecodeUTF8Stateful(u, size, NULL, NULL);
1874    else
1875        return (PyObject *)_PyUnicode_New(size);
1876}
1877
1878PyObject *
1879PyUnicode_FromString(const char *u)
1880{
1881    size_t size = strlen(u);
1882    if (size > PY_SSIZE_T_MAX) {
1883        PyErr_SetString(PyExc_OverflowError, "input too long");
1884        return NULL;
1885    }
1886    return PyUnicode_DecodeUTF8Stateful(u, (Py_ssize_t)size, NULL, NULL);
1887}
1888
1889PyObject *
1890_PyUnicode_FromId(_Py_Identifier *id)
1891{
1892    if (!id->object) {
1893        id->object = PyUnicode_DecodeUTF8Stateful(id->string,
1894                                                  strlen(id->string),
1895                                                  NULL, NULL);
1896        if (!id->object)
1897            return NULL;
1898        PyUnicode_InternInPlace(&id->object);
1899        assert(!id->next);
1900        id->next = static_strings;
1901        static_strings = id;
1902    }
1903    return id->object;
1904}
1905
1906void
1907_PyUnicode_ClearStaticStrings()
1908{
1909    _Py_Identifier *tmp, *s = static_strings;
1910    while (s) {
1911        Py_CLEAR(s->object);
1912        tmp = s->next;
1913        s->next = NULL;
1914        s = tmp;
1915    }
1916    static_strings = NULL;
1917}
1918
1919/* Internal function, doesn't check maximum character */
1920
1921PyObject*
1922_PyUnicode_FromASCII(const char *buffer, Py_ssize_t size)
1923{
1924    const unsigned char *s = (const unsigned char *)buffer;
1925    PyObject *unicode;
1926    if (size == 1) {
1927#ifdef Py_DEBUG
1928        assert((unsigned char)s[0] < 128);
1929#endif
1930        return get_latin1_char(s[0]);
1931    }
1932    unicode = PyUnicode_New(size, 127);
1933    if (!unicode)
1934        return NULL;
1935    memcpy(PyUnicode_1BYTE_DATA(unicode), s, size);
1936    assert(_PyUnicode_CheckConsistency(unicode, 1));
1937    return unicode;
1938}
1939
1940static Py_UCS4
1941kind_maxchar_limit(unsigned int kind)
1942{
1943    switch (kind) {
1944    case PyUnicode_1BYTE_KIND:
1945        return 0x80;
1946    case PyUnicode_2BYTE_KIND:
1947        return 0x100;
1948    case PyUnicode_4BYTE_KIND:
1949        return 0x10000;
1950    default:
1951        assert(0 && "invalid kind");
1952        return MAX_UNICODE;
1953    }
1954}
1955
1956Py_LOCAL_INLINE(Py_UCS4)
1957align_maxchar(Py_UCS4 maxchar)
1958{
1959    if (maxchar <= 127)
1960        return 127;
1961    else if (maxchar <= 255)
1962        return 255;
1963    else if (maxchar <= 65535)
1964        return 65535;
1965    else
1966        return MAX_UNICODE;
1967}
1968
1969static PyObject*
1970_PyUnicode_FromUCS1(const Py_UCS1* u, Py_ssize_t size)
1971{
1972    PyObject *res;
1973    unsigned char max_char;
1974
1975    if (size == 0)
1976        _Py_RETURN_UNICODE_EMPTY();
1977    assert(size > 0);
1978    if (size == 1)
1979        return get_latin1_char(u[0]);
1980
1981    max_char = ucs1lib_find_max_char(u, u + size);
1982    res = PyUnicode_New(size, max_char);
1983    if (!res)
1984        return NULL;
1985    memcpy(PyUnicode_1BYTE_DATA(res), u, size);
1986    assert(_PyUnicode_CheckConsistency(res, 1));
1987    return res;
1988}
1989
1990static PyObject*
1991_PyUnicode_FromUCS2(const Py_UCS2 *u, Py_ssize_t size)
1992{
1993    PyObject *res;
1994    Py_UCS2 max_char;
1995
1996    if (size == 0)
1997        _Py_RETURN_UNICODE_EMPTY();
1998    assert(size > 0);
1999    if (size == 1)
2000        return unicode_char(u[0]);
2001
2002    max_char = ucs2lib_find_max_char(u, u + size);
2003    res = PyUnicode_New(size, max_char);
2004    if (!res)
2005        return NULL;
2006    if (max_char >= 256)
2007        memcpy(PyUnicode_2BYTE_DATA(res), u, sizeof(Py_UCS2)*size);
2008    else {
2009        _PyUnicode_CONVERT_BYTES(
2010            Py_UCS2, Py_UCS1, u, u + size, PyUnicode_1BYTE_DATA(res));
2011    }
2012    assert(_PyUnicode_CheckConsistency(res, 1));
2013    return res;
2014}
2015
2016static PyObject*
2017_PyUnicode_FromUCS4(const Py_UCS4 *u, Py_ssize_t size)
2018{
2019    PyObject *res;
2020    Py_UCS4 max_char;
2021
2022    if (size == 0)
2023        _Py_RETURN_UNICODE_EMPTY();
2024    assert(size > 0);
2025    if (size == 1)
2026        return unicode_char(u[0]);
2027
2028    max_char = ucs4lib_find_max_char(u, u + size);
2029    res = PyUnicode_New(size, max_char);
2030    if (!res)
2031        return NULL;
2032    if (max_char < 256)
2033        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, u, u + size,
2034                                 PyUnicode_1BYTE_DATA(res));
2035    else if (max_char < 0x10000)
2036        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, u, u + size,
2037                                 PyUnicode_2BYTE_DATA(res));
2038    else
2039        memcpy(PyUnicode_4BYTE_DATA(res), u, sizeof(Py_UCS4)*size);
2040    assert(_PyUnicode_CheckConsistency(res, 1));
2041    return res;
2042}
2043
2044PyObject*
2045PyUnicode_FromKindAndData(int kind, const void *buffer, Py_ssize_t size)
2046{
2047    if (size < 0) {
2048        PyErr_SetString(PyExc_ValueError, "size must be positive");
2049        return NULL;
2050    }
2051    switch (kind) {
2052    case PyUnicode_1BYTE_KIND:
2053        return _PyUnicode_FromUCS1(buffer, size);
2054    case PyUnicode_2BYTE_KIND:
2055        return _PyUnicode_FromUCS2(buffer, size);
2056    case PyUnicode_4BYTE_KIND:
2057        return _PyUnicode_FromUCS4(buffer, size);
2058    default:
2059        PyErr_SetString(PyExc_SystemError, "invalid kind");
2060        return NULL;
2061    }
2062}
2063
2064Py_UCS4
2065_PyUnicode_FindMaxChar(PyObject *unicode, Py_ssize_t start, Py_ssize_t end)
2066{
2067    enum PyUnicode_Kind kind;
2068    void *startptr, *endptr;
2069
2070    assert(PyUnicode_IS_READY(unicode));
2071    assert(0 <= start);
2072    assert(end <= PyUnicode_GET_LENGTH(unicode));
2073    assert(start <= end);
2074
2075    if (start == 0 && end == PyUnicode_GET_LENGTH(unicode))
2076        return PyUnicode_MAX_CHAR_VALUE(unicode);
2077
2078    if (start == end)
2079        return 127;
2080
2081    if (PyUnicode_IS_ASCII(unicode))
2082        return 127;
2083
2084    kind = PyUnicode_KIND(unicode);
2085    startptr = PyUnicode_DATA(unicode);
2086    endptr = (char *)startptr + end * kind;
2087    startptr = (char *)startptr + start * kind;
2088    switch(kind) {
2089    case PyUnicode_1BYTE_KIND:
2090        return ucs1lib_find_max_char(startptr, endptr);
2091    case PyUnicode_2BYTE_KIND:
2092        return ucs2lib_find_max_char(startptr, endptr);
2093    case PyUnicode_4BYTE_KIND:
2094        return ucs4lib_find_max_char(startptr, endptr);
2095    default:
2096        assert(0);
2097        return 0;
2098    }
2099}
2100
2101/* Ensure that a string uses the most efficient storage, if it is not the
2102   case: create a new string with of the right kind. Write NULL into *p_unicode
2103   on error. */
2104static void
2105unicode_adjust_maxchar(PyObject **p_unicode)
2106{
2107    PyObject *unicode, *copy;
2108    Py_UCS4 max_char;
2109    Py_ssize_t len;
2110    unsigned int kind;
2111
2112    assert(p_unicode != NULL);
2113    unicode = *p_unicode;
2114    assert(PyUnicode_IS_READY(unicode));
2115    if (PyUnicode_IS_ASCII(unicode))
2116        return;
2117
2118    len = PyUnicode_GET_LENGTH(unicode);
2119    kind = PyUnicode_KIND(unicode);
2120    if (kind == PyUnicode_1BYTE_KIND) {
2121        const Py_UCS1 *u = PyUnicode_1BYTE_DATA(unicode);
2122        max_char = ucs1lib_find_max_char(u, u + len);
2123        if (max_char >= 128)
2124            return;
2125    }
2126    else if (kind == PyUnicode_2BYTE_KIND) {
2127        const Py_UCS2 *u = PyUnicode_2BYTE_DATA(unicode);
2128        max_char = ucs2lib_find_max_char(u, u + len);
2129        if (max_char >= 256)
2130            return;
2131    }
2132    else {
2133        const Py_UCS4 *u = PyUnicode_4BYTE_DATA(unicode);
2134        assert(kind == PyUnicode_4BYTE_KIND);
2135        max_char = ucs4lib_find_max_char(u, u + len);
2136        if (max_char >= 0x10000)
2137            return;
2138    }
2139    copy = PyUnicode_New(len, max_char);
2140    if (copy != NULL)
2141        _PyUnicode_FastCopyCharacters(copy, 0, unicode, 0, len);
2142    Py_DECREF(unicode);
2143    *p_unicode = copy;
2144}
2145
2146PyObject*
2147_PyUnicode_Copy(PyObject *unicode)
2148{
2149    Py_ssize_t length;
2150    PyObject *copy;
2151
2152    if (!PyUnicode_Check(unicode)) {
2153        PyErr_BadInternalCall();
2154        return NULL;
2155    }
2156    if (PyUnicode_READY(unicode) == -1)
2157        return NULL;
2158
2159    length = PyUnicode_GET_LENGTH(unicode);
2160    copy = PyUnicode_New(length, PyUnicode_MAX_CHAR_VALUE(unicode));
2161    if (!copy)
2162        return NULL;
2163    assert(PyUnicode_KIND(copy) == PyUnicode_KIND(unicode));
2164
2165    Py_MEMCPY(PyUnicode_DATA(copy), PyUnicode_DATA(unicode),
2166              length * PyUnicode_KIND(unicode));
2167    assert(_PyUnicode_CheckConsistency(copy, 1));
2168    return copy;
2169}
2170
2171
2172/* Widen Unicode objects to larger buffers. Don't write terminating null
2173   character. Return NULL on error. */
2174
2175void*
2176_PyUnicode_AsKind(PyObject *s, unsigned int kind)
2177{
2178    Py_ssize_t len;
2179    void *result;
2180    unsigned int skind;
2181
2182    if (PyUnicode_READY(s) == -1)
2183        return NULL;
2184
2185    len = PyUnicode_GET_LENGTH(s);
2186    skind = PyUnicode_KIND(s);
2187    if (skind >= kind) {
2188        PyErr_SetString(PyExc_SystemError, "invalid widening attempt");
2189        return NULL;
2190    }
2191    switch (kind) {
2192    case PyUnicode_2BYTE_KIND:
2193        result = PyMem_New(Py_UCS2, len);
2194        if (!result)
2195            return PyErr_NoMemory();
2196        assert(skind == PyUnicode_1BYTE_KIND);
2197        _PyUnicode_CONVERT_BYTES(
2198            Py_UCS1, Py_UCS2,
2199            PyUnicode_1BYTE_DATA(s),
2200            PyUnicode_1BYTE_DATA(s) + len,
2201            result);
2202        return result;
2203    case PyUnicode_4BYTE_KIND:
2204        result = PyMem_New(Py_UCS4, len);
2205        if (!result)
2206            return PyErr_NoMemory();
2207        if (skind == PyUnicode_2BYTE_KIND) {
2208            _PyUnicode_CONVERT_BYTES(
2209                Py_UCS2, Py_UCS4,
2210                PyUnicode_2BYTE_DATA(s),
2211                PyUnicode_2BYTE_DATA(s) + len,
2212                result);
2213        }
2214        else {
2215            assert(skind == PyUnicode_1BYTE_KIND);
2216            _PyUnicode_CONVERT_BYTES(
2217                Py_UCS1, Py_UCS4,
2218                PyUnicode_1BYTE_DATA(s),
2219                PyUnicode_1BYTE_DATA(s) + len,
2220                result);
2221        }
2222        return result;
2223    default:
2224        break;
2225    }
2226    PyErr_SetString(PyExc_SystemError, "invalid kind");
2227    return NULL;
2228}
2229
2230static Py_UCS4*
2231as_ucs4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2232        int copy_null)
2233{
2234    int kind;
2235    void *data;
2236    Py_ssize_t len, targetlen;
2237    if (PyUnicode_READY(string) == -1)
2238        return NULL;
2239    kind = PyUnicode_KIND(string);
2240    data = PyUnicode_DATA(string);
2241    len = PyUnicode_GET_LENGTH(string);
2242    targetlen = len;
2243    if (copy_null)
2244        targetlen++;
2245    if (!target) {
2246        target = PyMem_New(Py_UCS4, targetlen);
2247        if (!target) {
2248            PyErr_NoMemory();
2249            return NULL;
2250        }
2251    }
2252    else {
2253        if (targetsize < targetlen) {
2254            PyErr_Format(PyExc_SystemError,
2255                         "string is longer than the buffer");
2256            if (copy_null && 0 < targetsize)
2257                target[0] = 0;
2258            return NULL;
2259        }
2260    }
2261    if (kind == PyUnicode_1BYTE_KIND) {
2262        Py_UCS1 *start = (Py_UCS1 *) data;
2263        _PyUnicode_CONVERT_BYTES(Py_UCS1, Py_UCS4, start, start + len, target);
2264    }
2265    else if (kind == PyUnicode_2BYTE_KIND) {
2266        Py_UCS2 *start = (Py_UCS2 *) data;
2267        _PyUnicode_CONVERT_BYTES(Py_UCS2, Py_UCS4, start, start + len, target);
2268    }
2269    else {
2270        assert(kind == PyUnicode_4BYTE_KIND);
2271        Py_MEMCPY(target, data, len * sizeof(Py_UCS4));
2272    }
2273    if (copy_null)
2274        target[len] = 0;
2275    return target;
2276}
2277
2278Py_UCS4*
2279PyUnicode_AsUCS4(PyObject *string, Py_UCS4 *target, Py_ssize_t targetsize,
2280                 int copy_null)
2281{
2282    if (target == NULL || targetsize < 0) {
2283        PyErr_BadInternalCall();
2284        return NULL;
2285    }
2286    return as_ucs4(string, target, targetsize, copy_null);
2287}
2288
2289Py_UCS4*
2290PyUnicode_AsUCS4Copy(PyObject *string)
2291{
2292    return as_ucs4(string, NULL, 0, 1);
2293}
2294
2295#ifdef HAVE_WCHAR_H
2296
2297PyObject *
2298PyUnicode_FromWideChar(const wchar_t *w, Py_ssize_t size)
2299{
2300    if (w == NULL) {
2301        if (size == 0)
2302            _Py_RETURN_UNICODE_EMPTY();
2303        PyErr_BadInternalCall();
2304        return NULL;
2305    }
2306
2307    if (size == -1) {
2308        size = wcslen(w);
2309    }
2310
2311    return PyUnicode_FromUnicode(w, size);
2312}
2313
2314#endif /* HAVE_WCHAR_H */
2315
2316static void
2317makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
2318        char c)
2319{
2320    *fmt++ = '%';
2321    if (longflag)
2322        *fmt++ = 'l';
2323    else if (longlongflag) {
2324        /* longlongflag should only ever be nonzero on machines with
2325           HAVE_LONG_LONG defined */
2326#ifdef HAVE_LONG_LONG
2327        char *f = PY_FORMAT_LONG_LONG;
2328        while (*f)
2329            *fmt++ = *f++;
2330#else
2331        /* we shouldn't ever get here */
2332        assert(0);
2333        *fmt++ = 'l';
2334#endif
2335    }
2336    else if (size_tflag) {
2337        char *f = PY_FORMAT_SIZE_T;
2338        while (*f)
2339            *fmt++ = *f++;
2340    }
2341    *fmt++ = c;
2342    *fmt = '\0';
2343}
2344
/* maximum number of characters required for output of %lld or %p.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256).
   In the expression below, 1 + (SIZEOF_LONG_LONG*53-1) / 22 is the
   integer form of ceil(SIZEOF_LONG_LONG*53/22), i.e. the digit count;
   the remaining 1 is the extra character mentioned above. */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
2349
2350static int
2351unicode_fromformat_write_str(_PyUnicodeWriter *writer, PyObject *str,
2352                             Py_ssize_t width, Py_ssize_t precision)
2353{
2354    Py_ssize_t length, fill, arglen;
2355    Py_UCS4 maxchar;
2356
2357    if (PyUnicode_READY(str) == -1)
2358        return -1;
2359
2360    length = PyUnicode_GET_LENGTH(str);
2361    if ((precision == -1 || precision >= length)
2362        && width <= length)
2363        return _PyUnicodeWriter_WriteStr(writer, str);
2364
2365    if (precision != -1)
2366        length = Py_MIN(precision, length);
2367
2368    arglen = Py_MAX(length, width);
2369    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
2370        maxchar = _PyUnicode_FindMaxChar(str, 0, length);
2371    else
2372        maxchar = writer->maxchar;
2373
2374    if (_PyUnicodeWriter_Prepare(writer, arglen, maxchar) == -1)
2375        return -1;
2376
2377    if (width > length) {
2378        fill = width - length;
2379        if (PyUnicode_Fill(writer->buffer, writer->pos, fill, ' ') == -1)
2380            return -1;
2381        writer->pos += fill;
2382    }
2383
2384    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
2385                                  str, 0, length);
2386    writer->pos += length;
2387    return 0;
2388}
2389
2390static int
2391unicode_fromformat_write_cstr(_PyUnicodeWriter *writer, const char *str,
2392                              Py_ssize_t width, Py_ssize_t precision)
2393{
2394    /* UTF-8 */
2395    Py_ssize_t length;
2396    PyObject *unicode;
2397    int res;
2398
2399    length = strlen(str);
2400    if (precision != -1)
2401        length = Py_MIN(length, precision);
2402    unicode = PyUnicode_DecodeUTF8Stateful(str, length, "replace", NULL);
2403    if (unicode == NULL)
2404        return -1;
2405
2406    res = unicode_fromformat_write_str(writer, unicode, width, -1);
2407    Py_DECREF(unicode);
2408    return res;
2409}
2410
2411static const char*
2412unicode_fromformat_arg(_PyUnicodeWriter *writer,
2413                       const char *f, va_list *vargs)
2414{
2415    const char *p;
2416    Py_ssize_t len;
2417    int zeropad;
2418    Py_ssize_t width;
2419    Py_ssize_t precision;
2420    int longflag;
2421    int longlongflag;
2422    int size_tflag;
2423    Py_ssize_t fill;
2424
2425    p = f;
2426    f++;
2427    zeropad = 0;
2428    if (*f == '0') {
2429        zeropad = 1;
2430        f++;
2431    }
2432
2433    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
2434    width = -1;
2435    if (Py_ISDIGIT((unsigned)*f)) {
2436        width = *f - '0';
2437        f++;
2438        while (Py_ISDIGIT((unsigned)*f)) {
2439            if (width > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2440                PyErr_SetString(PyExc_ValueError,
2441                                "width too big");
2442                return NULL;
2443            }
2444            width = (width * 10) + (*f - '0');
2445            f++;
2446        }
2447    }
2448    precision = -1;
2449    if (*f == '.') {
2450        f++;
2451        if (Py_ISDIGIT((unsigned)*f)) {
2452            precision = (*f - '0');
2453            f++;
2454            while (Py_ISDIGIT((unsigned)*f)) {
2455                if (precision > (PY_SSIZE_T_MAX - ((int)*f - '0')) / 10) {
2456                    PyErr_SetString(PyExc_ValueError,
2457                                    "precision too big");
2458                    return NULL;
2459                }
2460                precision = (precision * 10) + (*f - '0');
2461                f++;
2462            }
2463        }
2464        if (*f == '%') {
2465            /* "%.3%s" => f points to "3" */
2466            f--;
2467        }
2468    }
2469    if (*f == '\0') {
2470        /* bogus format "%.123" => go backward, f points to "3" */
2471        f--;
2472    }
2473
2474    /* Handle %ld, %lu, %lld and %llu. */
2475    longflag = 0;
2476    longlongflag = 0;
2477    size_tflag = 0;
2478    if (*f == 'l') {
2479        if (f[1] == 'd' || f[1] == 'u' || f[1] == 'i') {
2480            longflag = 1;
2481            ++f;
2482        }
2483#ifdef HAVE_LONG_LONG
2484        else if (f[1] == 'l' &&
2485                 (f[2] == 'd' || f[2] == 'u' || f[2] == 'i')) {
2486            longlongflag = 1;
2487            f += 2;
2488        }
2489#endif
2490    }
2491    /* handle the size_t flag. */
2492    else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u' || f[1] == 'i')) {
2493        size_tflag = 1;
2494        ++f;
2495    }
2496
2497    if (f[1] == '\0')
2498        writer->overallocate = 0;
2499
2500    switch (*f) {
2501    case 'c':
2502    {
2503        int ordinal = va_arg(*vargs, int);
2504        if (ordinal < 0 || ordinal > MAX_UNICODE) {
2505            PyErr_SetString(PyExc_OverflowError,
2506                            "character argument not in range(0x110000)");
2507            return NULL;
2508        }
2509        if (_PyUnicodeWriter_WriteCharInline(writer, ordinal) < 0)
2510            return NULL;
2511        break;
2512    }
2513
2514    case 'i':
2515    case 'd':
2516    case 'u':
2517    case 'x':
2518    {
2519        /* used by sprintf */
2520        char fmt[10]; /* should be enough for "%0lld\0" */
2521        char buffer[MAX_LONG_LONG_CHARS];
2522        Py_ssize_t arglen;
2523
2524        if (*f == 'u') {
2525            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2526
2527            if (longflag)
2528                len = sprintf(buffer, fmt,
2529                        va_arg(*vargs, unsigned long));
2530#ifdef HAVE_LONG_LONG
2531            else if (longlongflag)
2532                len = sprintf(buffer, fmt,
2533                        va_arg(*vargs, unsigned PY_LONG_LONG));
2534#endif
2535            else if (size_tflag)
2536                len = sprintf(buffer, fmt,
2537                        va_arg(*vargs, size_t));
2538            else
2539                len = sprintf(buffer, fmt,
2540                        va_arg(*vargs, unsigned int));
2541        }
2542        else if (*f == 'x') {
2543            makefmt(fmt, 0, 0, 0, 'x');
2544            len = sprintf(buffer, fmt, va_arg(*vargs, int));
2545        }
2546        else {
2547            makefmt(fmt, longflag, longlongflag, size_tflag, *f);
2548
2549            if (longflag)
2550                len = sprintf(buffer, fmt,
2551                        va_arg(*vargs, long));
2552#ifdef HAVE_LONG_LONG
2553            else if (longlongflag)
2554                len = sprintf(buffer, fmt,
2555                        va_arg(*vargs, PY_LONG_LONG));
2556#endif
2557            else if (size_tflag)
2558                len = sprintf(buffer, fmt,
2559                        va_arg(*vargs, Py_ssize_t));
2560            else
2561                len = sprintf(buffer, fmt,
2562                        va_arg(*vargs, int));
2563        }
2564        assert(len >= 0);
2565
2566        if (precision < len)
2567            precision = len;
2568
2569        arglen = Py_MAX(precision, width);
2570        if (_PyUnicodeWriter_Prepare(writer, arglen, 127) == -1)
2571            return NULL;
2572
2573        if (width > precision) {
2574            Py_UCS4 fillchar;
2575            fill = width - precision;
2576            fillchar = zeropad?'0':' ';
2577            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, fillchar) == -1)
2578                return NULL;
2579            writer->pos += fill;
2580        }
2581        if (precision > len) {
2582            fill = precision - len;
2583            if (PyUnicode_Fill(writer->buffer, writer->pos, fill, '0') == -1)
2584                return NULL;
2585            writer->pos += fill;
2586        }
2587
2588        if (_PyUnicodeWriter_WriteASCIIString(writer, buffer, len) < 0)
2589            return NULL;
2590        break;
2591    }
2592
2593    case 'p':
2594    {
2595        char number[MAX_LONG_LONG_CHARS];
2596
2597        len = sprintf(number, "%p", va_arg(*vargs, void*));
2598        assert(len >= 0);
2599
2600        /* %p is ill-defined:  ensure leading 0x. */
2601        if (number[1] == 'X')
2602            number[1] = 'x';
2603        else if (number[1] != 'x') {
2604            memmove(number + 2, number,
2605                    strlen(number) + 1);
2606            number[0] = '0';
2607            number[1] = 'x';
2608            len += 2;
2609        }
2610
2611        if (_PyUnicodeWriter_WriteASCIIString(writer, number, len) < 0)
2612            return NULL;
2613        break;
2614    }
2615
2616    case 's':
2617    {
2618        /* UTF-8 */
2619        const char *s = va_arg(*vargs, const char*);
2620        if (unicode_fromformat_write_cstr(writer, s, width, precision) < 0)
2621            return NULL;
2622        break;
2623    }
2624
2625    case 'U':
2626    {
2627        PyObject *obj = va_arg(*vargs, PyObject *);
2628        assert(obj && _PyUnicode_CHECK(obj));
2629
2630        if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2631            return NULL;
2632        break;
2633    }
2634
2635    case 'V':
2636    {
2637        PyObject *obj = va_arg(*vargs, PyObject *);
2638        const char *str = va_arg(*vargs, const char *);
2639        if (obj) {
2640            assert(_PyUnicode_CHECK(obj));
2641            if (unicode_fromformat_write_str(writer, obj, width, precision) == -1)
2642                return NULL;
2643        }
2644        else {
2645            assert(str != NULL);
2646            if (unicode_fromformat_write_cstr(writer, str, width, precision) < 0)
2647                return NULL;
2648        }
2649        break;
2650    }
2651
2652    case 'S':
2653    {
2654        PyObject *obj = va_arg(*vargs, PyObject *);
2655        PyObject *str;
2656        assert(obj);
2657        str = PyObject_Str(obj);
2658        if (!str)
2659            return NULL;
2660        if (unicode_fromformat_write_str(writer, str, width, precision) == -1) {
2661            Py_DECREF(str);
2662            return NULL;
2663        }
2664        Py_DECREF(str);
2665        break;
2666    }
2667
2668    case 'R':
2669    {
2670        PyObject *obj = va_arg(*vargs, PyObject *);
2671        PyObject *repr;
2672        assert(obj);
2673        repr = PyObject_Repr(obj);
2674        if (!repr)
2675            return NULL;
2676        if (unicode_fromformat_write_str(writer, repr, width, precision) == -1) {
2677            Py_DECREF(repr);
2678            return NULL;
2679        }
2680        Py_DECREF(repr);
2681        break;
2682    }
2683
2684    case 'A':
2685    {
2686        PyObject *obj = va_arg(*vargs, PyObject *);
2687        PyObject *ascii;
2688        assert(obj);
2689        ascii = PyObject_ASCII(obj);
2690        if (!ascii)
2691            return NULL;
2692        if (unicode_fromformat_write_str(writer, ascii, width, precision) == -1) {
2693            Py_DECREF(ascii);
2694            return NULL;
2695        }
2696        Py_DECREF(ascii);
2697        break;
2698    }
2699
2700    case '%':
2701        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
2702            return NULL;
2703        break;
2704
2705    default:
2706        /* if we stumble upon an unknown formatting code, copy the rest
2707           of the format string to the output string. (we cannot just
2708           skip the code, since there's no way to know what's in the
2709           argument list) */
2710        len = strlen(p);
2711        if (_PyUnicodeWriter_WriteLatin1String(writer, p, len) == -1)
2712            return NULL;
2713        f = p+len;
2714        return f;
2715    }
2716
2717    f++;
2718    return f;
2719}
2720
2721PyObject *
2722PyUnicode_FromFormatV(const char *format, va_list vargs)
2723{
2724    va_list vargs2;
2725    const char *f;
2726    _PyUnicodeWriter writer;
2727
2728    _PyUnicodeWriter_Init(&writer);
2729    writer.min_length = strlen(format) + 100;
2730    writer.overallocate = 1;
2731
2732    /* va_list may be an array (of 1 item) on some platforms (ex: AMD64).
2733       Copy it to be able to pass a reference to a subfunction. */
2734    Py_VA_COPY(vargs2, vargs);
2735
2736    for (f = format; *f; ) {
2737        if (*f == '%') {
2738            f = unicode_fromformat_arg(&writer, f, &vargs2);
2739            if (f == NULL)
2740                goto fail;
2741        }
2742        else {
2743            const char *p;
2744            Py_ssize_t len;
2745
2746            p = f;
2747            do
2748            {
2749                if ((unsigned char)*p > 127) {
2750                    PyErr_Format(PyExc_ValueError,
2751                        "PyUnicode_FromFormatV() expects an ASCII-encoded format "
2752                        "string, got a non-ASCII byte: 0x%02x",
2753                        (unsigned char)*p);
2754                    return NULL;
2755                }
2756                p++;
2757            }
2758            while (*p != '\0' && *p != '%');
2759            len = p - f;
2760
2761            if (*p == '\0')
2762                writer.overallocate = 0;
2763
2764            if (_PyUnicodeWriter_WriteASCIIString(&writer, f, len) < 0)
2765                goto fail;
2766
2767            f = p;
2768        }
2769    }
2770    return _PyUnicodeWriter_Finish(&writer);
2771
2772  fail:
2773    _PyUnicodeWriter_Dealloc(&writer);
2774    return NULL;
2775}
2776
2777PyObject *
2778PyUnicode_FromFormat(const char *format, ...)
2779{
2780    PyObject* ret;
2781    va_list vargs;
2782
2783#ifdef HAVE_STDARG_PROTOTYPES
2784    va_start(vargs, format);
2785#else
2786    va_start(vargs);
2787#endif
2788    ret = PyUnicode_FromFormatV(format, vargs);
2789    va_end(vargs);
2790    return ret;
2791}
2792
2793#ifdef HAVE_WCHAR_H
2794
2795/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
2796   convert a Unicode object to a wide character string.
2797
2798   - If w is NULL: return the number of wide characters (including the null
2799     character) required to convert the unicode object. Ignore size argument.
2800
2801   - Otherwise: return the number of wide characters (excluding the null
2802     character) written into w. Write at most size wide characters (including
2803     the null character). */
2804static Py_ssize_t
2805unicode_aswidechar(PyObject *unicode,
2806                   wchar_t *w,
2807                   Py_ssize_t size)
2808{
2809    Py_ssize_t res;
2810    const wchar_t *wstr;
2811
2812    wstr = PyUnicode_AsUnicodeAndSize(unicode, &res);
2813    if (wstr == NULL)
2814        return -1;
2815
2816    if (w != NULL) {
2817        if (size > res)
2818            size = res + 1;
2819        else
2820            res = size;
2821        Py_MEMCPY(w, wstr, size * sizeof(wchar_t));
2822        return res;
2823    }
2824    else
2825        return res + 1;
2826}
2827
2828Py_ssize_t
2829PyUnicode_AsWideChar(PyObject *unicode,
2830                     wchar_t *w,
2831                     Py_ssize_t size)
2832{
2833    if (unicode == NULL) {
2834        PyErr_BadInternalCall();
2835        return -1;
2836    }
2837    return unicode_aswidechar(unicode, w, size);
2838}
2839
2840wchar_t*
2841PyUnicode_AsWideCharString(PyObject *unicode,
2842                           Py_ssize_t *size)
2843{
2844    wchar_t* buffer;
2845    Py_ssize_t buflen;
2846
2847    if (unicode == NULL) {
2848        PyErr_BadInternalCall();
2849        return NULL;
2850    }
2851
2852    buflen = unicode_aswidechar(unicode, NULL, 0);
2853    if (buflen == -1)
2854        return NULL;
2855    buffer = PyMem_NEW(wchar_t, buflen);
2856    if (buffer == NULL) {
2857        PyErr_NoMemory();
2858        return NULL;
2859    }
2860    buflen = unicode_aswidechar(unicode, buffer, buflen);
2861    if (buflen == -1) {
2862        PyMem_FREE(buffer);
2863        return NULL;
2864    }
2865    if (size != NULL)
2866        *size = buflen;
2867    return buffer;
2868}
2869
2870#endif /* HAVE_WCHAR_H */
2871
2872PyObject *
2873PyUnicode_FromOrdinal(int ordinal)
2874{
2875    if (ordinal < 0 || ordinal > MAX_UNICODE) {
2876        PyErr_SetString(PyExc_ValueError,
2877                        "chr() arg not in range(0x110000)");
2878        return NULL;
2879    }
2880
2881    return unicode_char((Py_UCS4)ordinal);
2882}
2883
2884PyObject *
2885PyUnicode_FromObject(PyObject *obj)
2886{
2887    /* XXX Perhaps we should make this API an alias of
2888       PyObject_Str() instead ?! */
2889    if (PyUnicode_CheckExact(obj)) {
2890        if (PyUnicode_READY(obj) == -1)
2891            return NULL;
2892        Py_INCREF(obj);
2893        return obj;
2894    }
2895    if (PyUnicode_Check(obj)) {
2896        /* For a Unicode subtype that's not a Unicode object,
2897           return a true Unicode object with the same data. */
2898        return _PyUnicode_Copy(obj);
2899    }
2900    PyErr_Format(PyExc_TypeError,
2901                 "Can't convert '%.100s' object to str implicitly",
2902                 Py_TYPE(obj)->tp_name);
2903    return NULL;
2904}
2905
2906PyObject *
2907PyUnicode_FromEncodedObject(PyObject *obj,
2908                            const char *encoding,
2909                            const char *errors)
2910{
2911    Py_buffer buffer;
2912    PyObject *v;
2913
2914    if (obj == NULL) {
2915        PyErr_BadInternalCall();
2916        return NULL;
2917    }
2918
2919    /* Decoding bytes objects is the most common case and should be fast */
2920    if (PyBytes_Check(obj)) {
2921        if (PyBytes_GET_SIZE(obj) == 0)
2922            _Py_RETURN_UNICODE_EMPTY();
2923        v = PyUnicode_Decode(
2924                PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
2925                encoding, errors);
2926        return v;
2927    }
2928
2929    if (PyUnicode_Check(obj)) {
2930        PyErr_SetString(PyExc_TypeError,
2931                        "decoding str is not supported");
2932        return NULL;
2933    }
2934
2935    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
2936    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
2937        PyErr_Format(PyExc_TypeError,
2938                     "coercing to str: need a bytes-like object, %.80s found",
2939                     Py_TYPE(obj)->tp_name);
2940        return NULL;
2941    }
2942
2943    if (buffer.len == 0) {
2944        PyBuffer_Release(&buffer);
2945        _Py_RETURN_UNICODE_EMPTY();
2946    }
2947
2948    v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
2949    PyBuffer_Release(&buffer);
2950    return v;
2951}
2952
/* Write a normalized copy of encoding into lower: upper case letters are
   lowered and '_' becomes '-', so that e.g. UTF_8 is recognized.  A NULL
   encoding yields "utf-8".  Return 1 on success, 0 when the result does not
   fit into lower_len bytes (terminating null included). */
int
_Py_normalize_encoding(const char *encoding,
                       char *lower,
                       size_t lower_len)
{
    const char *src;
    char *dst;
    char *dst_limit;

    if (encoding == NULL) {
        /* 6 == strlen("utf-8") + 1 */
        if (lower_len < 6)
            return 0;
        strcpy(lower, "utf-8");
        return 1;
    }

    dst = lower;
    /* Last slot is reserved for the terminating null. */
    dst_limit = &lower[lower_len - 1];
    for (src = encoding; *src != '\0'; src++) {
        char c = *src;

        if (dst == dst_limit)
            return 0;
        if (Py_ISUPPER(c))
            c = Py_TOLOWER(c);
        else if (c == '_')
            c = '-';
        *dst++ = c;
    }
    *dst = '\0';
    return 1;
}
2992
2993PyObject *
2994PyUnicode_Decode(const char *s,
2995                 Py_ssize_t size,
2996                 const char *encoding,
2997                 const char *errors)
2998{
2999    PyObject *buffer = NULL, *unicode;
3000    Py_buffer info;
3001    char lower[11];  /* Enough for any encoding shortcut */
3002
3003    /* Shortcuts for common default encodings */
3004    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3005        if ((strcmp(lower, "utf-8") == 0) ||
3006            (strcmp(lower, "utf8") == 0))
3007            return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
3008        else if ((strcmp(lower, "latin-1") == 0) ||
3009                 (strcmp(lower, "latin1") == 0) ||
3010                 (strcmp(lower, "iso-8859-1") == 0) ||
3011                 (strcmp(lower, "iso8859-1") == 0))
3012            return PyUnicode_DecodeLatin1(s, size, errors);
3013#ifdef HAVE_MBCS
3014        else if (strcmp(lower, "mbcs") == 0)
3015            return PyUnicode_DecodeMBCS(s, size, errors);
3016#endif
3017        else if (strcmp(lower, "ascii") == 0)
3018            return PyUnicode_DecodeASCII(s, size, errors);
3019        else if (strcmp(lower, "utf-16") == 0)
3020            return PyUnicode_DecodeUTF16(s, size, errors, 0);
3021        else if (strcmp(lower, "utf-32") == 0)
3022            return PyUnicode_DecodeUTF32(s, size, errors, 0);
3023    }
3024
3025    /* Decode via the codec registry */
3026    buffer = NULL;
3027    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
3028        goto onError;
3029    buffer = PyMemoryView_FromBuffer(&info);
3030    if (buffer == NULL)
3031        goto onError;
3032    unicode = _PyCodec_DecodeText(buffer, encoding, errors);
3033    if (unicode == NULL)
3034        goto onError;
3035    if (!PyUnicode_Check(unicode)) {
3036        PyErr_Format(PyExc_TypeError,
3037                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3038                     "use codecs.decode() to decode to arbitrary types",
3039                     encoding,
3040                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3041        Py_DECREF(unicode);
3042        goto onError;
3043    }
3044    Py_DECREF(buffer);
3045    return unicode_result(unicode);
3046
3047  onError:
3048    Py_XDECREF(buffer);
3049    return NULL;
3050}
3051
3052PyObject *
3053PyUnicode_AsDecodedObject(PyObject *unicode,
3054                          const char *encoding,
3055                          const char *errors)
3056{
3057    PyObject *v;
3058
3059    if (!PyUnicode_Check(unicode)) {
3060        PyErr_BadArgument();
3061        goto onError;
3062    }
3063
3064    if (encoding == NULL)
3065        encoding = PyUnicode_GetDefaultEncoding();
3066
3067    /* Decode via the codec registry */
3068    v = PyCodec_Decode(unicode, encoding, errors);
3069    if (v == NULL)
3070        goto onError;
3071    return unicode_result(v);
3072
3073  onError:
3074    return NULL;
3075}
3076
3077PyObject *
3078PyUnicode_AsDecodedUnicode(PyObject *unicode,
3079                           const char *encoding,
3080                           const char *errors)
3081{
3082    PyObject *v;
3083
3084    if (!PyUnicode_Check(unicode)) {
3085        PyErr_BadArgument();
3086        goto onError;
3087    }
3088
3089    if (encoding == NULL)
3090        encoding = PyUnicode_GetDefaultEncoding();
3091
3092    /* Decode via the codec registry */
3093    v = PyCodec_Decode(unicode, encoding, errors);
3094    if (v == NULL)
3095        goto onError;
3096    if (!PyUnicode_Check(v)) {
3097        PyErr_Format(PyExc_TypeError,
3098                     "'%.400s' decoder returned '%.400s' instead of 'str'; "
3099                     "use codecs.decode() to decode to arbitrary types",
3100                     encoding,
3101                     Py_TYPE(unicode)->tp_name, Py_TYPE(unicode)->tp_name);
3102        Py_DECREF(v);
3103        goto onError;
3104    }
3105    return unicode_result(v);
3106
3107  onError:
3108    return NULL;
3109}
3110
3111PyObject *
3112PyUnicode_Encode(const Py_UNICODE *s,
3113                 Py_ssize_t size,
3114                 const char *encoding,
3115                 const char *errors)
3116{
3117    PyObject *v, *unicode;
3118
3119    unicode = PyUnicode_FromUnicode(s, size);
3120    if (unicode == NULL)
3121        return NULL;
3122    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
3123    Py_DECREF(unicode);
3124    return v;
3125}
3126
3127PyObject *
3128PyUnicode_AsEncodedObject(PyObject *unicode,
3129                          const char *encoding,
3130                          const char *errors)
3131{
3132    PyObject *v;
3133
3134    if (!PyUnicode_Check(unicode)) {
3135        PyErr_BadArgument();
3136        goto onError;
3137    }
3138
3139    if (encoding == NULL)
3140        encoding = PyUnicode_GetDefaultEncoding();
3141
3142    /* Encode via the codec registry */
3143    v = PyCodec_Encode(unicode, encoding, errors);
3144    if (v == NULL)
3145        goto onError;
3146    return v;
3147
3148  onError:
3149    return NULL;
3150}
3151
/* Locate the first character of wstr that wcstombs() cannot convert.
   The string is fed to wcstombs() one character at a time (one surrogate
   pair at a time when wchar_t is 16 bits wide); the offset of the first
   failing character, in wchar_t units, is returned.  Returns 0 when no
   failing character is found. */
static size_t
wcstombs_errorpos(const wchar_t *wstr)
{
    /* One character's worth of input followed by the terminating null. */
#if SIZEOF_WCHAR_T == 2
    wchar_t unit[3];
#else
    wchar_t unit[2];
#endif
    char scratch[MB_LEN_MAX];
    const wchar_t *cursor;

    unit[sizeof(unit) / sizeof(unit[0]) - 1] = 0;

    for (cursor = wstr; *cursor != L'\0'; ) {
        const wchar_t *unit_start = cursor;

#if SIZEOF_WCHAR_T == 2
        if (Py_UNICODE_IS_HIGH_SURROGATE(cursor[0])
            && Py_UNICODE_IS_LOW_SURROGATE(cursor[1]))
        {
            /* Keep the surrogate pair together. */
            unit[0] = cursor[0];
            unit[1] = cursor[1];
            cursor += 2;
        }
        else {
            unit[0] = cursor[0];
            unit[1] = 0;
            cursor += 1;
        }
#else
        unit[0] = cursor[0];
        cursor += 1;
#endif
        if (wcstombs(scratch, unit, sizeof(scratch)) == (size_t)-1)
            return unit_start - wstr;
    }

    /* failed to find the unencodable character */
    return 0;
}
3198
3199static int
3200locale_error_handler(const char *errors, int *surrogateescape)
3201{
3202    if (errors == NULL) {
3203        *surrogateescape = 0;
3204        return 0;
3205    }
3206
3207    if (strcmp(errors, "strict") == 0) {
3208        *surrogateescape = 0;
3209        return 0;
3210    }
3211    if (strcmp(errors, "surrogateescape") == 0) {
3212        *surrogateescape = 1;
3213        return 0;
3214    }
3215    PyErr_Format(PyExc_ValueError,
3216                 "only 'strict' and 'surrogateescape' error handlers "
3217                 "are supported, not '%s'",
3218                 errors);
3219    return -1;
3220}
3221
/* Encode a str object to bytes through the C library's wide character to
   multibyte conversion.

   errors is validated by locale_error_handler(): it selects either the
   strict path (wcstombs()) or the "surrogateescape" path
   (_Py_wchar2char()).  Strings containing an embedded null character are
   rejected with TypeError.  When a character cannot be encoded, a
   UnicodeEncodeError for the "locale" codec is built and passed to
   PyCodec_StrictErrors().

   Returns the bytes object, or NULL on failure. */
PyObject *
PyUnicode_EncodeLocale(PyObject *unicode, const char *errors)
{
    Py_ssize_t wlen, wlen2;
    wchar_t *wstr;
    PyObject *bytes = NULL;
    char *errmsg;
    PyObject *reason = NULL;
    PyObject *exc;
    size_t error_pos;
    int surrogateescape;

    if (locale_error_handler(errors, &surrogateescape) < 0)
        return NULL;

    wstr = PyUnicode_AsWideCharString(unicode, &wlen);
    if (wstr == NULL)
        return NULL;

    /* The C conversion functions stop at the first null character, so a
       length mismatch means the string has an embedded null. */
    wlen2 = wcslen(wstr);
    if (wlen2 != wlen) {
        PyMem_Free(wstr);
        PyErr_SetString(PyExc_TypeError, "embedded null character");
        return NULL;
    }

    if (surrogateescape) {
        /* "surrogateescape" error handler */
        char *str;

        str = _Py_wchar2char(wstr, &error_pos);
        if (str == NULL) {
            /* error_pos == (size_t)-1 is treated as a memory error here;
               any other value goes to the encode error path (wstr is
               released there). */
            if (error_pos == (size_t)-1) {
                PyErr_NoMemory();
                PyMem_Free(wstr);
                return NULL;
            }
            else {
                goto encode_error;
            }
        }
        PyMem_Free(wstr);

        bytes = PyBytes_FromString(str);
        PyMem_Free(str);
    }
    else {
        /* strict mode */
        size_t len, len2;

        /* First call only computes the required output length. */
        len = wcstombs(NULL, wstr, 0);
        if (len == (size_t)-1) {
            error_pos = (size_t)-1;
            goto encode_error;
        }

        bytes = PyBytes_FromStringAndSize(NULL, len);
        if (bytes == NULL) {
            PyMem_Free(wstr);
            return NULL;
        }

        /* Second call converts straight into the bytes object. */
        len2 = wcstombs(PyBytes_AS_STRING(bytes), wstr, len+1);
        if (len2 == (size_t)-1 || len2 > len) {
            error_pos = (size_t)-1;
            goto encode_error;
        }
        PyMem_Free(wstr);
    }
    return bytes;

encode_error:
    /* Reached with wstr still allocated and bytes possibly set. */
    errmsg = strerror(errno);
    assert(errmsg != NULL);

    /* (size_t)-1 means the failing position is unknown: search for it. */
    if (error_pos == (size_t)-1)
        error_pos = wcstombs_errorpos(wstr);

    PyMem_Free(wstr);
    Py_XDECREF(bytes);

    /* Build the exception reason from the errno message; wstr is reused as
       scratch storage for the decoded message. */
    if (errmsg != NULL) {
        size_t errlen;
        wstr = _Py_char2wchar(errmsg, &errlen);
        if (wstr != NULL) {
            reason = PyUnicode_FromWideChar(wstr, errlen);
            PyMem_RawFree(wstr);
        } else
            errmsg = NULL;
    }
    /* Fall back to a fixed message when the errno message is unusable. */
    if (errmsg == NULL)
        reason = PyUnicode_FromString(
            "wcstombs() encountered an unencodable "
            "wide character");
    if (reason == NULL)
        return NULL;

    /* UnicodeEncodeError("locale", unicode, start, end, reason) covering a
       single character at error_pos. */
    exc = PyObject_CallFunction(PyExc_UnicodeEncodeError, "sOnnO",
                                "locale", unicode,
                                (Py_ssize_t)error_pos,
                                (Py_ssize_t)(error_pos+1),
                                reason);
    Py_DECREF(reason);
    if (exc != NULL) {
        /* NOTE(review): presumably PyCodec_StrictErrors() raises exc --
           confirm against the codecs C API. */
        PyCodec_StrictErrors(exc);
        Py_XDECREF(exc);
    }
    return NULL;
}
3331
3332PyObject *
3333PyUnicode_EncodeFSDefault(PyObject *unicode)
3334{
3335#ifdef HAVE_MBCS
3336    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
3337#elif defined(__APPLE__)
3338    return _PyUnicode_AsUTF8String(unicode, "surrogateescape");
3339#else
3340    PyInterpreterState *interp = PyThreadState_GET()->interp;
3341    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3342       cannot use it to encode and decode filenames before it is loaded. Load
3343       the Python codec requires to encode at least its own filename. Use the C
3344       version of the locale codec until the codec registry is initialized and
3345       the Python codec is loaded.
3346
3347       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3348       cannot only rely on it: check also interp->fscodec_initialized for
3349       subinterpreters. */
3350    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3351        return PyUnicode_AsEncodedString(unicode,
3352                                         Py_FileSystemDefaultEncoding,
3353                                         "surrogateescape");
3354    }
3355    else {
3356        return PyUnicode_EncodeLocale(unicode, "surrogateescape");
3357    }
3358#endif
3359}
3360
3361PyObject *
3362PyUnicode_AsEncodedString(PyObject *unicode,
3363                          const char *encoding,
3364                          const char *errors)
3365{
3366    PyObject *v;
3367    char lower[11];  /* Enough for any encoding shortcut */
3368
3369    if (!PyUnicode_Check(unicode)) {
3370        PyErr_BadArgument();
3371        return NULL;
3372    }
3373
3374    /* Shortcuts for common default encodings */
3375    if (_Py_normalize_encoding(encoding, lower, sizeof(lower))) {
3376        if ((strcmp(lower, "utf-8") == 0) ||
3377            (strcmp(lower, "utf8") == 0))
3378        {
3379            if (errors == NULL || strcmp(errors, "strict") == 0)
3380                return _PyUnicode_AsUTF8String(unicode, NULL);
3381            else
3382                return _PyUnicode_AsUTF8String(unicode, errors);
3383        }
3384        else if ((strcmp(lower, "latin-1") == 0) ||
3385                 (strcmp(lower, "latin1") == 0) ||
3386                 (strcmp(lower, "iso-8859-1") == 0) ||
3387                 (strcmp(lower, "iso8859-1") == 0))
3388            return _PyUnicode_AsLatin1String(unicode, errors);
3389#ifdef HAVE_MBCS
3390        else if (strcmp(lower, "mbcs") == 0)
3391            return PyUnicode_EncodeCodePage(CP_ACP, unicode, errors);
3392#endif
3393        else if (strcmp(lower, "ascii") == 0)
3394            return _PyUnicode_AsASCIIString(unicode, errors);
3395    }
3396
3397    /* Encode via the codec registry */
3398    v = _PyCodec_EncodeText(unicode, encoding, errors);
3399    if (v == NULL)
3400        return NULL;
3401
3402    /* The normal path */
3403    if (PyBytes_Check(v))
3404        return v;
3405
3406    /* If the codec returns a buffer, raise a warning and convert to bytes */
3407    if (PyByteArray_Check(v)) {
3408        int error;
3409        PyObject *b;
3410
3411        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
3412            "encoder %s returned bytearray instead of bytes; "
3413            "use codecs.encode() to encode to arbitrary types",
3414            encoding);
3415        if (error) {
3416            Py_DECREF(v);
3417            return NULL;
3418        }
3419
3420        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
3421        Py_DECREF(v);
3422        return b;
3423    }
3424
3425    PyErr_Format(PyExc_TypeError,
3426                 "'%.400s' encoder returned '%.400s' instead of 'bytes'; "
3427                 "use codecs.encode() to encode to arbitrary types",
3428                 encoding,
3429                 Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3430    Py_DECREF(v);
3431    return NULL;
3432}
3433
3434PyObject *
3435PyUnicode_AsEncodedUnicode(PyObject *unicode,
3436                           const char *encoding,
3437                           const char *errors)
3438{
3439    PyObject *v;
3440
3441    if (!PyUnicode_Check(unicode)) {
3442        PyErr_BadArgument();
3443        goto onError;
3444    }
3445
3446    if (encoding == NULL)
3447        encoding = PyUnicode_GetDefaultEncoding();
3448
3449    /* Encode via the codec registry */
3450    v = PyCodec_Encode(unicode, encoding, errors);
3451    if (v == NULL)
3452        goto onError;
3453    if (!PyUnicode_Check(v)) {
3454        PyErr_Format(PyExc_TypeError,
3455                     "'%.400s' encoder returned '%.400s' instead of 'str'; "
3456                     "use codecs.encode() to encode to arbitrary types",
3457                     encoding,
3458                     Py_TYPE(v)->tp_name, Py_TYPE(v)->tp_name);
3459        Py_DECREF(v);
3460        goto onError;
3461    }
3462    return v;
3463
3464  onError:
3465    return NULL;
3466}
3467
/* Locate the first byte sequence in the len bytes at str that mbrtowc()
   cannot decode, and return its byte offset.  Returns 0 when nothing is
   found, and always when mbrtowc() is not available (HAVE_MBRTOWC
   undefined). */
static size_t
mbstowcs_errorpos(const char *str, size_t len)
{
#ifdef HAVE_MBRTOWC
    const char *cursor = str;
    size_t remaining = len;
    mbstate_t state;
    wchar_t decoded;

    memset(&state, 0, sizeof state);
    while (remaining != 0) {
        size_t consumed = mbrtowc(&decoded, cursor, remaining, &state);

        if (consumed == 0) {
            /* Reached end of string */
            break;
        }
        if (consumed == (size_t)-1 || consumed == (size_t)-2) {
            /* Conversion error or incomplete character */
            return cursor - str;
        }
        cursor += consumed;
        remaining -= consumed;
    }
#endif
    /* failed to find the undecodable byte sequence */
    return 0;
}
3498
3499PyObject*
3500PyUnicode_DecodeLocaleAndSize(const char *str, Py_ssize_t len,
3501                              const char *errors)
3502{
3503    wchar_t smallbuf[256];
3504    size_t smallbuf_len = Py_ARRAY_LENGTH(smallbuf);
3505    wchar_t *wstr;
3506    size_t wlen, wlen2;
3507    PyObject *unicode;
3508    int surrogateescape;
3509    size_t error_pos;
3510    char *errmsg;
3511    PyObject *reason, *exc;
3512
3513    if (locale_error_handler(errors, &surrogateescape) < 0)
3514        return NULL;
3515
3516    if (str[len] != '\0' || len != strlen(str)) {
3517        PyErr_SetString(PyExc_TypeError, "embedded null character");
3518        return NULL;
3519    }
3520
3521    if (surrogateescape) {
3522        /* "surrogateescape" error handler */
3523        wstr = _Py_char2wchar(str, &wlen);
3524        if (wstr == NULL) {
3525            if (wlen == (size_t)-1)
3526                PyErr_NoMemory();
3527            else
3528                PyErr_SetFromErrno(PyExc_OSError);
3529            return NULL;
3530        }
3531
3532        unicode = PyUnicode_FromWideChar(wstr, wlen);
3533        PyMem_RawFree(wstr);
3534    }
3535    else {
3536        /* strict mode */
3537#ifndef HAVE_BROKEN_MBSTOWCS
3538        wlen = mbstowcs(NULL, str, 0);
3539#else
3540        wlen = len;
3541#endif
3542        if (wlen == (size_t)-1)
3543            goto decode_error;
3544        if (wlen+1 <= smallbuf_len) {
3545            wstr = smallbuf;
3546        }
3547        else {
3548            wstr = PyMem_New(wchar_t, wlen+1);
3549            if (!wstr)
3550                return PyErr_NoMemory();
3551        }
3552
3553        wlen2 = mbstowcs(wstr, str, wlen+1);
3554        if (wlen2 == (size_t)-1) {
3555            if (wstr != smallbuf)
3556                PyMem_Free(wstr);
3557            goto decode_error;
3558        }
3559#ifdef HAVE_BROKEN_MBSTOWCS
3560        assert(wlen2 == wlen);
3561#endif
3562        unicode = PyUnicode_FromWideChar(wstr, wlen2);
3563        if (wstr != smallbuf)
3564            PyMem_Free(wstr);
3565    }
3566    return unicode;
3567
3568decode_error:
3569    reason = NULL;
3570    errmsg = strerror(errno);
3571    assert(errmsg != NULL);
3572
3573    error_pos = mbstowcs_errorpos(str, len);
3574    if (errmsg != NULL) {
3575        size_t errlen;
3576        wstr = _Py_char2wchar(errmsg, &errlen);
3577        if (wstr != NULL) {
3578            reason = PyUnicode_FromWideChar(wstr, errlen);
3579            PyMem_RawFree(wstr);
3580        }
3581    }
3582    if (reason == NULL)
3583        reason = PyUnicode_FromString(
3584            "mbstowcs() encountered an invalid multibyte sequence");
3585    if (reason == NULL)
3586        return NULL;
3587
3588    exc = PyObject_CallFunction(PyExc_UnicodeDecodeError, "sy#nnO",
3589                                "locale", str, len,
3590                                (Py_ssize_t)error_pos,
3591                                (Py_ssize_t)(error_pos+1),
3592                                reason);
3593    Py_DECREF(reason);
3594    if (exc != NULL) {
3595        PyCodec_StrictErrors(exc);
3596        Py_XDECREF(exc);
3597    }
3598    return NULL;
3599}
3600
3601PyObject*
3602PyUnicode_DecodeLocale(const char *str, const char *errors)
3603{
3604    Py_ssize_t size = (Py_ssize_t)strlen(str);
3605    return PyUnicode_DecodeLocaleAndSize(str, size, errors);
3606}
3607
3608
3609PyObject*
3610PyUnicode_DecodeFSDefault(const char *s) {
3611    Py_ssize_t size = (Py_ssize_t)strlen(s);
3612    return PyUnicode_DecodeFSDefaultAndSize(s, size);
3613}
3614
3615PyObject*
3616PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
3617{
3618#ifdef HAVE_MBCS
3619    return PyUnicode_DecodeMBCS(s, size, NULL);
3620#elif defined(__APPLE__)
3621    return PyUnicode_DecodeUTF8Stateful(s, size, "surrogateescape", NULL);
3622#else
3623    PyInterpreterState *interp = PyThreadState_GET()->interp;
3624    /* Bootstrap check: if the filesystem codec is implemented in Python, we
3625       cannot use it to encode and decode filenames before it is loaded. Load
3626       the Python codec requires to encode at least its own filename. Use the C
3627       version of the locale codec until the codec registry is initialized and
3628       the Python codec is loaded.
3629
3630       Py_FileSystemDefaultEncoding is shared between all interpreters, we
3631       cannot only rely on it: check also interp->fscodec_initialized for
3632       subinterpreters. */
3633    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
3634        return PyUnicode_Decode(s, size,
3635                                Py_FileSystemDefaultEncoding,
3636                                "surrogateescape");
3637    }
3638    else {
3639        return PyUnicode_DecodeLocaleAndSize(s, size, "surrogateescape");
3640    }
3641#endif
3642}
3643
3644
3645int
3646_PyUnicode_HasNULChars(PyObject* str)
3647{
3648    Py_ssize_t pos;
3649
3650    if (PyUnicode_READY(str) == -1)
3651        return -1;
3652    pos = findchar(PyUnicode_DATA(str), PyUnicode_KIND(str),
3653                   PyUnicode_GET_LENGTH(str), '\0', 1);
3654    if (pos == -1)
3655        return 0;
3656    else
3657        return 1;
3658}
3659
3660int
3661PyUnicode_FSConverter(PyObject* arg, void* addr)
3662{
3663    PyObject *output = NULL;
3664    Py_ssize_t size;
3665    void *data;
3666    if (arg == NULL) {
3667        Py_DECREF(*(PyObject**)addr);
3668        return 1;
3669    }
3670    if (PyBytes_Check(arg)) {
3671        output = arg;
3672        Py_INCREF(output);
3673    }
3674    else {
3675        arg = PyUnicode_FromObject(arg);
3676        if (!arg)
3677            return 0;
3678        output = PyUnicode_EncodeFSDefault(arg);
3679        Py_DECREF(arg);
3680        if (!output)
3681            return 0;
3682        if (!PyBytes_Check(output)) {
3683            Py_DECREF(output);
3684            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
3685            return 0;
3686        }
3687    }
3688    size = PyBytes_GET_SIZE(output);
3689    data = PyBytes_AS_STRING(output);
3690    if (size != strlen(data)) {
3691        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3692        Py_DECREF(output);
3693        return 0;
3694    }
3695    *(PyObject**)addr = output;
3696    return Py_CLEANUP_SUPPORTED;
3697}
3698
3699
3700int
3701PyUnicode_FSDecoder(PyObject* arg, void* addr)
3702{
3703    PyObject *output = NULL;
3704    if (arg == NULL) {
3705        Py_DECREF(*(PyObject**)addr);
3706        return 1;
3707    }
3708    if (PyUnicode_Check(arg)) {
3709        if (PyUnicode_READY(arg) == -1)
3710            return 0;
3711        output = arg;
3712        Py_INCREF(output);
3713    }
3714    else {
3715        arg = PyBytes_FromObject(arg);
3716        if (!arg)
3717            return 0;
3718        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
3719                                                  PyBytes_GET_SIZE(arg));
3720        Py_DECREF(arg);
3721        if (!output)
3722            return 0;
3723        if (!PyUnicode_Check(output)) {
3724            Py_DECREF(output);
3725            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
3726            return 0;
3727        }
3728    }
3729    if (PyUnicode_READY(output) == -1) {
3730        Py_DECREF(output);
3731        return 0;
3732    }
3733    if (findchar(PyUnicode_DATA(output), PyUnicode_KIND(output),
3734                 PyUnicode_GET_LENGTH(output), 0, 1) >= 0) {
3735        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
3736        Py_DECREF(output);
3737        return 0;
3738    }
3739    *(PyObject**)addr = output;
3740    return Py_CLEANUP_SUPPORTED;
3741}
3742
3743
3744char*
3745PyUnicode_AsUTF8AndSize(PyObject *unicode, Py_ssize_t *psize)
3746{
3747    PyObject *bytes;
3748
3749    if (!PyUnicode_Check(unicode)) {
3750        PyErr_BadArgument();
3751        return NULL;
3752    }
3753    if (PyUnicode_READY(unicode) == -1)
3754        return NULL;
3755
3756    if (PyUnicode_UTF8(unicode) == NULL) {
3757        assert(!PyUnicode_IS_COMPACT_ASCII(unicode));
3758        bytes = _PyUnicode_AsUTF8String(unicode, "strict");
3759        if (bytes == NULL)
3760            return NULL;
3761        _PyUnicode_UTF8(unicode) = PyObject_MALLOC(PyBytes_GET_SIZE(bytes) + 1);
3762        if (_PyUnicode_UTF8(unicode) == NULL) {
3763            PyErr_NoMemory();
3764            Py_DECREF(bytes);
3765            return NULL;
3766        }
3767        _PyUnicode_UTF8_LENGTH(unicode) = PyBytes_GET_SIZE(bytes);
3768        Py_MEMCPY(_PyUnicode_UTF8(unicode),
3769                  PyBytes_AS_STRING(bytes),
3770                  _PyUnicode_UTF8_LENGTH(unicode) + 1);
3771        Py_DECREF(bytes);
3772    }
3773
3774    if (psize)
3775        *psize = PyUnicode_UTF8_LENGTH(unicode);
3776    return PyUnicode_UTF8(unicode);
3777}
3778
3779char*
3780PyUnicode_AsUTF8(PyObject *unicode)
3781{
3782    return PyUnicode_AsUTF8AndSize(unicode, NULL);
3783}
3784
/* Return the wchar_t representation stored in the object's wstr slot,
   building and storing it first when the slot is still NULL.

   When size is not NULL it receives PyUnicode_WSTR_LENGTH(unicode).
   Returns NULL with an exception set when unicode is not a str object or
   when memory cannot be allocated. */
Py_UNICODE *
PyUnicode_AsUnicodeAndSize(PyObject *unicode, Py_ssize_t *size)
{
    const unsigned char *one_byte;
#if SIZEOF_WCHAR_T == 4
    const Py_UCS2 *two_bytes;
#else
    const Py_UCS4 *four_bytes;
    const Py_UCS4 *ucs4_end;
    Py_ssize_t num_surrogates;
#endif
    wchar_t *w;
    wchar_t *wchar_end;

    if (!PyUnicode_Check(unicode)) {
        PyErr_BadArgument();
        return NULL;
    }
    if (_PyUnicode_WSTR(unicode) == NULL) {
        /* Non-ASCII compact unicode object */
        assert(_PyUnicode_KIND(unicode) != 0);
        assert(PyUnicode_IS_READY(unicode));

        if (PyUnicode_KIND(unicode) == PyUnicode_4BYTE_KIND) {
#if SIZEOF_WCHAR_T == 2
            /* 16-bit wchar_t: every code point above 0xFFFF needs a
               surrogate pair, so count them to size the buffer. */
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            ucs4_end = four_bytes + _PyUnicode_LENGTH(unicode);
            num_surrogates = 0;

            for (; four_bytes < ucs4_end; ++four_bytes) {
                if (*four_bytes > 0xFFFF)
                    ++num_surrogates;
            }

            /* Length + extra slots for the pairs + terminating null. */
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(
                    sizeof(wchar_t) * (_PyUnicode_LENGTH(unicode) + 1 + num_surrogates));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode) + num_surrogates;

            /* Second pass over the data: write the converted characters. */
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_WSTR_LENGTH(unicode);
            four_bytes = PyUnicode_4BYTE_DATA(unicode);
            for (; four_bytes < ucs4_end; ++four_bytes, ++w) {
                if (*four_bytes > 0xFFFF) {
                    assert(*four_bytes <= MAX_UNICODE);
                    /* encode surrogate pair in this case */
                    *w++ = Py_UNICODE_HIGH_SURROGATE(*four_bytes);
                    *w   = Py_UNICODE_LOW_SURROGATE(*four_bytes);
                }
                else
                    *w = *four_bytes;

                /* Sanity check on the size computed above. */
                if (w > wchar_end) {
                    assert(0 && "Miscalculated string end");
                }
            }
            *w = 0;
#else
            /* sizeof(wchar_t) == 4 */
            Py_FatalError("Impossible unicode object state, wstr and str "
                          "should share memory already.");
            return NULL;
#endif
        }
        else {
            /* 1-byte or 2-byte data: one wchar_t per character.  Refuse
               lengths whose buffer size in bytes would exceed
               PY_SSIZE_T_MAX. */
            if ((size_t)_PyUnicode_LENGTH(unicode) >
                    PY_SSIZE_T_MAX / sizeof(wchar_t) - 1) {
                PyErr_NoMemory();
                return NULL;
            }
            _PyUnicode_WSTR(unicode) = (wchar_t *) PyObject_MALLOC(sizeof(wchar_t) *
                                                  (_PyUnicode_LENGTH(unicode) + 1));
            if (!_PyUnicode_WSTR(unicode)) {
                PyErr_NoMemory();
                return NULL;
            }
            /* The wstr length is only stored for objects that are not
               compact ASCII. */
            if (!PyUnicode_IS_COMPACT_ASCII(unicode))
                _PyUnicode_WSTR_LENGTH(unicode) = _PyUnicode_LENGTH(unicode);
            w = _PyUnicode_WSTR(unicode);
            wchar_end = w + _PyUnicode_LENGTH(unicode);

            if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND) {
                /* Widen each byte to a wchar_t. */
                one_byte = PyUnicode_1BYTE_DATA(unicode);
                for (; w < wchar_end; ++one_byte, ++w)
                    *w = *one_byte;
                /* null-terminate the wstr */
                *w = 0;
            }
            else if (PyUnicode_KIND(unicode) == PyUnicode_2BYTE_KIND) {
#if SIZEOF_WCHAR_T == 4
                /* Widen each 16-bit unit to a wchar_t. */
                two_bytes = PyUnicode_2BYTE_DATA(unicode);
                for (; w < wchar_end; ++two_bytes, ++w)
                    *w = *two_bytes;
                /* null-terminate the wstr */
                *w = 0;
#else
                /* sizeof(wchar_t) == 2 */
                PyObject_FREE(_PyUnicode_WSTR(unicode));
                _PyUnicode_WSTR(unicode) = NULL;
                Py_FatalError("Impossible unicode object state, wstr "
                              "and str should share memory already.");
                return NULL;
#endif
            }
            else {
                assert(0 && "This should never happen.");
            }
        }
    }
    if (size != NULL)
        *size = PyUnicode_WSTR_LENGTH(unicode);
    return _PyUnicode_WSTR(unicode);
}
3901
3902Py_UNICODE *
3903PyUnicode_AsUnicode(PyObject *unicode)
3904{
3905    return PyUnicode_AsUnicodeAndSize(unicode, NULL);
3906}
3907
3908
3909Py_ssize_t
3910PyUnicode_GetSize(PyObject *unicode)
3911{
3912    if (!PyUnicode_Check(unicode)) {
3913        PyErr_BadArgument();
3914        goto onError;
3915    }
3916    return PyUnicode_GET_SIZE(unicode);
3917
3918  onError:
3919    return -1;
3920}
3921
3922Py_ssize_t
3923PyUnicode_GetLength(PyObject *unicode)
3924{
3925    if (!PyUnicode_Check(unicode)) {
3926        PyErr_BadArgument();
3927        return -1;
3928    }
3929    if (PyUnicode_READY(unicode) == -1)
3930        return -1;
3931    return PyUnicode_GET_LENGTH(unicode);
3932}
3933
3934Py_UCS4
3935PyUnicode_ReadChar(PyObject *unicode, Py_ssize_t index)
3936{
3937    void *data;
3938    int kind;
3939
3940    if (!PyUnicode_Check(unicode) || PyUnicode_READY(unicode) == -1) {
3941        PyErr_BadArgument();
3942        return (Py_UCS4)-1;
3943    }
3944    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3945        PyErr_SetString(PyExc_IndexError, "string index out of range");
3946        return (Py_UCS4)-1;
3947    }
3948    data = PyUnicode_DATA(unicode);
3949    kind = PyUnicode_KIND(unicode);
3950    return PyUnicode_READ(kind, data, index);
3951}
3952
3953int
3954PyUnicode_WriteChar(PyObject *unicode, Py_ssize_t index, Py_UCS4 ch)
3955{
3956    if (!PyUnicode_Check(unicode) || !PyUnicode_IS_COMPACT(unicode)) {
3957        PyErr_BadArgument();
3958        return -1;
3959    }
3960    assert(PyUnicode_IS_READY(unicode));
3961    if (index < 0 || index >= PyUnicode_GET_LENGTH(unicode)) {
3962        PyErr_SetString(PyExc_IndexError, "string index out of range");
3963        return -1;
3964    }
3965    if (unicode_check_modifiable(unicode))
3966        return -1;
3967    if (ch > PyUnicode_MAX_CHAR_VALUE(unicode)) {
3968        PyErr_SetString(PyExc_ValueError, "character out of range");
3969        return -1;
3970    }
3971    PyUnicode_WRITE(PyUnicode_KIND(unicode), PyUnicode_DATA(unicode),
3972                    index, ch);
3973    return 0;
3974}
3975
/* Return the name of the default encoding, which is always "utf-8".
   The returned string has static storage and must not be modified. */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
3981
3982/* create or adjust a UnicodeDecodeError */
3983static void
3984make_decode_exception(PyObject **exceptionObject,
3985                      const char *encoding,
3986                      const char *input, Py_ssize_t length,
3987                      Py_ssize_t startpos, Py_ssize_t endpos,
3988                      const char *reason)
3989{
3990    if (*exceptionObject == NULL) {
3991        *exceptionObject = PyUnicodeDecodeError_Create(
3992            encoding, input, length, startpos, endpos, reason);
3993    }
3994    else {
3995        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
3996            goto onError;
3997        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
3998            goto onError;
3999        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
4000            goto onError;
4001    }
4002    return;
4003
4004onError:
4005    Py_CLEAR(*exceptionObject);
4006}
4007
4008#ifdef HAVE_MBCS
4009/* error handling callback helper:
4010   build arguments, call the callback and check the arguments,
4011   if no exception occurred, copy the replacement to the output
4012   and adjust various state variables.
4013   return 0 on success, -1 on error
4014*/
4015
4016static int
4017unicode_decode_call_errorhandler_wchar(
4018    const char *errors, PyObject **errorHandler,
4019    const char *encoding, const char *reason,
4020    const char **input, const char **inend, Py_ssize_t *startinpos,
4021    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4022    PyObject **output, Py_ssize_t *outpos)
4023{
4024    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4025
4026    PyObject *restuple = NULL;
4027    PyObject *repunicode = NULL;
4028    Py_ssize_t outsize;
4029    Py_ssize_t insize;
4030    Py_ssize_t requiredsize;
4031    Py_ssize_t newpos;
4032    PyObject *inputobj = NULL;
4033    wchar_t *repwstr;
4034    Py_ssize_t repwlen;
4035
4036    assert (_PyUnicode_KIND(*output) == PyUnicode_WCHAR_KIND);
4037    outsize = _PyUnicode_WSTR_LENGTH(*output);
4038
4039    if (*errorHandler == NULL) {
4040        *errorHandler = PyCodec_LookupError(errors);
4041        if (*errorHandler == NULL)
4042            goto onError;
4043    }
4044
4045    make_decode_exception(exceptionObject,
4046        encoding,
4047        *input, *inend - *input,
4048        *startinpos, *endinpos,
4049        reason);
4050    if (*exceptionObject == NULL)
4051        goto onError;
4052
4053    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4054    if (restuple == NULL)
4055        goto onError;
4056    if (!PyTuple_Check(restuple)) {
4057        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4058        goto onError;
4059    }
4060    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4061        goto onError;
4062
4063    /* Copy back the bytes variables, which might have been modified by the
4064       callback */
4065    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4066    if (!inputobj)
4067        goto onError;
4068    if (!PyBytes_Check(inputobj)) {
4069        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4070    }
4071    *input = PyBytes_AS_STRING(inputobj);
4072    insize = PyBytes_GET_SIZE(inputobj);
4073    *inend = *input + insize;
4074    /* we can DECREF safely, as the exception has another reference,
4075       so the object won't go away. */
4076    Py_DECREF(inputobj);
4077
4078    if (newpos<0)
4079        newpos = insize+newpos;
4080    if (newpos<0 || newpos>insize) {
4081        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4082        goto onError;
4083    }
4084
4085    repwstr = PyUnicode_AsUnicodeAndSize(repunicode, &repwlen);
4086    if (repwstr == NULL)
4087        goto onError;
4088    /* need more space? (at least enough for what we
4089       have+the replacement+the rest of the string (starting
4090       at the new input position), so we won't have to check space
4091       when there are no errors in the rest of the string) */
4092    requiredsize = *outpos;
4093    if (requiredsize > PY_SSIZE_T_MAX - repwlen)
4094        goto overflow;
4095    requiredsize += repwlen;
4096    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
4097        goto overflow;
4098    requiredsize += insize - newpos;
4099    if (requiredsize > outsize) {
4100        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
4101            requiredsize = 2*outsize;
4102        if (unicode_resize(output, requiredsize) < 0)
4103            goto onError;
4104    }
4105    wcsncpy(_PyUnicode_WSTR(*output) + *outpos, repwstr, repwlen);
4106    *outpos += repwlen;
4107    *endinpos = newpos;
4108    *inptr = *input + newpos;
4109
4110    /* we made it! */
4111    Py_XDECREF(restuple);
4112    return 0;
4113
4114  overflow:
4115    PyErr_SetString(PyExc_OverflowError,
4116                    "decoded result is too long for a Python string");
4117
4118  onError:
4119    Py_XDECREF(restuple);
4120    return -1;
4121}
4122#endif   /* HAVE_MBCS */
4123
4124static int
4125unicode_decode_call_errorhandler_writer(
4126    const char *errors, PyObject **errorHandler,
4127    const char *encoding, const char *reason,
4128    const char **input, const char **inend, Py_ssize_t *startinpos,
4129    Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
4130    _PyUnicodeWriter *writer /* PyObject **output, Py_ssize_t *outpos */)
4131{
4132    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
4133
4134    PyObject *restuple = NULL;
4135    PyObject *repunicode = NULL;
4136    Py_ssize_t insize;
4137    Py_ssize_t newpos;
4138    Py_ssize_t replen;
4139    PyObject *inputobj = NULL;
4140
4141    if (*errorHandler == NULL) {
4142        *errorHandler = PyCodec_LookupError(errors);
4143        if (*errorHandler == NULL)
4144            goto onError;
4145    }
4146
4147    make_decode_exception(exceptionObject,
4148        encoding,
4149        *input, *inend - *input,
4150        *startinpos, *endinpos,
4151        reason);
4152    if (*exceptionObject == NULL)
4153        goto onError;
4154
4155    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
4156    if (restuple == NULL)
4157        goto onError;
4158    if (!PyTuple_Check(restuple)) {
4159        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4160        goto onError;
4161    }
4162    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
4163        goto onError;
4164
4165    /* Copy back the bytes variables, which might have been modified by the
4166       callback */
4167    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
4168    if (!inputobj)
4169        goto onError;
4170    if (!PyBytes_Check(inputobj)) {
4171        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
4172    }
4173    *input = PyBytes_AS_STRING(inputobj);
4174    insize = PyBytes_GET_SIZE(inputobj);
4175    *inend = *input + insize;
4176    /* we can DECREF safely, as the exception has another reference,
4177       so the object won't go away. */
4178    Py_DECREF(inputobj);
4179
4180    if (newpos<0)
4181        newpos = insize+newpos;
4182    if (newpos<0 || newpos>insize) {
4183        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
4184        goto onError;
4185    }
4186
4187    if (PyUnicode_READY(repunicode) < 0)
4188        goto onError;
4189    replen = PyUnicode_GET_LENGTH(repunicode);
4190    if (replen > 1) {
4191        writer->min_length += replen - 1;
4192        writer->overallocate = 1;
4193        if (_PyUnicodeWriter_Prepare(writer, writer->min_length,
4194                            PyUnicode_MAX_CHAR_VALUE(repunicode)) == -1)
4195            goto onError;
4196    }
4197    if (_PyUnicodeWriter_WriteStr(writer, repunicode) == -1)
4198        goto onError;
4199
4200    *endinpos = newpos;
4201    *inptr = *input + newpos;
4202
4203    /* we made it! */
4204    Py_XDECREF(restuple);
4205    return 0;
4206
4207  onError:
4208    Py_XDECREF(restuple);
4209    return -1;
4210}
4211
4212/* --- UTF-7 Codec -------------------------------------------------------- */
4213
4214/* See RFC2152 for details.  We encode conservatively and decode liberally. */
4215
/* Three small macros describing the base-64 alphabet.  Each of them may
   evaluate its argument more than once, so pass side-effect free
   expressions only. */

/* True if c is one of the 64 base-64 characters (A-Z, a-z, 0-9, +, /). */

#define IS_BASE64(c)                        \
    (('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('0' <= (c) && (c) <= '9') ||          \
     '+' == (c) || '/' == (c))

/* Base-64 value (0..63) of c.  Only meaningful when IS_BASE64(c) holds;
   any other character yields 63. */

#define FROM_BASE64(c)                                          \
    ((c) == '+' ? 62 :                                          \
     ('0' <= (c) && (c) <= '9') ? 52 + ((c) - '0') :            \
     ('a' <= (c) && (c) <= 'z') ? 26 + ((c) - 'a') :            \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[0x3f & (n)])

/* DECODE_DIRECT: true if a byte found in UTF-7 input outside a base-64
 * section decodes to itself.  Decoding is permissive: every ASCII byte
 * qualifies except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
4247/* The UTF-7 encoder treats ASCII characters differently according to
4248 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
4249 * the above).  See RFC2152.  This array identifies these different
4250 * sets:
4251 * 0 : "Set D"
4252 *     alphanumeric and '(),-./:?
4253 * 1 : "Set O"
4254 *     !"#$%&*;<=>@[]^_`{|}
4255 * 2 : "whitespace"
4256 *     ht nl cr sp
4257 * 3 : special (must be base64 encoded)
4258 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
4259 */
4260
4261static
4262char utf7_category[128] = {
4263/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
4264    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
4265/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
4266    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
4267/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
4268    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
4269/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
4270    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
4271/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
4272    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4273/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
4274    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
4275/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
4276    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
4277/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
4278    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
4279};
4280
4281/* ENCODE_DIRECT: this character should be encoded as itself.  The
4282 * answer depends on whether we are encoding set O as itself, and also
4283 * on whether we are encoding whitespace as itself.  RFC2152 makes it
4284 * clear that the answers to these questions vary between
4285 * applications, so this code needs to be flexible.  */
4286
4287#define ENCODE_DIRECT(c, directO, directWS)             \
4288    ((c) < 128 && (c) > 0 &&                            \
4289     ((utf7_category[(c)] == 0) ||                      \
4290      (directWS && (utf7_category[(c)] == 2)) ||        \
4291      (directO && (utf7_category[(c)] == 1))))
4292
4293PyObject *
4294PyUnicode_DecodeUTF7(const char *s,
4295                     Py_ssize_t size,
4296                     const char *errors)
4297{
4298    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
4299}
4300
/* The decoder.  The only state we preserve is our read position,
 * i.e. how many characters we have consumed.  So if we end in the
 * middle of a shift sequence we have to back off the read position
 * and the output to the beginning of the sequence, otherwise we lose
 * all the shift state (seen bits, number of bits seen, high
 * surrogate).
 *
 * s, size:  input bytes and their count.
 * errors:   error handler name, forwarded to
 *           unicode_decode_call_errorhandler_writer().
 * consumed: if non-NULL, receives the number of input bytes consumed.
 *           When the input ends inside a shift sequence, this is the
 *           position of the '+' that opened it and the output produced
 *           since then is dropped; if NULL, such an ending is checked
 *           for consistency and may be reported as an error.
 *
 * Returns a new string object, or NULL on error. */

PyObject *
PyUnicode_DecodeUTF7Stateful(const char *s,
                             Py_ssize_t size,
                             const char *errors,
                             Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    const char *e;
    _PyUnicodeWriter writer;
    const char *errmsg = "";
    int inShift = 0;
    /* writer position at the start of the current shift sequence */
    Py_ssize_t shiftOutStart;
    /* number of not yet emitted bits held in base64buffer */
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    /* pending high surrogate waiting for its low half, or 0 */
    Py_UCS4 surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    if (size == 0) {
        if (consumed)
            *consumed = 0;
        _Py_RETURN_UNICODE_EMPTY();
    }

    /* Start off assuming it's all ASCII. Widen later as necessary. */
    _PyUnicodeWriter_Init(&writer);
    writer.min_length = size;

    shiftOutStart = 0;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch;
      restart:
        /* also entered by goto from the end-of-input error handling
           below, when the error handler moved s back before e */
        ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UCS4 outCh = (Py_UCS4)(base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (Py_UNICODE_IS_LOW_SURROGATE(outCh)) {
                            Py_UCS4 ch2 = Py_UNICODE_JOIN_SURROGATES(surrogate, outCh);
                            if (_PyUnicodeWriter_WriteCharInline(&writer, ch2) < 0)
                                goto onError;
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the lone
                               high surrogate as it is, then handle outCh
                               below */
                            if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                                goto onError;
                            surrogate = 0;
                        }
                    }
                    if (Py_UNICODE_IS_HIGH_SURROGATE(outCh)) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        if (_PyUnicodeWriter_WriteCharInline(&writer, outCh) < 0)
                            goto onError;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                s++;
                if (surrogate) {
                    /* flush a pending lone high surrogate */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, surrogate) < 0)
                        goto onError;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    /* NOTE(review): ch is written without a DECODE_DIRECT
                       check, so a byte >= 128 that ends a shift sequence
                       appears to be emitted as that code point instead of
                       being reported as an error -- confirm whether this
                       is intended. */
                    if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                        goto onError;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts; used for error
               reporting and for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                if (_PyUnicodeWriter_WriteCharInline(&writer, '+') < 0)
                    goto onError;
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = writer.pos;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            s++;
            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
                goto onError;
        }
        else {
            /* byte >= 128 outside a shift sequence */
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
utf7Error:
        /* the handler may replace the input buffer, so starts, e and s
           are passed by address */
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler_writer(
                errors, &errorHandler,
                "utf7", errmsg,
                &starts, &e, &startinpos, &endinpos, &exc, &s,
                &writer))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            endinpos = size;
            if (unicode_decode_call_errorhandler_writer(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    &starts, &e, &startinpos, &endinpos, &exc, &s,
                    &writer))
                goto onError;
            /* NOTE(review): inShift, surrogate and the bit buffer are not
               reset before decoding resumes at restart -- confirm that
               this is intended. */
            if (s < e)
                goto restart;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* report only the input before the unfinished sequence */
            *consumed = startinpos;
            if (writer.pos != shiftOutStart && writer.maxchar > 127) {
                /* Build the result from the first shiftOutStart
                   characters only; presumably done through
                   PyUnicode_FromKindAndData() so that the maximum
                   character is recomputed without the dropped
                   characters -- TODO confirm. */
                PyObject *result = PyUnicode_FromKindAndData(
                        writer.kind, writer.data, shiftOutStart);
                Py_XDECREF(errorHandler);
                Py_XDECREF(exc);
                _PyUnicodeWriter_Dealloc(&writer);
                return result;
            }
            writer.pos = shiftOutStart; /* back off output */
        }
        else {
            *consumed = s-starts;
        }
    }

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return _PyUnicodeWriter_Finish(&writer);

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    _PyUnicodeWriter_Dealloc(&writer);
    return NULL;
}
4497
4498
4499PyObject *
4500_PyUnicode_EncodeUTF7(PyObject *str,
4501                      int base64SetO,
4502                      int base64WhiteSpace,
4503                      const char *errors)
4504{
4505    int kind;
4506    void *data;
4507    Py_ssize_t len;
4508    PyObject *v;
4509    int inShift = 0;
4510    Py_ssize_t i;
4511    unsigned int base64bits = 0;
4512    unsigned long base64buffer = 0;
4513    char * out;
4514    char * start;
4515
4516    if (PyUnicode_READY(str) == -1)
4517        return NULL;
4518    kind = PyUnicode_KIND(str);
4519    data = PyUnicode_DATA(str);
4520    len = PyUnicode_GET_LENGTH(str);
4521
4522    if (len == 0)
4523        return PyBytes_FromStringAndSize(NULL, 0);
4524
4525    /* It might be possible to tighten this worst case */
4526    if (len > PY_SSIZE_T_MAX / 8)
4527        return PyErr_NoMemory();
4528    v = PyBytes_FromStringAndSize(NULL, len * 8);
4529    if (v == NULL)
4530        return NULL;
4531
4532    start = out = PyBytes_AS_STRING(v);
4533    for (i = 0; i < len; ++i) {
4534        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
4535
4536        if (inShift) {
4537            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4538                /* shifting out */
4539                if (base64bits) { /* output remaining bits */
4540                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
4541                    base64buffer = 0;
4542                    base64bits = 0;
4543                }
4544                inShift = 0;
4545                /* Characters not in the BASE64 set implicitly unshift the sequence
4546                   so no '-' is required, except if the character is itself a '-' */
4547                if (IS_BASE64(ch) || ch == '-') {
4548                    *out++ = '-';
4549                }
4550                *out++ = (char) ch;
4551            }
4552            else {
4553                goto encode_char;
4554            }
4555        }
4556        else { /* not in a shift sequence */
4557            if (ch == '+') {
4558                *out++ = '+';
4559                        *out++ = '-';
4560            }
4561            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
4562                *out++ = (char) ch;
4563            }
4564            else {
4565                *out++ = '+';
4566                inShift = 1;
4567                goto encode_char;
4568            }
4569        }
4570        continue;
4571encode_char:
4572        if (ch >= 0x10000) {
4573            assert(ch <= MAX_UNICODE);
4574
4575            /* code first surrogate */
4576            base64bits += 16;
4577            base64buffer = (base64buffer << 16) | Py_UNICODE_HIGH_SURROGATE(ch);
4578            while (base64bits >= 6) {
4579                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4580                base64bits -= 6;
4581            }
4582            /* prepare second surrogate */
4583            ch = Py_UNICODE_LOW_SURROGATE(ch);
4584        }
4585        base64bits += 16;
4586        base64buffer = (base64buffer << 16) | ch;
4587        while (base64bits >= 6) {
4588            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
4589            base64bits -= 6;
4590        }
4591    }
4592    if (base64bits)
4593        *out++= TO_BASE64(base64buffer << (6-base64bits) );
4594    if (inShift)
4595        *out++ = '-';
4596    if (_PyBytes_Resize(&v, out - start) < 0)
4597        return NULL;
4598    return v;
4599}
4600PyObject *
4601PyUnicode_EncodeUTF7(const Py_UNICODE *s,
4602                     Py_ssize_t size,
4603                     int base64SetO,
4604                     int base64WhiteSpace,
4605                     const char *errors)
4606{
4607    PyObject *result;
4608    PyObject *tmp = PyUnicode_FromUnicode(s, size);
4609    if (tmp == NULL)
4610        return NULL;
4611    result = _PyUnicode_EncodeUTF7(tmp, base64SetO,
4612                                   base64WhiteSpace, errors);
4613    Py_DECREF(tmp);
4614    return result;
4615}
4616
4617#undef IS_BASE64
4618#undef FROM_BASE64
4619#undef TO_BASE64
4620#undef DECODE_DIRECT
4621#undef ENCODE_DIRECT
4622
4623/* --- UTF-8 Codec -------------------------------------------------------- */
4624
4625PyObject *
4626PyUnicode_DecodeUTF8(const char *s,
4627                     Py_ssize_t size,
4628                     const char *errors)
4629{
4630    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
4631}
4632
4633#include "stringlib/asciilib.h"
4634#include "stringlib/codecs.h"
4635#include "stringlib/undef.h"
4636
4637#include "stringlib/ucs1lib.h"
4638#include "stringlib/codecs.h"
4639#include "stringlib/undef.h"
4640
4641#include "stringlib/ucs2lib.h"
4642#include "stringlib/codecs.h"
4643#include "stringlib/undef.h"
4644
4645#include "stringlib/ucs4lib.h"
4646#include "stringlib/codecs.h"
4647#include "stringlib/undef.h"
4648
4649/* Mask to quickly check whether a C 'long' contains a
4650   non-ASCII, UTF8-encoded char. */
4651#if (SIZEOF_LONG == 8)
4652# define ASCII_CHAR_MASK 0x8080808080808080UL
4653#elif (SIZEOF_LONG == 4)
4654# define ASCII_CHAR_MASK 0x80808080UL
4655#else
4656# error C 'long' size should be either 4 or 8!
4657#endif
4658
4659static Py_ssize_t
4660ascii_decode(const char *start, const char *end, Py_UCS1 *dest)
4661{
4662    const char *p = start;
4663    const char *aligned_end = (const char *) _Py_ALIGN_DOWN(end, SIZEOF_LONG);
4664
4665    /*
4666     * Issue #17237: m68k is a bit different from most architectures in
4667     * that objects do not use "natural alignment" - for example, int and
4668     * long are only aligned at 2-byte boundaries.  Therefore the assert()
4669     * won't work; also, tests have shown that skipping the "optimised
4670     * version" will even speed up m68k.
4671     */
4672#if !defined(__m68k__)
4673#if SIZEOF_LONG <= SIZEOF_VOID_P
4674    assert(_Py_IS_ALIGNED(dest, SIZEOF_LONG));
4675    if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4676        /* Fast path, see in STRINGLIB(utf8_decode) for
4677           an explanation. */
4678        /* Help allocation */
4679        const char *_p = p;
4680        Py_UCS1 * q = dest;
4681        while (_p < aligned_end) {
4682            unsigned long value = *(const unsigned long *) _p;
4683            if (value & ASCII_CHAR_MASK)
4684                break;
4685            *((unsigned long *)q) = value;
4686            _p += SIZEOF_LONG;
4687            q += SIZEOF_LONG;
4688        }
4689        p = _p;
4690        while (p < end) {
4691            if ((unsigned char)*p & 0x80)
4692                break;
4693            *q++ = *p++;
4694        }
4695        return p - start;
4696    }
4697#endif
4698#endif
4699    while (p < end) {
4700        /* Fast path, see in STRINGLIB(utf8_decode) in stringlib/codecs.h
4701           for an explanation. */
4702        if (_Py_IS_ALIGNED(p, SIZEOF_LONG)) {
4703            /* Help allocation */
4704            const char *_p = p;
4705            while (_p < aligned_end) {
4706                unsigned long value = *(unsigned long *) _p;
4707                if (value & ASCII_CHAR_MASK)
4708                    break;
4709                _p += SIZEOF_LONG;
4710            }
4711            p = _p;
4712            if (_p == end)
4713                break;
4714        }
4715        if ((unsigned char)*p & 0x80)
4716            break;
4717        ++p;
4718    }
4719    memcpy(dest, start, p - start);
4720    return p - start;
4721}
4722
4723PyObject *
4724PyUnicode_DecodeUTF8Stateful(const char *s,
4725                             Py_ssize_t size,
4726                             const char *errors,
4727                             Py_ssize_t *consumed)
4728{
4729    _PyUnicodeWriter writer;
4730    const char *starts = s;
4731    const char *end = s + size;
4732
4733    Py_ssize_t startinpos;
4734    Py_ssize_t endinpos;
4735    const char *errmsg = "";
4736    PyObject *errorHandler = NULL;
4737    PyObject *exc = NULL;
4738
4739    if (size == 0) {
4740        if (consumed)
4741            *consumed = 0;
4742        _Py_RETURN_UNICODE_EMPTY();
4743    }
4744
4745    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4746    if (size == 1 && (unsigned char)s[0] < 128) {
4747        if (consumed)
4748            *consumed = 1;
4749        return get_latin1_char((unsigned char)s[0]);
4750    }
4751
4752    _PyUnicodeWriter_Init(&writer);
4753    writer.min_length = size;
4754    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
4755        goto onError;
4756
4757    writer.pos = ascii_decode(s, end, writer.data);
4758    s += writer.pos;
4759    while (s < end) {
4760        Py_UCS4 ch;
4761        int kind = writer.kind;
4762        if (kind == PyUnicode_1BYTE_KIND) {
4763            if (PyUnicode_IS_ASCII(writer.buffer))
4764                ch = asciilib_utf8_decode(&s, end, writer.data, &writer.pos);
4765            else
4766                ch = ucs1lib_utf8_decode(&s, end, writer.data, &writer.pos);
4767        } else if (kind == PyUnicode_2BYTE_KIND) {
4768            ch = ucs2lib_utf8_decode(&s, end, writer.data, &writer.pos);
4769        } else {
4770            assert(kind == PyUnicode_4BYTE_KIND);
4771            ch = ucs4lib_utf8_decode(&s, end, writer.data, &writer.pos);
4772        }
4773
4774        switch (ch) {
4775        case 0:
4776            if (s == end || consumed)
4777                goto End;
4778            errmsg = "unexpected end of data";
4779            startinpos = s - starts;
4780            endinpos = end - starts;
4781            break;
4782        case 1:
4783            errmsg = "invalid start byte";
4784            startinpos = s - starts;
4785            endinpos = startinpos + 1;
4786            break;
4787        case 2:
4788        case 3:
4789        case 4:
4790            errmsg = "invalid continuation byte";
4791            startinpos = s - starts;
4792            endinpos = startinpos + ch - 1;
4793            break;
4794        default:
4795            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
4796                goto onError;
4797            continue;
4798        }
4799
4800        if (unicode_decode_call_errorhandler_writer(
4801                errors, &errorHandler,
4802                "utf-8", errmsg,
4803                &starts, &end, &startinpos, &endinpos, &exc, &s,
4804                &writer))
4805            goto onError;
4806    }
4807
4808End:
4809    if (consumed)
4810        *consumed = s - starts;
4811
4812    Py_XDECREF(errorHandler);
4813    Py_XDECREF(exc);
4814    return _PyUnicodeWriter_Finish(&writer);
4815
4816onError:
4817    Py_XDECREF(errorHandler);
4818    Py_XDECREF(exc);
4819    _PyUnicodeWriter_Dealloc(&writer);
4820    return NULL;
4821}
4822
4823#ifdef __APPLE__
4824
4825/* Simplified UTF-8 decoder using surrogateescape error handler,
4826   used to decode the command line arguments on Mac OS X.
4827
4828   Return a pointer to a newly allocated wide character string (use
4829   PyMem_RawFree() to free the memory), or NULL on memory allocation error. */
4830
4831wchar_t*
4832_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
4833{
4834    const char *e;
4835    wchar_t *unicode;
4836    Py_ssize_t outpos;
4837
4838    /* Note: size will always be longer than the resulting Unicode
4839       character count */
4840    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1))
4841        return NULL;
4842    unicode = PyMem_RawMalloc((size + 1) * sizeof(wchar_t));
4843    if (!unicode)
4844        return NULL;
4845
4846    /* Unpack UTF-8 encoded data */
4847    e = s + size;
4848    outpos = 0;
4849    while (s < e) {
4850        Py_UCS4 ch;
4851#if SIZEOF_WCHAR_T == 4
4852        ch = ucs4lib_utf8_decode(&s, e, (Py_UCS4 *)unicode, &outpos);
4853#else
4854        ch = ucs2lib_utf8_decode(&s, e, (Py_UCS2 *)unicode, &outpos);
4855#endif
4856        if (ch > 0xFF) {
4857#if SIZEOF_WCHAR_T == 4
4858            assert(0);
4859#else
4860            assert(Py_UNICODE_IS_SURROGATE(ch));
4861            /*  compute and append the two surrogates: */
4862            unicode[outpos++] = (wchar_t)Py_UNICODE_HIGH_SURROGATE(ch);
4863            unicode[outpos++] = (wchar_t)Py_UNICODE_LOW_SURROGATE(ch);
4864#endif
4865        }
4866        else {
4867            if (!ch && s == e)
4868                break;
4869            /* surrogateescape */
4870            unicode[outpos++] = 0xDC00 + (unsigned char)*s++;
4871        }
4872    }
4873    unicode[outpos] = L'\0';
4874    return unicode;
4875}
4876
4877#endif /* __APPLE__ */
4878
4879/* Primary internal function which creates utf8 encoded bytes objects.
4880
4881   Allocation strategy:  if the string is short, convert into a stack buffer
4882   and allocate exactly as much space needed at the end.  Else allocate the
4883   maximum possible needed (4 result bytes per Unicode character), and return
4884   the excess memory at the end.
4885*/
4886PyObject *
4887_PyUnicode_AsUTF8String(PyObject *unicode, const char *errors)
4888{
4889    enum PyUnicode_Kind kind;
4890    void *data;
4891    Py_ssize_t size;
4892
4893    if (!PyUnicode_Check(unicode)) {
4894        PyErr_BadArgument();
4895        return NULL;
4896    }
4897
4898    if (PyUnicode_READY(unicode) == -1)
4899        return NULL;
4900
4901    if (PyUnicode_UTF8(unicode))
4902        return PyBytes_FromStringAndSize(PyUnicode_UTF8(unicode),
4903                                         PyUnicode_UTF8_LENGTH(unicode));
4904
4905    kind = PyUnicode_KIND(unicode);
4906    data = PyUnicode_DATA(unicode);
4907    size = PyUnicode_GET_LENGTH(unicode);
4908
4909    switch (kind) {
4910    default:
4911        assert(0);
4912    case PyUnicode_1BYTE_KIND:
4913        /* the string cannot be ASCII, or PyUnicode_UTF8() would be set */
4914        assert(!PyUnicode_IS_ASCII(unicode));
4915        return ucs1lib_utf8_encoder(unicode, data, size, errors);
4916    case PyUnicode_2BYTE_KIND:
4917        return ucs2lib_utf8_encoder(unicode, data, size, errors);
4918    case PyUnicode_4BYTE_KIND:
4919        return ucs4lib_utf8_encoder(unicode, data, size, errors);
4920    }
4921}
4922
4923PyObject *
4924PyUnicode_EncodeUTF8(const Py_UNICODE *s,
4925                     Py_ssize_t size,
4926                     const char *errors)
4927{
4928    PyObject *v, *unicode;
4929
4930    unicode = PyUnicode_FromUnicode(s, size);
4931    if (unicode == NULL)
4932        return NULL;
4933    v = _PyUnicode_AsUTF8String(unicode, errors);
4934    Py_DECREF(unicode);
4935    return v;
4936}
4937
4938PyObject *
4939PyUnicode_AsUTF8String(PyObject *unicode)
4940{
4941    return _PyUnicode_AsUTF8String(unicode, NULL);
4942}
4943
4944/* --- UTF-32 Codec ------------------------------------------------------- */
4945
4946PyObject *
4947PyUnicode_DecodeUTF32(const char *s,
4948                      Py_ssize_t size,
4949                      const char *errors,
4950                      int *byteorder)
4951{
4952    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
4953}
4954
4955PyObject *
4956PyUnicode_DecodeUTF32Stateful(const char *s,
4957                              Py_ssize_t size,
4958                              const char *errors,
4959                              int *byteorder,
4960                              Py_ssize_t *consumed)
4961{
4962    const char *starts = s;
4963    Py_ssize_t startinpos;
4964    Py_ssize_t endinpos;
4965    _PyUnicodeWriter writer;
4966    const unsigned char *q, *e;
4967    int le, bo = 0;       /* assume native ordering by default */
4968    const char *encoding;
4969    const char *errmsg = "";
4970    PyObject *errorHandler = NULL;
4971    PyObject *exc = NULL;
4972
4973    q = (unsigned char *)s;
4974    e = q + size;
4975
4976    if (byteorder)
4977        bo = *byteorder;
4978
4979    /* Check for BOM marks (U+FEFF) in the input and adjust current
4980       byte order setting accordingly. In native mode, the leading BOM
4981       mark is skipped, in all other modes, it is copied to the output
4982       stream as-is (giving a ZWNBSP character). */
4983    if (bo == 0 && size >= 4) {
4984        Py_UCS4 bom = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
4985        if (bom == 0x0000FEFF) {
4986            bo = -1;
4987            q += 4;
4988        }
4989        else if (bom == 0xFFFE0000) {
4990            bo = 1;
4991            q += 4;
4992        }
4993        if (byteorder)
4994            *byteorder = bo;
4995    }
4996
4997    if (q == e) {
4998        if (consumed)
4999            *consumed = size;
5000        _Py_RETURN_UNICODE_EMPTY();
5001    }
5002
5003#ifdef WORDS_BIGENDIAN
5004    le = bo < 0;
5005#else
5006    le = bo <= 0;
5007#endif
5008    encoding = le ? "utf-32-le" : "utf-32-be";
5009
5010    _PyUnicodeWriter_Init(&writer);
5011    writer.min_length = (e - q + 3) / 4;
5012    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5013        goto onError;
5014
5015    while (1) {
5016        Py_UCS4 ch = 0;
5017        Py_UCS4 maxch = PyUnicode_MAX_CHAR_VALUE(writer.buffer);
5018
5019        if (e - q >= 4) {
5020            enum PyUnicode_Kind kind = writer.kind;
5021            void *data = writer.data;
5022            const unsigned char *last = e - 4;
5023            Py_ssize_t pos = writer.pos;
5024            if (le) {
5025                do {
5026                    ch = (q[3] << 24) | (q[2] << 16) | (q[1] << 8) | q[0];
5027                    if (ch > maxch)
5028                        break;
5029                    if (kind != PyUnicode_1BYTE_KIND &&
5030                        Py_UNICODE_IS_SURROGATE(ch))
5031                        break;
5032                    PyUnicode_WRITE(kind, data, pos++, ch);
5033                    q += 4;
5034                } while (q <= last);
5035            }
5036            else {
5037                do {
5038                    ch = (q[0] << 24) | (q[1] << 16) | (q[2] << 8) | q[3];
5039                    if (ch > maxch)
5040                        break;
5041                    if (kind != PyUnicode_1BYTE_KIND &&
5042                        Py_UNICODE_IS_SURROGATE(ch))
5043                        break;
5044                    PyUnicode_WRITE(kind, data, pos++, ch);
5045                    q += 4;
5046                } while (q <= last);
5047            }
5048            writer.pos = pos;
5049        }
5050
5051        if (Py_UNICODE_IS_SURROGATE(ch)) {
5052            errmsg = "code point in surrogate code point range(0xd800, 0xe000)";
5053            startinpos = ((const char *)q) - starts;
5054            endinpos = startinpos + 4;
5055        }
5056        else if (ch <= maxch) {
5057            if (q == e || consumed)
5058                break;
5059            /* remaining bytes at the end? (size should be divisible by 4) */
5060            errmsg = "truncated data";
5061            startinpos = ((const char *)q) - starts;
5062            endinpos = ((const char *)e) - starts;
5063        }
5064        else {
5065            if (ch < 0x110000) {
5066                if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5067                    goto onError;
5068                q += 4;
5069                continue;
5070            }
5071            errmsg = "code point not in range(0x110000)";
5072            startinpos = ((const char *)q) - starts;
5073            endinpos = startinpos + 4;
5074        }
5075
5076        /* The remaining input chars are ignored if the callback
5077           chooses to skip the input */
5078        if (unicode_decode_call_errorhandler_writer(
5079                errors, &errorHandler,
5080                encoding, errmsg,
5081                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
5082                &writer))
5083            goto onError;
5084    }
5085
5086    if (consumed)
5087        *consumed = (const char *)q-starts;
5088
5089    Py_XDECREF(errorHandler);
5090    Py_XDECREF(exc);
5091    return _PyUnicodeWriter_Finish(&writer);
5092
5093  onError:
5094    _PyUnicodeWriter_Dealloc(&writer);
5095    Py_XDECREF(errorHandler);
5096    Py_XDECREF(exc);
5097    return NULL;
5098}
5099
5100PyObject *
5101_PyUnicode_EncodeUTF32(PyObject *str,
5102                       const char *errors,
5103                       int byteorder)
5104{
5105    int kind;
5106    void *data;
5107    Py_ssize_t len;
5108    PyObject *v;
5109    unsigned char *p;
5110    Py_ssize_t nsize, i;
5111    /* Offsets from p for storing byte pairs in the right order. */
5112#if PY_LITTLE_ENDIAN
5113    int iorder[] = {0, 1, 2, 3};
5114#else
5115    int iorder[] = {3, 2, 1, 0};
5116#endif
5117    const char *encoding;
5118    PyObject *errorHandler = NULL;
5119    PyObject *exc = NULL;
5120    PyObject *rep = NULL;
5121
5122#define STORECHAR(CH)                           \
5123    do {                                        \
5124        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
5125        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
5126        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
5127        p[iorder[0]] = (CH) & 0xff;             \
5128        p += 4;                                 \
5129    } while(0)
5130
5131    if (!PyUnicode_Check(str)) {
5132        PyErr_BadArgument();
5133        return NULL;
5134    }
5135    if (PyUnicode_READY(str) == -1)
5136        return NULL;
5137    kind = PyUnicode_KIND(str);
5138    data = PyUnicode_DATA(str);
5139    len = PyUnicode_GET_LENGTH(str);
5140
5141    nsize = len + (byteorder == 0);
5142    if (nsize > PY_SSIZE_T_MAX / 4)
5143        return PyErr_NoMemory();
5144    v = PyBytes_FromStringAndSize(NULL, nsize * 4);
5145    if (v == NULL)
5146        return NULL;
5147
5148    p = (unsigned char *)PyBytes_AS_STRING(v);
5149    if (byteorder == 0)
5150        STORECHAR(0xFEFF);
5151    if (len == 0)
5152        return v;
5153
5154    if (byteorder == -1) {
5155        /* force LE */
5156        iorder[0] = 0;
5157        iorder[1] = 1;
5158        iorder[2] = 2;
5159        iorder[3] = 3;
5160        encoding = "utf-32-le";
5161    }
5162    else if (byteorder == 1) {
5163        /* force BE */
5164        iorder[0] = 3;
5165        iorder[1] = 2;
5166        iorder[2] = 1;
5167        iorder[3] = 0;
5168        encoding = "utf-32-be";
5169    }
5170    else
5171        encoding = "utf-32";
5172
5173    if (kind == PyUnicode_1BYTE_KIND) {
5174        for (i = 0; i < len; i++)
5175            STORECHAR(PyUnicode_READ(kind, data, i));
5176        return v;
5177    }
5178
5179    for (i = 0; i < len;) {
5180        Py_ssize_t repsize, moreunits;
5181        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5182        i++;
5183        assert(ch <= MAX_UNICODE);
5184        if (!Py_UNICODE_IS_SURROGATE(ch)) {
5185            STORECHAR(ch);
5186            continue;
5187        }
5188
5189        rep = unicode_encode_call_errorhandler(
5190                errors, &errorHandler,
5191                encoding, "surrogates not allowed",
5192                str, &exc, i-1, i, &i);
5193
5194        if (!rep)
5195            goto error;
5196
5197        if (PyBytes_Check(rep)) {
5198            repsize = PyBytes_GET_SIZE(rep);
5199            if (repsize & 3) {
5200                raise_encode_exception(&exc, encoding,
5201                                       str, i - 1, i,
5202                                       "surrogates not allowed");
5203                goto error;
5204            }
5205            moreunits = repsize / 4;
5206        }
5207        else {
5208            assert(PyUnicode_Check(rep));
5209            if (PyUnicode_READY(rep) < 0)
5210                goto error;
5211            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5212            if (!PyUnicode_IS_ASCII(rep)) {
5213                raise_encode_exception(&exc, encoding,
5214                                       str, i - 1, i,
5215                                       "surrogates not allowed");
5216                goto error;
5217            }
5218        }
5219
5220        /* four bytes are reserved for each surrogate */
5221        if (moreunits > 1) {
5222            Py_ssize_t outpos = p - (unsigned char*) PyBytes_AS_STRING(v);
5223            Py_ssize_t morebytes = 4 * (moreunits - 1);
5224            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5225                /* integer overflow */
5226                PyErr_NoMemory();
5227                goto error;
5228            }
5229            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5230                goto error;
5231            p = (unsigned char*) PyBytes_AS_STRING(v) + outpos;
5232        }
5233
5234        if (PyBytes_Check(rep)) {
5235            Py_MEMCPY(p, PyBytes_AS_STRING(rep), repsize);
5236            p += repsize;
5237        } else /* rep is unicode */ {
5238            const Py_UCS1 *repdata;
5239            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5240            repdata = PyUnicode_1BYTE_DATA(rep);
5241            while (repsize--) {
5242                Py_UCS4 ch = *repdata++;
5243                STORECHAR(ch);
5244            }
5245        }
5246
5247        Py_CLEAR(rep);
5248    }
5249
5250    /* Cut back to size actually needed. This is necessary for, for example,
5251       encoding of a string containing isolated surrogates and the 'ignore'
5252       handler is used. */
5253    nsize = p - (unsigned char*) PyBytes_AS_STRING(v);
5254    if (nsize != PyBytes_GET_SIZE(v))
5255      _PyBytes_Resize(&v, nsize);
5256    Py_XDECREF(errorHandler);
5257    Py_XDECREF(exc);
5258    return v;
5259  error:
5260    Py_XDECREF(rep);
5261    Py_XDECREF(errorHandler);
5262    Py_XDECREF(exc);
5263    Py_XDECREF(v);
5264    return NULL;
5265#undef STORECHAR
5266}
5267
5268PyObject *
5269PyUnicode_EncodeUTF32(const Py_UNICODE *s,
5270                      Py_ssize_t size,
5271                      const char *errors,
5272                      int byteorder)
5273{
5274    PyObject *result;
5275    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5276    if (tmp == NULL)
5277        return NULL;
5278    result = _PyUnicode_EncodeUTF32(tmp, errors, byteorder);
5279    Py_DECREF(tmp);
5280    return result;
5281}
5282
5283PyObject *
5284PyUnicode_AsUTF32String(PyObject *unicode)
5285{
5286    return _PyUnicode_EncodeUTF32(unicode, NULL, 0);
5287}
5288
5289/* --- UTF-16 Codec ------------------------------------------------------- */
5290
5291PyObject *
5292PyUnicode_DecodeUTF16(const char *s,
5293                      Py_ssize_t size,
5294                      const char *errors,
5295                      int *byteorder)
5296{
5297    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
5298}
5299
5300PyObject *
5301PyUnicode_DecodeUTF16Stateful(const char *s,
5302                              Py_ssize_t size,
5303                              const char *errors,
5304                              int *byteorder,
5305                              Py_ssize_t *consumed)
5306{
5307    const char *starts = s;
5308    Py_ssize_t startinpos;
5309    Py_ssize_t endinpos;
5310    _PyUnicodeWriter writer;
5311    const unsigned char *q, *e;
5312    int bo = 0;       /* assume native ordering by default */
5313    int native_ordering;
5314    const char *errmsg = "";
5315    PyObject *errorHandler = NULL;
5316    PyObject *exc = NULL;
5317    const char *encoding;
5318
5319    q = (unsigned char *)s;
5320    e = q + size;
5321
5322    if (byteorder)
5323        bo = *byteorder;
5324
5325    /* Check for BOM marks (U+FEFF) in the input and adjust current
5326       byte order setting accordingly. In native mode, the leading BOM
5327       mark is skipped, in all other modes, it is copied to the output
5328       stream as-is (giving a ZWNBSP character). */
5329    if (bo == 0 && size >= 2) {
5330        const Py_UCS4 bom = (q[1] << 8) | q[0];
5331        if (bom == 0xFEFF) {
5332            q += 2;
5333            bo = -1;
5334        }
5335        else if (bom == 0xFFFE) {
5336            q += 2;
5337            bo = 1;
5338        }
5339        if (byteorder)
5340            *byteorder = bo;
5341    }
5342
5343    if (q == e) {
5344        if (consumed)
5345            *consumed = size;
5346        _Py_RETURN_UNICODE_EMPTY();
5347    }
5348
5349#if PY_LITTLE_ENDIAN
5350    native_ordering = bo <= 0;
5351    encoding = bo <= 0 ? "utf-16-le" : "utf-16-be";
5352#else
5353    native_ordering = bo >= 0;
5354    encoding = bo >= 0 ? "utf-16-be" : "utf-16-le";
5355#endif
5356
5357    /* Note: size will always be longer than the resulting Unicode
5358       character count */
5359    _PyUnicodeWriter_Init(&writer);
5360    writer.min_length = (e - q + 1) / 2;
5361    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
5362        goto onError;
5363
5364    while (1) {
5365        Py_UCS4 ch = 0;
5366        if (e - q >= 2) {
5367            int kind = writer.kind;
5368            if (kind == PyUnicode_1BYTE_KIND) {
5369                if (PyUnicode_IS_ASCII(writer.buffer))
5370                    ch = asciilib_utf16_decode(&q, e,
5371                            (Py_UCS1*)writer.data, &writer.pos,
5372                            native_ordering);
5373                else
5374                    ch = ucs1lib_utf16_decode(&q, e,
5375                            (Py_UCS1*)writer.data, &writer.pos,
5376                            native_ordering);
5377            } else if (kind == PyUnicode_2BYTE_KIND) {
5378                ch = ucs2lib_utf16_decode(&q, e,
5379                        (Py_UCS2*)writer.data, &writer.pos,
5380                        native_ordering);
5381            } else {
5382                assert(kind == PyUnicode_4BYTE_KIND);
5383                ch = ucs4lib_utf16_decode(&q, e,
5384                        (Py_UCS4*)writer.data, &writer.pos,
5385                        native_ordering);
5386            }
5387        }
5388
5389        switch (ch)
5390        {
5391        case 0:
5392            /* remaining byte at the end? (size should be even) */
5393            if (q == e || consumed)
5394                goto End;
5395            errmsg = "truncated data";
5396            startinpos = ((const char *)q) - starts;
5397            endinpos = ((const char *)e) - starts;
5398            break;
5399            /* The remaining input chars are ignored if the callback
5400               chooses to skip the input */
5401        case 1:
5402            q -= 2;
5403            if (consumed)
5404                goto End;
5405            errmsg = "unexpected end of data";
5406            startinpos = ((const char *)q) - starts;
5407            endinpos = ((const char *)e) - starts;
5408            break;
5409        case 2:
5410            errmsg = "illegal encoding";
5411            startinpos = ((const char *)q) - 2 - starts;
5412            endinpos = startinpos + 2;
5413            break;
5414        case 3:
5415            errmsg = "illegal UTF-16 surrogate";
5416            startinpos = ((const char *)q) - 4 - starts;
5417            endinpos = startinpos + 2;
5418            break;
5419        default:
5420            if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
5421                goto onError;
5422            continue;
5423        }
5424
5425        if (unicode_decode_call_errorhandler_writer(
5426                errors,
5427                &errorHandler,
5428                encoding, errmsg,
5429                &starts,
5430                (const char **)&e,
5431                &startinpos,
5432                &endinpos,
5433                &exc,
5434                (const char **)&q,
5435                &writer))
5436            goto onError;
5437    }
5438
5439End:
5440    if (consumed)
5441        *consumed = (const char *)q-starts;
5442
5443    Py_XDECREF(errorHandler);
5444    Py_XDECREF(exc);
5445    return _PyUnicodeWriter_Finish(&writer);
5446
5447  onError:
5448    _PyUnicodeWriter_Dealloc(&writer);
5449    Py_XDECREF(errorHandler);
5450    Py_XDECREF(exc);
5451    return NULL;
5452}
5453
5454PyObject *
5455_PyUnicode_EncodeUTF16(PyObject *str,
5456                       const char *errors,
5457                       int byteorder)
5458{
5459    enum PyUnicode_Kind kind;
5460    const void *data;
5461    Py_ssize_t len;
5462    PyObject *v;
5463    unsigned short *out;
5464    Py_ssize_t pairs;
5465#if PY_BIG_ENDIAN
5466    int native_ordering = byteorder >= 0;
5467#else
5468    int native_ordering = byteorder <= 0;
5469#endif
5470    const char *encoding;
5471    Py_ssize_t nsize, pos;
5472    PyObject *errorHandler = NULL;
5473    PyObject *exc = NULL;
5474    PyObject *rep = NULL;
5475
5476    if (!PyUnicode_Check(str)) {
5477        PyErr_BadArgument();
5478        return NULL;
5479    }
5480    if (PyUnicode_READY(str) == -1)
5481        return NULL;
5482    kind = PyUnicode_KIND(str);
5483    data = PyUnicode_DATA(str);
5484    len = PyUnicode_GET_LENGTH(str);
5485
5486    pairs = 0;
5487    if (kind == PyUnicode_4BYTE_KIND) {
5488        const Py_UCS4 *in = (const Py_UCS4 *)data;
5489        const Py_UCS4 *end = in + len;
5490        while (in < end)
5491            if (*in++ >= 0x10000)
5492                pairs++;
5493    }
5494    if (len > PY_SSIZE_T_MAX / 2 - pairs - (byteorder == 0))
5495        return PyErr_NoMemory();
5496    nsize = len + pairs + (byteorder == 0);
5497    v = PyBytes_FromStringAndSize(NULL, nsize * 2);
5498    if (v == NULL)
5499        return NULL;
5500
5501    /* output buffer is 2-bytes aligned */
5502    assert(_Py_IS_ALIGNED(PyBytes_AS_STRING(v), 2));
5503    out = (unsigned short *)PyBytes_AS_STRING(v);
5504    if (byteorder == 0)
5505        *out++ = 0xFEFF;
5506    if (len == 0)
5507        goto done;
5508
5509    if (kind == PyUnicode_1BYTE_KIND) {
5510        ucs1lib_utf16_encode((const Py_UCS1 *)data, len, &out, native_ordering);
5511        goto done;
5512    }
5513
5514    if (byteorder < 0)
5515        encoding = "utf-16-le";
5516    else if (byteorder > 0)
5517        encoding = "utf-16-be";
5518    else
5519        encoding = "utf-16";
5520
5521    pos = 0;
5522    while (pos < len) {
5523        Py_ssize_t repsize, moreunits;
5524
5525        if (kind == PyUnicode_2BYTE_KIND) {
5526            pos += ucs2lib_utf16_encode((const Py_UCS2 *)data + pos, len - pos,
5527                                        &out, native_ordering);
5528        }
5529        else {
5530            assert(kind == PyUnicode_4BYTE_KIND);
5531            pos += ucs4lib_utf16_encode((const Py_UCS4 *)data + pos, len - pos,
5532                                        &out, native_ordering);
5533        }
5534        if (pos == len)
5535            break;
5536
5537        rep = unicode_encode_call_errorhandler(
5538                errors, &errorHandler,
5539                encoding, "surrogates not allowed",
5540                str, &exc, pos, pos + 1, &pos);
5541        if (!rep)
5542            goto error;
5543
5544        if (PyBytes_Check(rep)) {
5545            repsize = PyBytes_GET_SIZE(rep);
5546            if (repsize & 1) {
5547                raise_encode_exception(&exc, encoding,
5548                                       str, pos - 1, pos,
5549                                       "surrogates not allowed");
5550                goto error;
5551            }
5552            moreunits = repsize / 2;
5553        }
5554        else {
5555            assert(PyUnicode_Check(rep));
5556            if (PyUnicode_READY(rep) < 0)
5557                goto error;
5558            moreunits = repsize = PyUnicode_GET_LENGTH(rep);
5559            if (!PyUnicode_IS_ASCII(rep)) {
5560                raise_encode_exception(&exc, encoding,
5561                                       str, pos - 1, pos,
5562                                       "surrogates not allowed");
5563                goto error;
5564            }
5565        }
5566
5567        /* two bytes are reserved for each surrogate */
5568        if (moreunits > 1) {
5569            Py_ssize_t outpos = out - (unsigned short*) PyBytes_AS_STRING(v);
5570            Py_ssize_t morebytes = 2 * (moreunits - 1);
5571            if (PyBytes_GET_SIZE(v) > PY_SSIZE_T_MAX - morebytes) {
5572                /* integer overflow */
5573                PyErr_NoMemory();
5574                goto error;
5575            }
5576            if (_PyBytes_Resize(&v, PyBytes_GET_SIZE(v) + morebytes) < 0)
5577                goto error;
5578            out = (unsigned short*) PyBytes_AS_STRING(v) + outpos;
5579        }
5580
5581        if (PyBytes_Check(rep)) {
5582            Py_MEMCPY(out, PyBytes_AS_STRING(rep), repsize);
5583            out += moreunits;
5584        } else /* rep is unicode */ {
5585            assert(PyUnicode_KIND(rep) == PyUnicode_1BYTE_KIND);
5586            ucs1lib_utf16_encode(PyUnicode_1BYTE_DATA(rep), repsize,
5587                                 &out, native_ordering);
5588        }
5589
5590        Py_CLEAR(rep);
5591    }
5592
5593    /* Cut back to size actually needed. This is necessary for, for example,
5594    encoding of a string containing isolated surrogates and the 'ignore' handler
5595    is used. */
5596    nsize = (unsigned char*) out - (unsigned char*) PyBytes_AS_STRING(v);
5597    if (nsize != PyBytes_GET_SIZE(v))
5598      _PyBytes_Resize(&v, nsize);
5599    Py_XDECREF(errorHandler);
5600    Py_XDECREF(exc);
5601  done:
5602    return v;
5603  error:
5604    Py_XDECREF(rep);
5605    Py_XDECREF(errorHandler);
5606    Py_XDECREF(exc);
5607    Py_XDECREF(v);
5608    return NULL;
5609#undef STORECHAR
5610}
5611
5612PyObject *
5613PyUnicode_EncodeUTF16(const Py_UNICODE *s,
5614                      Py_ssize_t size,
5615                      const char *errors,
5616                      int byteorder)
5617{
5618    PyObject *result;
5619    PyObject *tmp = PyUnicode_FromUnicode(s, size);
5620    if (tmp == NULL)
5621        return NULL;
5622    result = _PyUnicode_EncodeUTF16(tmp, errors, byteorder);
5623    Py_DECREF(tmp);
5624    return result;
5625}
5626
5627PyObject *
5628PyUnicode_AsUTF16String(PyObject *unicode)
5629{
5630    return _PyUnicode_EncodeUTF16(unicode, NULL, 0);
5631}
5632
5633/* --- Unicode Escape Codec ----------------------------------------------- */
5634
/* Helper function for PyUnicode_DecodeUnicodeEscape, determines
   if all the escapes in the string make it still a valid ASCII string.

   Returns the number of characters the decoded string will have, or -1
   when that cannot be told cheaply:
     - size is negative,
     - the input contains a byte above 127,
     - a backslash is the last byte of the input,
     - an octal, \x, \u, \U or \N escape is present (these do not
       guarantee ASCII characters).
   An unrecognised escape counts as two characters (the backslash and the
   character after it); backslash-newline counts as none.
   */
static Py_ssize_t
length_of_escaped_ascii_string(const char *s, Py_ssize_t size)
{
    const unsigned char *p;
    const unsigned char *end;
    Py_ssize_t length = 0;

    /* Reject a negative size before it is used in pointer arithmetic. */
    if (size < 0)
        return -1;

    p = (const unsigned char *)s;
    end = p + size;

    while (p < end) {
        unsigned char c = *p++;

        if (c > 127) {
            /* Non-ASCII */
            return -1;
        }
        if (c != '\\') {
            /* Normal character */
            ++length;
            continue;
        }

        /* Backslash-escape, check next char.  The escape sequence must not
           reach the end of the string or be followed by non-ASCII. */
        if (p >= end)
            return -1;
        c = *p++;
        if (c > 127)
            return -1;

        switch (c) {
        case '\n':
            /* backslash + \n result in zero characters */
            break;
        case '\\': case '\'': case '\"':
        case 'b': case 'f': case 't':
        case 'n': case 'r': case 'v': case 'a':
            ++length;
            break;
        case '0': case '1': case '2': case '3':
        case '4': case '5': case '6': case '7':
        case 'x': case 'u': case 'U': case 'N':
            /* these do not guarantee ASCII characters */
            return -1;
        default:
            /* count the backslash + the other character */
            length += 2;
            break;
        }
    }
    return length;
}
5689
5690static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
5691
5692PyObject *
5693PyUnicode_DecodeUnicodeEscape(const char *s,
5694                              Py_ssize_t size,
5695                              const char *errors)
5696{
5697    const char *starts = s;
5698    Py_ssize_t startinpos;
5699    Py_ssize_t endinpos;
5700    _PyUnicodeWriter writer;
5701    const char *end;
5702    char* message;
5703    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
5704    PyObject *errorHandler = NULL;
5705    PyObject *exc = NULL;
5706    Py_ssize_t len;
5707
5708    len = length_of_escaped_ascii_string(s, size);
5709    if (len == 0)
5710        _Py_RETURN_UNICODE_EMPTY();
5711
5712    /* After length_of_escaped_ascii_string() there are two alternatives,
5713       either the string is pure ASCII with named escapes like \n, etc.
5714       and we determined it's exact size (common case)
5715       or it contains \x, \u, ... escape sequences.  then we create a
5716       legacy wchar string and resize it at the end of this function. */
5717    _PyUnicodeWriter_Init(&writer);
5718    if (len > 0) {
5719        writer.min_length = len;
5720    }
5721    else {
5722        /* Escaped strings will always be longer than the resulting
5723           Unicode string, so we start with size here and then reduce the
5724           length after conversion to the true value.
5725           (but if the error callback returns a long replacement string
5726           we'll have to allocate more space) */
5727        writer.min_length = size;
5728    }
5729
5730    if (size == 0)
5731        return _PyUnicodeWriter_Finish(&writer);
5732    end = s + size;
5733
5734    while (s < end) {
5735        unsigned char c;
5736        Py_UCS4 x;
5737        int digits;
5738
5739        /* Non-escape characters are interpreted as Unicode ordinals */
5740        if (*s != '\\') {
5741            x = (unsigned char)*s;
5742            s++;
5743            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
5744                goto onError;
5745            continue;
5746        }
5747
5748        startinpos = s-starts;
5749        /* \ - Escapes */
5750        s++;
5751        c = *s++;
5752        if (s > end)
5753            c = '\0'; /* Invalid after \ */
5754
5755        switch (c) {
5756
5757            /* \x escapes */
5758#define WRITECHAR(ch)                                                      \
5759            do {                                                           \
5760                if (_PyUnicodeWriter_WriteCharInline(&writer, (ch)) < 0)    \
5761                    goto onError;                                          \
5762            } while(0)
5763
5764        case '\n': break;
5765        case '\\': WRITECHAR('\\'); break;
5766        case '\'': WRITECHAR('\''); break;
5767        case '\"': WRITECHAR('\"'); break;
5768        case 'b': WRITECHAR('\b'); break;
5769        /* FF */
5770        case 'f': WRITECHAR('\014'); break;
5771        case 't': WRITECHAR('\t'); break;
5772        case 'n': WRITECHAR('\n'); break;
5773        case 'r': WRITECHAR('\r'); break;
5774        /* VT */
5775        case 'v': WRITECHAR('\013'); break;
5776        /* BEL, not classic C */
5777        case 'a': WRITECHAR('\007'); break;
5778
5779            /* \OOO (octal) escapes */
5780        case '0': case '1': case '2': case '3':
5781        case '4': case '5': case '6': case '7':
5782            x = s[-1] - '0';
5783            if (s < end && '0' <= *s && *s <= '7') {
5784                x = (x<<3) + *s++ - '0';
5785                if (s < end && '0' <= *s && *s <= '7')
5786                    x = (x<<3) + *s++ - '0';
5787            }
5788            WRITECHAR(x);
5789            break;
5790
5791            /* hex escapes */
5792            /* \xXX */
5793        case 'x':
5794            digits = 2;
5795            message = "truncated \\xXX escape";
5796            goto hexescape;
5797
5798            /* \uXXXX */
5799        case 'u':
5800            digits = 4;
5801            message = "truncated \\uXXXX escape";
5802            goto hexescape;
5803
5804            /* \UXXXXXXXX */
5805        case 'U':
5806            digits = 8;
5807            message = "truncated \\UXXXXXXXX escape";
5808        hexescape:
5809            chr = 0;
5810            if (end - s < digits) {
5811                /* count only hex digits */
5812                for (; s < end; ++s) {
5813                    c = (unsigned char)*s;
5814                    if (!Py_ISXDIGIT(c))
5815                        goto error;
5816                }
5817                goto error;
5818            }
5819            for (; digits--; ++s) {
5820                c = (unsigned char)*s;
5821                if (!Py_ISXDIGIT(c))
5822                    goto error;
5823                chr = (chr<<4) & ~0xF;
5824                if (c >= '0' && c <= '9')
5825                    chr += c - '0';
5826                else if (c >= 'a' && c <= 'f')
5827                    chr += 10 + c - 'a';
5828                else
5829                    chr += 10 + c - 'A';
5830            }
5831            if (chr == 0xffffffff && PyErr_Occurred())
5832                /* _decoding_error will have already written into the
5833                   target buffer. */
5834                break;
5835        store:
5836            /* when we get here, chr is a 32-bit unicode character */
5837            message = "illegal Unicode character";
5838            if (chr > MAX_UNICODE)
5839                goto error;
5840            WRITECHAR(chr);
5841            break;
5842
5843            /* \N{name} */
5844        case 'N':
5845            message = "malformed \\N character escape";
5846            if (ucnhash_CAPI == NULL) {
5847                /* load the unicode data module */
5848                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(
5849                                                PyUnicodeData_CAPSULE_NAME, 1);
5850                if (ucnhash_CAPI == NULL)
5851                    goto ucnhashError;
5852            }
5853            if (*s == '{') {
5854                const char *start = s+1;
5855                /* look for the closing brace */
5856                while (*s != '}' && s < end)
5857                    s++;
5858                if (s > start && s < end && *s == '}') {
5859                    /* found a name.  look it up in the unicode database */
5860                    message = "unknown Unicode character name";
5861                    s++;
5862                    if (s - start - 1 <= INT_MAX &&
5863                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1),
5864                                              &chr, 0))
5865                        goto store;
5866                }
5867            }
5868            goto error;
5869
5870        default:
5871            if (s > end) {
5872                message = "\\ at end of string";
5873                s--;
5874                goto error;
5875            }
5876            else {
5877                WRITECHAR('\\');
5878                WRITECHAR((unsigned char)s[-1]);
5879            }
5880            break;
5881        }
5882        continue;
5883
5884      error:
5885        endinpos = s-starts;
5886        if (unicode_decode_call_errorhandler_writer(
5887                errors, &errorHandler,
5888                "unicodeescape", message,
5889                &starts, &end, &startinpos, &endinpos, &exc, &s,
5890                &writer))
5891            goto onError;
5892        continue;
5893    }
5894#undef WRITECHAR
5895
5896    Py_XDECREF(errorHandler);
5897    Py_XDECREF(exc);
5898    return _PyUnicodeWriter_Finish(&writer);
5899
5900  ucnhashError:
5901    PyErr_SetString(
5902        PyExc_UnicodeError,
5903        "\\N escapes not supported (can't load unicodedata module)"
5904        );
5905    _PyUnicodeWriter_Dealloc(&writer);
5906    Py_XDECREF(errorHandler);
5907    Py_XDECREF(exc);
5908    return NULL;
5909
5910  onError:
5911    _PyUnicodeWriter_Dealloc(&writer);
5912    Py_XDECREF(errorHandler);
5913    Py_XDECREF(exc);
5914    return NULL;
5915}
5916
5917/* Return a Unicode-Escape string version of the Unicode object.
5918
5919   If quotes is true, the string is enclosed in u"" or u'' quotes as
5920   appropriate.
5921
5922*/
5923
5924PyObject *
5925PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
5926{
5927    Py_ssize_t i, len;
5928    PyObject *repr;
5929    char *p;
5930    int kind;
5931    void *data;
5932    Py_ssize_t expandsize = 0;
5933
5934    /* Initial allocation is based on the longest-possible character
5935       escape.
5936
5937       For UCS1 strings it's '\xxx', 4 bytes per source character.
5938       For UCS2 strings it's '\uxxxx', 6 bytes per source character.
5939       For UCS4 strings it's '\U00xxxxxx', 10 bytes per source character.
5940    */
5941
5942    if (!PyUnicode_Check(unicode)) {
5943        PyErr_BadArgument();
5944        return NULL;
5945    }
5946    if (PyUnicode_READY(unicode) == -1)
5947        return NULL;
5948    len = PyUnicode_GET_LENGTH(unicode);
5949    kind = PyUnicode_KIND(unicode);
5950    data = PyUnicode_DATA(unicode);
5951    switch (kind) {
5952    case PyUnicode_1BYTE_KIND: expandsize = 4; break;
5953    case PyUnicode_2BYTE_KIND: expandsize = 6; break;
5954    case PyUnicode_4BYTE_KIND: expandsize = 10; break;
5955    }
5956
5957    if (len == 0)
5958        return PyBytes_FromStringAndSize(NULL, 0);
5959
5960    if (len > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
5961        return PyErr_NoMemory();
5962
5963    repr = PyBytes_FromStringAndSize(NULL,
5964                                     2
5965                                     + expandsize*len
5966                                     + 1);
5967    if (repr == NULL)
5968        return NULL;
5969
5970    p = PyBytes_AS_STRING(repr);
5971
5972    for (i = 0; i < len; i++) {
5973        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
5974
5975        /* Escape backslashes */
5976        if (ch == '\\') {
5977            *p++ = '\\';
5978            *p++ = (char) ch;
5979            continue;
5980        }
5981
5982        /* Map 21-bit characters to '\U00xxxxxx' */
5983        else if (ch >= 0x10000) {
5984            assert(ch <= MAX_UNICODE);
5985            *p++ = '\\';
5986            *p++ = 'U';
5987            *p++ = Py_hexdigits[(ch >> 28) & 0x0000000F];
5988            *p++ = Py_hexdigits[(ch >> 24) & 0x0000000F];
5989            *p++ = Py_hexdigits[(ch >> 20) & 0x0000000F];
5990            *p++ = Py_hexdigits[(ch >> 16) & 0x0000000F];
5991            *p++ = Py_hexdigits[(ch >> 12) & 0x0000000F];
5992            *p++ = Py_hexdigits[(ch >> 8) & 0x0000000F];
5993            *p++ = Py_hexdigits[(ch >> 4) & 0x0000000F];
5994            *p++ = Py_hexdigits[ch & 0x0000000F];
5995            continue;
5996        }
5997
5998        /* Map 16-bit characters to '\uxxxx' */
5999        if (ch >= 256) {
6000            *p++ = '\\';
6001            *p++ = 'u';
6002            *p++ = Py_hexdigits[(ch >> 12) & 0x000F];
6003            *p++ = Py_hexdigits[(ch >> 8) & 0x000F];
6004            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6005            *p++ = Py_hexdigits[ch & 0x000F];
6006        }
6007
6008        /* Map special whitespace to '\t', \n', '\r' */
6009        else if (ch == '\t') {
6010            *p++ = '\\';
6011            *p++ = 't';
6012        }
6013        else if (ch == '\n') {
6014            *p++ = '\\';
6015            *p++ = 'n';
6016        }
6017        else if (ch == '\r') {
6018            *p++ = '\\';
6019            *p++ = 'r';
6020        }
6021
6022        /* Map non-printable US ASCII to '\xhh' */
6023        else if (ch < ' ' || ch >= 0x7F) {
6024            *p++ = '\\';
6025            *p++ = 'x';
6026            *p++ = Py_hexdigits[(ch >> 4) & 0x000F];
6027            *p++ = Py_hexdigits[ch & 0x000F];
6028        }
6029
6030        /* Copy everything else as-is */
6031        else
6032            *p++ = (char) ch;
6033    }
6034
6035    assert(p - PyBytes_AS_STRING(repr) > 0);
6036    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
6037        return NULL;
6038    return repr;
6039}
6040
6041PyObject *
6042PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
6043                              Py_ssize_t size)
6044{
6045    PyObject *result;
6046    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6047    if (tmp == NULL)
6048        return NULL;
6049    result = PyUnicode_AsUnicodeEscapeString(tmp);
6050    Py_DECREF(tmp);
6051    return result;
6052}
6053
6054/* --- Raw Unicode Escape Codec ------------------------------------------- */
6055
6056PyObject *
6057PyUnicode_DecodeRawUnicodeEscape(const char *s,
6058                                 Py_ssize_t size,
6059                                 const char *errors)
6060{
6061    const char *starts = s;
6062    Py_ssize_t startinpos;
6063    Py_ssize_t endinpos;
6064    _PyUnicodeWriter writer;
6065    const char *end;
6066    const char *bs;
6067    PyObject *errorHandler = NULL;
6068    PyObject *exc = NULL;
6069
6070    if (size == 0)
6071        _Py_RETURN_UNICODE_EMPTY();
6072
6073    /* Escaped strings will always be longer than the resulting
6074       Unicode string, so we start with size here and then reduce the
6075       length after conversion to the true value. (But decoding error
6076       handler might have to resize the string) */
6077    _PyUnicodeWriter_Init(&writer);
6078    writer.min_length = size;
6079
6080    end = s + size;
6081    while (s < end) {
6082        unsigned char c;
6083        Py_UCS4 x;
6084        int i;
6085        int count;
6086
6087        /* Non-escape characters are interpreted as Unicode ordinals */
6088        if (*s != '\\') {
6089            x = (unsigned char)*s++;
6090            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6091                goto onError;
6092            continue;
6093        }
6094        startinpos = s-starts;
6095
6096        /* \u-escapes are only interpreted iff the number of leading
6097           backslashes if odd */
6098        bs = s;
6099        for (;s < end;) {
6100            if (*s != '\\')
6101                break;
6102            x = (unsigned char)*s++;
6103            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6104                goto onError;
6105        }
6106        if (((s - bs) & 1) == 0 ||
6107            s >= end ||
6108            (*s != 'u' && *s != 'U')) {
6109            continue;
6110        }
6111        writer.pos--;
6112        count = *s=='u' ? 4 : 8;
6113        s++;
6114
6115        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
6116        for (x = 0, i = 0; i < count; ++i, ++s) {
6117            c = (unsigned char)*s;
6118            if (!Py_ISXDIGIT(c)) {
6119                endinpos = s-starts;
6120                if (unicode_decode_call_errorhandler_writer(
6121                        errors, &errorHandler,
6122                        "rawunicodeescape", "truncated \\uXXXX",
6123                        &starts, &end, &startinpos, &endinpos, &exc, &s,
6124                        &writer))
6125                    goto onError;
6126                goto nextByte;
6127            }
6128            x = (x<<4) & ~0xF;
6129            if (c >= '0' && c <= '9')
6130                x += c - '0';
6131            else if (c >= 'a' && c <= 'f')
6132                x += 10 + c - 'a';
6133            else
6134                x += 10 + c - 'A';
6135        }
6136        if (x <= MAX_UNICODE) {
6137            if (_PyUnicodeWriter_WriteCharInline(&writer, x) < 0)
6138                goto onError;
6139        }
6140        else {
6141            endinpos = s-starts;
6142            if (unicode_decode_call_errorhandler_writer(
6143                    errors, &errorHandler,
6144                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
6145                    &starts, &end, &startinpos, &endinpos, &exc, &s,
6146                    &writer))
6147                goto onError;
6148        }
6149      nextByte:
6150        ;
6151    }
6152    Py_XDECREF(errorHandler);
6153    Py_XDECREF(exc);
6154    return _PyUnicodeWriter_Finish(&writer);
6155
6156  onError:
6157    _PyUnicodeWriter_Dealloc(&writer);
6158    Py_XDECREF(errorHandler);
6159    Py_XDECREF(exc);
6160    return NULL;
6161}
6162
6163
6164PyObject *
6165PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
6166{
6167    PyObject *repr;
6168    char *p;
6169    char *q;
6170    Py_ssize_t expandsize, pos;
6171    int kind;
6172    void *data;
6173    Py_ssize_t len;
6174
6175    if (!PyUnicode_Check(unicode)) {
6176        PyErr_BadArgument();
6177        return NULL;
6178    }
6179    if (PyUnicode_READY(unicode) == -1)
6180        return NULL;
6181    kind = PyUnicode_KIND(unicode);
6182    data = PyUnicode_DATA(unicode);
6183    len = PyUnicode_GET_LENGTH(unicode);
6184    /* 4 byte characters can take up 10 bytes, 2 byte characters can take up 6
6185       bytes, and 1 byte characters 4. */
6186    expandsize = kind * 2 + 2;
6187
6188    if (len > PY_SSIZE_T_MAX / expandsize)
6189        return PyErr_NoMemory();
6190
6191    repr = PyBytes_FromStringAndSize(NULL, expandsize * len);
6192    if (repr == NULL)
6193        return NULL;
6194    if (len == 0)
6195        return repr;
6196
6197    p = q = PyBytes_AS_STRING(repr);
6198    for (pos = 0; pos < len; pos++) {
6199        Py_UCS4 ch = PyUnicode_READ(kind, data, pos);
6200        /* Map 32-bit characters to '\Uxxxxxxxx' */
6201        if (ch >= 0x10000) {
6202            assert(ch <= MAX_UNICODE);
6203            *p++ = '\\';
6204            *p++ = 'U';
6205            *p++ = Py_hexdigits[(ch >> 28) & 0xf];
6206            *p++ = Py_hexdigits[(ch >> 24) & 0xf];
6207            *p++ = Py_hexdigits[(ch >> 20) & 0xf];
6208            *p++ = Py_hexdigits[(ch >> 16) & 0xf];
6209            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6210            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6211            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6212            *p++ = Py_hexdigits[ch & 15];
6213        }
6214        /* Map 16-bit characters to '\uxxxx' */
6215        else if (ch >= 256) {
6216            *p++ = '\\';
6217            *p++ = 'u';
6218            *p++ = Py_hexdigits[(ch >> 12) & 0xf];
6219            *p++ = Py_hexdigits[(ch >> 8) & 0xf];
6220            *p++ = Py_hexdigits[(ch >> 4) & 0xf];
6221            *p++ = Py_hexdigits[ch & 15];
6222        }
6223        /* Copy everything else as-is */
6224        else
6225            *p++ = (char) ch;
6226    }
6227
6228    assert(p > q);
6229    if (_PyBytes_Resize(&repr, p - q) < 0)
6230        return NULL;
6231    return repr;
6232}
6233
6234PyObject *
6235PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
6236                                 Py_ssize_t size)
6237{
6238    PyObject *result;
6239    PyObject *tmp = PyUnicode_FromUnicode(s, size);
6240    if (tmp == NULL)
6241        return NULL;
6242    result = PyUnicode_AsRawUnicodeEscapeString(tmp);
6243    Py_DECREF(tmp);
6244    return result;
6245}
6246
6247/* --- Unicode Internal Codec ------------------------------------------- */
6248
6249PyObject *
6250_PyUnicode_DecodeUnicodeInternal(const char *s,
6251                                 Py_ssize_t size,
6252                                 const char *errors)
6253{
6254    const char *starts = s;
6255    Py_ssize_t startinpos;
6256    Py_ssize_t endinpos;
6257    _PyUnicodeWriter writer;
6258    const char *end;
6259    const char *reason;
6260    PyObject *errorHandler = NULL;
6261    PyObject *exc = NULL;
6262
6263    if (PyErr_WarnEx(PyExc_DeprecationWarning,
6264                     "unicode_internal codec has been deprecated",
6265                     1))
6266        return NULL;
6267
6268    if (size == 0)
6269        _Py_RETURN_UNICODE_EMPTY();
6270
6271    _PyUnicodeWriter_Init(&writer);
6272    if (size / Py_UNICODE_SIZE > PY_SSIZE_T_MAX - 1) {
6273        PyErr_NoMemory();
6274        goto onError;
6275    }
6276    writer.min_length = (size + (Py_UNICODE_SIZE - 1)) / Py_UNICODE_SIZE;
6277
6278    end = s + size;
6279    while (s < end) {
6280        Py_UNICODE uch;
6281        Py_UCS4 ch;
6282        if (end - s < Py_UNICODE_SIZE) {
6283            endinpos = end-starts;
6284            reason = "truncated input";
6285            goto error;
6286        }
6287        /* We copy the raw representation one byte at a time because the
6288           pointer may be unaligned (see test_codeccallbacks). */
6289        ((char *) &uch)[0] = s[0];
6290        ((char *) &uch)[1] = s[1];
6291#ifdef Py_UNICODE_WIDE
6292        ((char *) &uch)[2] = s[2];
6293        ((char *) &uch)[3] = s[3];
6294#endif
6295        ch = uch;
6296#ifdef Py_UNICODE_WIDE
6297        /* We have to sanity check the raw data, otherwise doom looms for
6298           some malformed UCS-4 data. */
6299        if (ch > 0x10ffff) {
6300            endinpos = s - starts + Py_UNICODE_SIZE;
6301            reason = "illegal code point (> 0x10FFFF)";
6302            goto error;
6303        }
6304#endif
6305        s += Py_UNICODE_SIZE;
6306#ifndef Py_UNICODE_WIDE
6307        if (Py_UNICODE_IS_HIGH_SURROGATE(ch) && end - s >= Py_UNICODE_SIZE)
6308        {
6309            Py_UNICODE uch2;
6310            ((char *) &uch2)[0] = s[0];
6311            ((char *) &uch2)[1] = s[1];
6312            if (Py_UNICODE_IS_LOW_SURROGATE(uch2))
6313            {
6314                ch = Py_UNICODE_JOIN_SURROGATES(uch, uch2);
6315                s += Py_UNICODE_SIZE;
6316            }
6317        }
6318#endif
6319
6320        if (_PyUnicodeWriter_WriteCharInline(&writer, ch) < 0)
6321            goto onError;
6322        continue;
6323
6324  error:
6325        startinpos = s - starts;
6326        if (unicode_decode_call_errorhandler_writer(
6327                errors, &errorHandler,
6328                "unicode_internal", reason,
6329                &starts, &end, &startinpos, &endinpos, &exc, &s,
6330                &writer))
6331            goto onError;
6332    }
6333
6334    Py_XDECREF(errorHandler);
6335    Py_XDECREF(exc);
6336    return _PyUnicodeWriter_Finish(&writer);
6337
6338  onError:
6339    _PyUnicodeWriter_Dealloc(&writer);
6340    Py_XDECREF(errorHandler);
6341    Py_XDECREF(exc);
6342    return NULL;
6343}
6344
6345/* --- Latin-1 Codec ------------------------------------------------------ */
6346
6347PyObject *
6348PyUnicode_DecodeLatin1(const char *s,
6349                       Py_ssize_t size,
6350                       const char *errors)
6351{
6352    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
6353    return _PyUnicode_FromUCS1((unsigned char*)s, size);
6354}
6355
6356/* create or adjust a UnicodeEncodeError */
6357static void
6358make_encode_exception(PyObject **exceptionObject,
6359                      const char *encoding,
6360                      PyObject *unicode,
6361                      Py_ssize_t startpos, Py_ssize_t endpos,
6362                      const char *reason)
6363{
6364    if (*exceptionObject == NULL) {
6365        *exceptionObject = PyObject_CallFunction(
6366            PyExc_UnicodeEncodeError, "sOnns",
6367            encoding, unicode, startpos, endpos, reason);
6368    }
6369    else {
6370        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
6371            goto onError;
6372        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
6373            goto onError;
6374        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
6375            goto onError;
6376        return;
6377      onError:
6378        Py_CLEAR(*exceptionObject);
6379    }
6380}
6381
6382/* raises a UnicodeEncodeError */
6383static void
6384raise_encode_exception(PyObject **exceptionObject,
6385                       const char *encoding,
6386                       PyObject *unicode,
6387                       Py_ssize_t startpos, Py_ssize_t endpos,
6388                       const char *reason)
6389{
6390    make_encode_exception(exceptionObject,
6391                          encoding, unicode, startpos, endpos, reason);
6392    if (*exceptionObject != NULL)
6393        PyCodec_StrictErrors(*exceptionObject);
6394}
6395
6396/* error handling callback helper:
6397   build arguments, call the callback and check the arguments,
6398   put the result into newpos and return the replacement string, which
6399   has to be freed by the caller */
6400static PyObject *
6401unicode_encode_call_errorhandler(const char *errors,
6402                                 PyObject **errorHandler,
6403                                 const char *encoding, const char *reason,
6404                                 PyObject *unicode, PyObject **exceptionObject,
6405                                 Py_ssize_t startpos, Py_ssize_t endpos,
6406                                 Py_ssize_t *newpos)
6407{
6408    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
6409    Py_ssize_t len;
6410    PyObject *restuple;
6411    PyObject *resunicode;
6412
6413    if (*errorHandler == NULL) {
6414        *errorHandler = PyCodec_LookupError(errors);
6415        if (*errorHandler == NULL)
6416            return NULL;
6417    }
6418
6419    if (PyUnicode_READY(unicode) == -1)
6420        return NULL;
6421    len = PyUnicode_GET_LENGTH(unicode);
6422
6423    make_encode_exception(exceptionObject,
6424                          encoding, unicode, startpos, endpos, reason);
6425    if (*exceptionObject == NULL)
6426        return NULL;
6427
6428    restuple = PyObject_CallFunctionObjArgs(
6429        *errorHandler, *exceptionObject, NULL);
6430    if (restuple == NULL)
6431        return NULL;
6432    if (!PyTuple_Check(restuple)) {
6433        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6434        Py_DECREF(restuple);
6435        return NULL;
6436    }
6437    if (!PyArg_ParseTuple(restuple, argparse,
6438                          &resunicode, newpos)) {
6439        Py_DECREF(restuple);
6440        return NULL;
6441    }
6442    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
6443        PyErr_SetString(PyExc_TypeError, &argparse[3]);
6444        Py_DECREF(restuple);
6445        return NULL;
6446    }
6447    if (*newpos<0)
6448        *newpos = len + *newpos;
6449    if (*newpos<0 || *newpos>len) {
6450        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6451        Py_DECREF(restuple);
6452        return NULL;
6453    }
6454    Py_INCREF(resunicode);
6455    Py_DECREF(restuple);
6456    return resunicode;
6457}
6458
6459static PyObject *
6460unicode_encode_ucs1(PyObject *unicode,
6461                    const char *errors,
6462                    unsigned int limit)
6463{
6464    /* input state */
6465    Py_ssize_t pos=0, size;
6466    int kind;
6467    void *data;
6468    /* output object */
6469    PyObject *res;
6470    /* pointer into the output */
6471    char *str;
6472    /* current output position */
6473    Py_ssize_t ressize;
6474    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
6475    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
6476    PyObject *errorHandler = NULL;
6477    PyObject *exc = NULL;
6478    /* the following variable is used for caching string comparisons
6479     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6480    int known_errorHandler = -1;
6481
6482    if (PyUnicode_READY(unicode) == -1)
6483        return NULL;
6484    size = PyUnicode_GET_LENGTH(unicode);
6485    kind = PyUnicode_KIND(unicode);
6486    data = PyUnicode_DATA(unicode);
6487    /* allocate enough for a simple encoding without
6488       replacements, if we need more, we'll resize */
6489    if (size == 0)
6490        return PyBytes_FromStringAndSize(NULL, 0);
6491    res = PyBytes_FromStringAndSize(NULL, size);
6492    if (res == NULL)
6493        return NULL;
6494    str = PyBytes_AS_STRING(res);
6495    ressize = size;
6496
6497    while (pos < size) {
6498        Py_UCS4 c = PyUnicode_READ(kind, data, pos);
6499
6500        /* can we encode this? */
6501        if (c<limit) {
6502            /* no overflow check, because we know that the space is enough */
6503            *str++ = (char)c;
6504            ++pos;
6505        }
6506        else {
6507            Py_ssize_t requiredsize;
6508            PyObject *repunicode;
6509            Py_ssize_t repsize, newpos, respos, i;
6510            /* startpos for collecting unencodable chars */
6511            Py_ssize_t collstart = pos;
6512            Py_ssize_t collend = pos;
6513            /* find all unecodable characters */
6514            while ((collend < size) && (PyUnicode_READ(kind, data, collend) >= limit))
6515                ++collend;
6516            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
6517            if (known_errorHandler==-1) {
6518                if ((errors==NULL) || (!strcmp(errors, "strict")))
6519                    known_errorHandler = 1;
6520                else if (!strcmp(errors, "replace"))
6521                    known_errorHandler = 2;
6522                else if (!strcmp(errors, "ignore"))
6523                    known_errorHandler = 3;
6524                else if (!strcmp(errors, "xmlcharrefreplace"))
6525                    known_errorHandler = 4;
6526                else
6527                    known_errorHandler = 0;
6528            }
6529            switch (known_errorHandler) {
6530            case 1: /* strict */
6531                raise_encode_exception(&exc, encoding, unicode, collstart, collend, reason);
6532                goto onError;
6533            case 2: /* replace */
6534                while (collstart++ < collend)
6535                    *str++ = '?'; /* fall through */
6536            case 3: /* ignore */
6537                pos = collend;
6538                break;
6539            case 4: /* xmlcharrefreplace */
6540                respos = str - PyBytes_AS_STRING(res);
6541                requiredsize = respos;
6542                /* determine replacement size */
6543                for (i = collstart; i < collend; ++i) {
6544                    Py_UCS4 ch = PyUnicode_READ(kind, data, i);
6545                    Py_ssize_t incr;
6546                    if (ch < 10)
6547                        incr = 2+1+1;
6548                    else if (ch < 100)
6549                        incr = 2+2+1;
6550                    else if (ch < 1000)
6551                        incr = 2+3+1;
6552                    else if (ch < 10000)
6553                        incr = 2+4+1;
6554                    else if (ch < 100000)
6555                        incr = 2+5+1;
6556                    else if (ch < 1000000)
6557                        incr = 2+6+1;
6558                    else {
6559                        assert(ch <= MAX_UNICODE);
6560                        incr = 2+7+1;
6561                    }
6562                    if (requiredsize > PY_SSIZE_T_MAX - incr)
6563                        goto overflow;
6564                    requiredsize += incr;
6565                }
6566                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6567                    goto overflow;
6568                requiredsize += size - collend;
6569                if (requiredsize > ressize) {
6570                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6571                        requiredsize = 2*ressize;
6572                    if (_PyBytes_Resize(&res, requiredsize))
6573                        goto onError;
6574                    str = PyBytes_AS_STRING(res) + respos;
6575                    ressize = requiredsize;
6576                }
6577                /* generate replacement */
6578                for (i = collstart; i < collend; ++i) {
6579                    str += sprintf(str, "&#%d;", PyUnicode_READ(kind, data, i));
6580                }
6581                pos = collend;
6582                break;
6583            default:
6584                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6585                                                              encoding, reason, unicode, &exc,
6586                                                              collstart, collend, &newpos);
6587                if (repunicode == NULL || (PyUnicode_Check(repunicode) &&
6588                                           PyUnicode_READY(repunicode) == -1))
6589                    goto onError;
6590                if (PyBytes_Check(repunicode)) {
6591                    /* Directly copy bytes result to output. */
6592                    repsize = PyBytes_Size(repunicode);
6593                    if (repsize > 1) {
6594                        /* Make room for all additional bytes. */
6595                        respos = str - PyBytes_AS_STRING(res);
6596                        if (ressize > PY_SSIZE_T_MAX - repsize - 1) {
6597                            Py_DECREF(repunicode);
6598                            goto overflow;
6599                        }
6600                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
6601                            Py_DECREF(repunicode);
6602                            goto onError;
6603                        }
6604                        str = PyBytes_AS_STRING(res) + respos;
6605                        ressize += repsize-1;
6606                    }
6607                    memcpy(str, PyBytes_AsString(repunicode), repsize);
6608                    str += repsize;
6609                    pos = newpos;
6610                    Py_DECREF(repunicode);
6611                    break;
6612                }
6613                /* need more space? (at least enough for what we
6614                   have+the replacement+the rest of the string, so
6615                   we won't have to check space for encodable characters) */
6616                respos = str - PyBytes_AS_STRING(res);
6617                repsize = PyUnicode_GET_LENGTH(repunicode);
6618                requiredsize = respos;
6619                if (requiredsize > PY_SSIZE_T_MAX - repsize)
6620                    goto overflow;
6621                requiredsize += repsize;
6622                if (requiredsize > PY_SSIZE_T_MAX - (size - collend))
6623                    goto overflow;
6624                requiredsize += size - collend;
6625                if (requiredsize > ressize) {
6626                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
6627                        requiredsize = 2*ressize;
6628                    if (_PyBytes_Resize(&res, requiredsize)) {
6629                        Py_DECREF(repunicode);
6630                        goto onError;
6631                    }
6632                    str = PyBytes_AS_STRING(res) + respos;
6633                    ressize = requiredsize;
6634                }
6635                /* check if there is anything unencodable in the replacement
6636                   and copy it to the output */
6637                for (i = 0; repsize-->0; ++i, ++str) {
6638                    c = PyUnicode_READ_CHAR(repunicode, i);
6639                    if (c >= limit) {
6640                        raise_encode_exception(&exc, encoding, unicode,
6641                                               pos, pos+1, reason);
6642                        Py_DECREF(repunicode);
6643                        goto onError;
6644                    }
6645                    *str = (char)c;
6646                }
6647                pos = newpos;
6648                Py_DECREF(repunicode);
6649            }
6650        }
6651    }
6652    /* Resize if we allocated to much */
6653    size = str - PyBytes_AS_STRING(res);
6654    if (size < ressize) { /* If this falls res will be NULL */
6655        assert(size >= 0);
6656        if (_PyBytes_Resize(&res, size) < 0)
6657            goto onError;
6658    }
6659
6660    Py_XDECREF(errorHandler);
6661    Py_XDECREF(exc);
6662    return res;
6663
6664  overflow:
6665    PyErr_SetString(PyExc_OverflowError,
6666                    "encoded result is too long for a Python string");
6667
6668  onError:
6669    Py_XDECREF(res);
6670    Py_XDECREF(errorHandler);
6671    Py_XDECREF(exc);
6672    return NULL;
6673}
6674
6675/* Deprecated */
6676PyObject *
6677PyUnicode_EncodeLatin1(const Py_UNICODE *p,
6678                       Py_ssize_t size,
6679                       const char *errors)
6680{
6681    PyObject *result;
6682    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6683    if (unicode == NULL)
6684        return NULL;
6685    result = unicode_encode_ucs1(unicode, errors, 256);
6686    Py_DECREF(unicode);
6687    return result;
6688}
6689
6690PyObject *
6691_PyUnicode_AsLatin1String(PyObject *unicode, const char *errors)
6692{
6693    if (!PyUnicode_Check(unicode)) {
6694        PyErr_BadArgument();
6695        return NULL;
6696    }
6697    if (PyUnicode_READY(unicode) == -1)
6698        return NULL;
6699    /* Fast path: if it is a one-byte string, construct
6700       bytes object directly. */
6701    if (PyUnicode_KIND(unicode) == PyUnicode_1BYTE_KIND)
6702        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6703                                         PyUnicode_GET_LENGTH(unicode));
6704    /* Non-Latin-1 characters present. Defer to above function to
6705       raise the exception. */
6706    return unicode_encode_ucs1(unicode, errors, 256);
6707}
6708
6709PyObject*
6710PyUnicode_AsLatin1String(PyObject *unicode)
6711{
6712    return _PyUnicode_AsLatin1String(unicode, NULL);
6713}
6714
6715/* --- 7-bit ASCII Codec -------------------------------------------------- */
6716
6717PyObject *
6718PyUnicode_DecodeASCII(const char *s,
6719                      Py_ssize_t size,
6720                      const char *errors)
6721{
6722    const char *starts = s;
6723    _PyUnicodeWriter writer;
6724    int kind;
6725    void *data;
6726    Py_ssize_t startinpos;
6727    Py_ssize_t endinpos;
6728    Py_ssize_t outpos;
6729    const char *e;
6730    PyObject *errorHandler = NULL;
6731    PyObject *exc = NULL;
6732
6733    if (size == 0)
6734        _Py_RETURN_UNICODE_EMPTY();
6735
6736    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
6737    if (size == 1 && (unsigned char)s[0] < 128)
6738        return get_latin1_char((unsigned char)s[0]);
6739
6740    _PyUnicodeWriter_Init(&writer);
6741    writer.min_length = size;
6742    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) < 0)
6743        return NULL;
6744
6745    e = s + size;
6746    data = writer.data;
6747    outpos = ascii_decode(s, e, (Py_UCS1 *)data);
6748    writer.pos = outpos;
6749    if (writer.pos == size)
6750        return _PyUnicodeWriter_Finish(&writer);
6751
6752    s += writer.pos;
6753    kind = writer.kind;
6754    while (s < e) {
6755        unsigned char c = (unsigned char)*s;
6756        if (c < 128) {
6757            PyUnicode_WRITE(kind, data, writer.pos, c);
6758            writer.pos++;
6759            ++s;
6760        }
6761        else {
6762            startinpos = s-starts;
6763            endinpos = startinpos + 1;
6764            if (unicode_decode_call_errorhandler_writer(
6765                    errors, &errorHandler,
6766                    "ascii", "ordinal not in range(128)",
6767                    &starts, &e, &startinpos, &endinpos, &exc, &s,
6768                    &writer))
6769                goto onError;
6770            kind = writer.kind;
6771            data = writer.data;
6772        }
6773    }
6774    Py_XDECREF(errorHandler);
6775    Py_XDECREF(exc);
6776    return _PyUnicodeWriter_Finish(&writer);
6777
6778  onError:
6779    _PyUnicodeWriter_Dealloc(&writer);
6780    Py_XDECREF(errorHandler);
6781    Py_XDECREF(exc);
6782    return NULL;
6783}
6784
6785/* Deprecated */
6786PyObject *
6787PyUnicode_EncodeASCII(const Py_UNICODE *p,
6788                      Py_ssize_t size,
6789                      const char *errors)
6790{
6791    PyObject *result;
6792    PyObject *unicode = PyUnicode_FromUnicode(p, size);
6793    if (unicode == NULL)
6794        return NULL;
6795    result = unicode_encode_ucs1(unicode, errors, 128);
6796    Py_DECREF(unicode);
6797    return result;
6798}
6799
6800PyObject *
6801_PyUnicode_AsASCIIString(PyObject *unicode, const char *errors)
6802{
6803    if (!PyUnicode_Check(unicode)) {
6804        PyErr_BadArgument();
6805        return NULL;
6806    }
6807    if (PyUnicode_READY(unicode) == -1)
6808        return NULL;
6809    /* Fast path: if it is an ASCII-only string, construct bytes object
6810       directly. Else defer to above function to raise the exception. */
6811    if (PyUnicode_IS_ASCII(unicode))
6812        return PyBytes_FromStringAndSize(PyUnicode_DATA(unicode),
6813                                         PyUnicode_GET_LENGTH(unicode));
6814    return unicode_encode_ucs1(unicode, errors, 128);
6815}
6816
6817PyObject *
6818PyUnicode_AsASCIIString(PyObject *unicode)
6819{
6820    return _PyUnicode_AsASCIIString(unicode, NULL);
6821}
6822
6823#ifdef HAVE_MBCS
6824
6825/* --- MBCS codecs for Windows -------------------------------------------- */
6826
6827#if SIZEOF_INT < SIZEOF_SIZE_T
6828#define NEED_RETRY
6829#endif
6830
6831#ifndef WC_ERR_INVALID_CHARS
6832#  define WC_ERR_INVALID_CHARS 0x0080
6833#endif
6834
6835static char*
6836code_page_name(UINT code_page, PyObject **obj)
6837{
6838    *obj = NULL;
6839    if (code_page == CP_ACP)
6840        return "mbcs";
6841    if (code_page == CP_UTF7)
6842        return "CP_UTF7";
6843    if (code_page == CP_UTF8)
6844        return "CP_UTF8";
6845
6846    *obj = PyBytes_FromFormat("cp%u", code_page);
6847    if (*obj == NULL)
6848        return NULL;
6849    return PyBytes_AS_STRING(*obj);
6850}
6851
6852static int
6853is_dbcs_lead_byte(UINT code_page, const char *s, int offset)
6854{
6855    const char *curr = s + offset;
6856    const char *prev;
6857
6858    if (!IsDBCSLeadByteEx(code_page, *curr))
6859        return 0;
6860
6861    prev = CharPrevExA(code_page, s, curr, 0);
6862    if (prev == curr)
6863        return 1;
6864    /* FIXME: This code is limited to "true" double-byte encodings,
6865       as it assumes an incomplete character consists of a single
6866       byte. */
6867    if (curr - prev == 2)
6868        return 1;
6869    if (!IsDBCSLeadByteEx(code_page, *prev))
6870        return 1;
6871    return 0;
6872}
6873
6874static DWORD
6875decode_code_page_flags(UINT code_page)
6876{
6877    if (code_page == CP_UTF7) {
6878        /* The CP_UTF7 decoder only supports flags=0 */
6879        return 0;
6880    }
6881    else
6882        return MB_ERR_INVALID_CHARS;
6883}
6884
6885/*
6886 * Decode a byte string from a Windows code page into unicode object in strict
6887 * mode.
6888 *
6889 * Returns consumed size if succeed, returns -2 on decode error, or raise an
6890 * OSError and returns -1 on other error.
6891 */
6892static int
6893decode_code_page_strict(UINT code_page,
6894                        PyObject **v,
6895                        const char *in,
6896                        int insize)
6897{
6898    const DWORD flags = decode_code_page_flags(code_page);
6899    wchar_t *out;
6900    DWORD outsize;
6901
6902    /* First get the size of the result */
6903    assert(insize > 0);
6904    outsize = MultiByteToWideChar(code_page, flags, in, insize, NULL, 0);
6905    if (outsize <= 0)
6906        goto error;
6907
6908    if (*v == NULL) {
6909        /* Create unicode object */
6910        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6911        *v = (PyObject*)_PyUnicode_New(outsize);
6912        if (*v == NULL)
6913            return -1;
6914        out = PyUnicode_AS_UNICODE(*v);
6915    }
6916    else {
6917        /* Extend unicode object */
6918        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
6919        if (unicode_resize(v, n + outsize) < 0)
6920            return -1;
6921        out = PyUnicode_AS_UNICODE(*v) + n;
6922    }
6923
6924    /* Do the conversion */
6925    outsize = MultiByteToWideChar(code_page, flags, in, insize, out, outsize);
6926    if (outsize <= 0)
6927        goto error;
6928    return insize;
6929
6930error:
6931    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
6932        return -2;
6933    PyErr_SetFromWindowsErr(0);
6934    return -1;
6935}
6936
6937/*
6938 * Decode a byte string from a code page into unicode object with an error
6939 * handler.
6940 *
6941 * Returns consumed size if succeed, or raise an OSError or
6942 * UnicodeDecodeError exception and returns -1 on error.
6943 */
6944static int
6945decode_code_page_errors(UINT code_page,
6946                        PyObject **v,
6947                        const char *in, const int size,
6948                        const char *errors)
6949{
6950    const char *startin = in;
6951    const char *endin = in + size;
6952    const DWORD flags = decode_code_page_flags(code_page);
6953    /* Ideally, we should get reason from FormatMessage. This is the Windows
6954       2000 English version of the message. */
6955    const char *reason = "No mapping for the Unicode character exists "
6956                         "in the target code page.";
6957    /* each step cannot decode more than 1 character, but a character can be
6958       represented as a surrogate pair */
6959    wchar_t buffer[2], *startout, *out;
6960    int insize;
6961    Py_ssize_t outsize;
6962    PyObject *errorHandler = NULL;
6963    PyObject *exc = NULL;
6964    PyObject *encoding_obj = NULL;
6965    char *encoding;
6966    DWORD err;
6967    int ret = -1;
6968
6969    assert(size > 0);
6970
6971    encoding = code_page_name(code_page, &encoding_obj);
6972    if (encoding == NULL)
6973        return -1;
6974
6975    if (errors == NULL || strcmp(errors, "strict") == 0) {
6976        /* The last error was ERROR_NO_UNICODE_TRANSLATION, then we raise a
6977           UnicodeDecodeError. */
6978        make_decode_exception(&exc, encoding, in, size, 0, 0, reason);
6979        if (exc != NULL) {
6980            PyCodec_StrictErrors(exc);
6981            Py_CLEAR(exc);
6982        }
6983        goto error;
6984    }
6985
6986    if (*v == NULL) {
6987        /* Create unicode object */
6988        if (size > PY_SSIZE_T_MAX / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
6989            PyErr_NoMemory();
6990            goto error;
6991        }
6992        /* FIXME: don't use _PyUnicode_New(), but allocate a wchar_t* buffer */
6993        *v = (PyObject*)_PyUnicode_New(size * Py_ARRAY_LENGTH(buffer));
6994        if (*v == NULL)
6995            goto error;
6996        startout = PyUnicode_AS_UNICODE(*v);
6997    }
6998    else {
6999        /* Extend unicode object */
7000        Py_ssize_t n = PyUnicode_GET_SIZE(*v);
7001        if (size > (PY_SSIZE_T_MAX - n) / (Py_ssize_t)Py_ARRAY_LENGTH(buffer)) {
7002            PyErr_NoMemory();
7003            goto error;
7004        }
7005        if (unicode_resize(v, n + size * Py_ARRAY_LENGTH(buffer)) < 0)
7006            goto error;
7007        startout = PyUnicode_AS_UNICODE(*v) + n;
7008    }
7009
7010    /* Decode the byte string character per character */
7011    out = startout;
7012    while (in < endin)
7013    {
7014        /* Decode a character */
7015        insize = 1;
7016        do
7017        {
7018            outsize = MultiByteToWideChar(code_page, flags,
7019                                          in, insize,
7020                                          buffer, Py_ARRAY_LENGTH(buffer));
7021            if (outsize > 0)
7022                break;
7023            err = GetLastError();
7024            if (err != ERROR_NO_UNICODE_TRANSLATION
7025                && err != ERROR_INSUFFICIENT_BUFFER)
7026            {
7027                PyErr_SetFromWindowsErr(0);
7028                goto error;
7029            }
7030            insize++;
7031        }
7032        /* 4=maximum length of a UTF-8 sequence */
7033        while (insize <= 4 && (in + insize) <= endin);
7034
7035        if (outsize <= 0) {
7036            Py_ssize_t startinpos, endinpos, outpos;
7037
7038            startinpos = in - startin;
7039            endinpos = startinpos + 1;
7040            outpos = out - PyUnicode_AS_UNICODE(*v);
7041            if (unicode_decode_call_errorhandler_wchar(
7042                    errors, &errorHandler,
7043                    encoding, reason,
7044                    &startin, &endin, &startinpos, &endinpos, &exc, &in,
7045                    v, &outpos))
7046            {
7047                goto error;
7048            }
7049            out = PyUnicode_AS_UNICODE(*v) + outpos;
7050        }
7051        else {
7052            in += insize;
7053            memcpy(out, buffer, outsize * sizeof(wchar_t));
7054            out += outsize;
7055        }
7056    }
7057
7058    /* write a NUL character at the end */
7059    *out = 0;
7060
7061    /* Extend unicode object */
7062    outsize = out - startout;
7063    assert(outsize <= PyUnicode_WSTR_LENGTH(*v));
7064    if (unicode_resize(v, outsize) < 0)
7065        goto error;
7066    ret = size;
7067
7068error:
7069    Py_XDECREF(encoding_obj);
7070    Py_XDECREF(errorHandler);
7071    Py_XDECREF(exc);
7072    return ret;
7073}
7074
7075static PyObject *
7076decode_code_page_stateful(int code_page,
7077                          const char *s, Py_ssize_t size,
7078                          const char *errors, Py_ssize_t *consumed)
7079{
7080    PyObject *v = NULL;
7081    int chunk_size, final, converted, done;
7082
7083    if (code_page < 0) {
7084        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7085        return NULL;
7086    }
7087
7088    if (consumed)
7089        *consumed = 0;
7090
7091    do
7092    {
7093#ifdef NEED_RETRY
7094        if (size > INT_MAX) {
7095            chunk_size = INT_MAX;
7096            final = 0;
7097            done = 0;
7098        }
7099        else
7100#endif
7101        {
7102            chunk_size = (int)size;
7103            final = (consumed == NULL);
7104            done = 1;
7105        }
7106
7107        /* Skip trailing lead-byte unless 'final' is set */
7108        if (!final && is_dbcs_lead_byte(code_page, s, chunk_size - 1))
7109            --chunk_size;
7110
7111        if (chunk_size == 0 && done) {
7112            if (v != NULL)
7113                break;
7114            _Py_RETURN_UNICODE_EMPTY();
7115        }
7116
7117
7118        converted = decode_code_page_strict(code_page, &v,
7119                                            s, chunk_size);
7120        if (converted == -2)
7121            converted = decode_code_page_errors(code_page, &v,
7122                                                s, chunk_size,
7123                                                errors);
7124        assert(converted != 0);
7125
7126        if (converted < 0) {
7127            Py_XDECREF(v);
7128            return NULL;
7129        }
7130
7131        if (consumed)
7132            *consumed += converted;
7133
7134        s += converted;
7135        size -= converted;
7136    } while (!done);
7137
7138    return unicode_result(v);
7139}
7140
7141PyObject *
7142PyUnicode_DecodeCodePageStateful(int code_page,
7143                                 const char *s,
7144                                 Py_ssize_t size,
7145                                 const char *errors,
7146                                 Py_ssize_t *consumed)
7147{
7148    return decode_code_page_stateful(code_page, s, size, errors, consumed);
7149}
7150
7151PyObject *
7152PyUnicode_DecodeMBCSStateful(const char *s,
7153                             Py_ssize_t size,
7154                             const char *errors,
7155                             Py_ssize_t *consumed)
7156{
7157    return decode_code_page_stateful(CP_ACP, s, size, errors, consumed);
7158}
7159
7160PyObject *
7161PyUnicode_DecodeMBCS(const char *s,
7162                     Py_ssize_t size,
7163                     const char *errors)
7164{
7165    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
7166}
7167
7168static DWORD
7169encode_code_page_flags(UINT code_page, const char *errors)
7170{
7171    if (code_page == CP_UTF8) {
7172        if (winver.dwMajorVersion >= 6)
7173            /* CP_UTF8 supports WC_ERR_INVALID_CHARS on Windows Vista
7174               and later */
7175            return WC_ERR_INVALID_CHARS;
7176        else
7177            /* CP_UTF8 only supports flags=0 on Windows older than Vista */
7178            return 0;
7179    }
7180    else if (code_page == CP_UTF7) {
7181        /* CP_UTF7 only supports flags=0 */
7182        return 0;
7183    }
7184    else {
7185        if (errors != NULL && strcmp(errors, "replace") == 0)
7186            return 0;
7187        else
7188            return WC_NO_BEST_FIT_CHARS;
7189    }
7190}
7191
7192/*
7193 * Encode a Unicode string to a Windows code page into a byte string in strict
7194 * mode.
7195 *
7196 * Returns consumed characters if succeed, returns -2 on encode error, or raise
7197 * an OSError and returns -1 on other error.
7198 */
7199static int
7200encode_code_page_strict(UINT code_page, PyObject **outbytes,
7201                        PyObject *unicode, Py_ssize_t offset, int len,
7202                        const char* errors)
7203{
7204    BOOL usedDefaultChar = FALSE;
7205    BOOL *pusedDefaultChar = &usedDefaultChar;
7206    int outsize;
7207    PyObject *exc = NULL;
7208    wchar_t *p;
7209    Py_ssize_t size;
7210    const DWORD flags = encode_code_page_flags(code_page, NULL);
7211    char *out;
7212    /* Create a substring so that we can get the UTF-16 representation
7213       of just the slice under consideration. */
7214    PyObject *substring;
7215
7216    assert(len > 0);
7217
7218    if (code_page != CP_UTF8 && code_page != CP_UTF7)
7219        pusedDefaultChar = &usedDefaultChar;
7220    else
7221        pusedDefaultChar = NULL;
7222
7223    substring = PyUnicode_Substring(unicode, offset, offset+len);
7224    if (substring == NULL)
7225        return -1;
7226    p = PyUnicode_AsUnicodeAndSize(substring, &size);
7227    if (p == NULL) {
7228        Py_DECREF(substring);
7229        return -1;
7230    }
7231    assert(size <= INT_MAX);
7232
7233    /* First get the size of the result */
7234    outsize = WideCharToMultiByte(code_page, flags,
7235                                  p, (int)size,
7236                                  NULL, 0,
7237                                  NULL, pusedDefaultChar);
7238    if (outsize <= 0)
7239        goto error;
7240    /* If we used a default char, then we failed! */
7241    if (pusedDefaultChar && *pusedDefaultChar) {
7242        Py_DECREF(substring);
7243        return -2;
7244    }
7245
7246    if (*outbytes == NULL) {
7247        /* Create string object */
7248        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
7249        if (*outbytes == NULL) {
7250            Py_DECREF(substring);
7251            return -1;
7252        }
7253        out = PyBytes_AS_STRING(*outbytes);
7254    }
7255    else {
7256        /* Extend string object */
7257        const Py_ssize_t n = PyBytes_Size(*outbytes);
7258        if (outsize > PY_SSIZE_T_MAX - n) {
7259            PyErr_NoMemory();
7260            Py_DECREF(substring);
7261            return -1;
7262        }
7263        if (_PyBytes_Resize(outbytes, n + outsize) < 0) {
7264            Py_DECREF(substring);
7265            return -1;
7266        }
7267        out = PyBytes_AS_STRING(*outbytes) + n;
7268    }
7269
7270    /* Do the conversion */
7271    outsize = WideCharToMultiByte(code_page, flags,
7272                                  p, (int)size,
7273                                  out, outsize,
7274                                  NULL, pusedDefaultChar);
7275    Py_CLEAR(substring);
7276    if (outsize <= 0)
7277        goto error;
7278    if (pusedDefaultChar && *pusedDefaultChar)
7279        return -2;
7280    return 0;
7281
7282error:
7283    Py_XDECREF(substring);
7284    if (GetLastError() == ERROR_NO_UNICODE_TRANSLATION)
7285        return -2;
7286    PyErr_SetFromWindowsErr(0);
7287    return -1;
7288}
7289
/*
 * Encode `insize` characters of `unicode`, starting at `unicode_offset`,
 * to a Windows code page, reporting unencodable characters to the error
 * handler named by `errors`.
 *
 * If *outbytes is NULL a new bytes object is created; otherwise the
 * encoded bytes are appended to the existing object.
 *
 * When `errors` is NULL or "strict", no encoding is attempted: an encode
 * exception is built, passed to PyCodec_StrictErrors(), and -1 is
 * returned.
 *
 * Returns 0 on success, or -1 with an exception set on error.
 */
static int
encode_code_page_errors(UINT code_page, PyObject **outbytes,
                        PyObject *unicode, Py_ssize_t unicode_offset,
                        Py_ssize_t insize, const char* errors)
{
    const DWORD flags = encode_code_page_flags(code_page, errors);
    Py_ssize_t pos = unicode_offset;
    Py_ssize_t endin = unicode_offset + insize;
    /* Ideally, we should get reason from FormatMessage. This is the Windows
       2000 English version of the message. */
    const char *reason = "invalid character";
    /* 4=maximum length of a UTF-8 sequence */
    char buffer[4];
    BOOL usedDefaultChar = FALSE, *pusedDefaultChar;
    Py_ssize_t outsize;
    char *out;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    PyObject *encoding_obj = NULL;
    char *encoding;
    Py_ssize_t newpos, newoutsize;
    PyObject *rep;
    int ret = -1;

    assert(insize > 0);

    encoding = code_page_name(code_page, &encoding_obj);
    if (encoding == NULL)
        return -1;

    if (errors == NULL || strcmp(errors, "strict") == 0) {
        /* The last error was ERROR_NO_UNICODE_TRANSLATION,
           then we raise a UnicodeEncodeError. */
        make_encode_exception(&exc, encoding, unicode, 0, 0, reason);
        if (exc != NULL) {
            PyCodec_StrictErrors(exc);
            Py_DECREF(exc);
        }
        Py_XDECREF(encoding_obj);
        return -1;
    }

    /* The used-default-char output is not requested for CP_UTF8 and
       CP_UTF7. */
    if (code_page != CP_UTF8 && code_page != CP_UTF7)
        pusedDefaultChar = &usedDefaultChar;
    else
        pusedDefaultChar = NULL;

    /* Reserve sizeof(buffer) output bytes per input character, guarding
       the multiplication against overflow. */
    if (Py_ARRAY_LENGTH(buffer) > PY_SSIZE_T_MAX / insize) {
        PyErr_NoMemory();
        goto error;
    }
    outsize = insize * Py_ARRAY_LENGTH(buffer);

    if (*outbytes == NULL) {
        /* Create string object */
        *outbytes = PyBytes_FromStringAndSize(NULL, outsize);
        if (*outbytes == NULL)
            goto error;
        out = PyBytes_AS_STRING(*outbytes);
    }
    else {
        /* Extend string object */
        Py_ssize_t n = PyBytes_Size(*outbytes);
        if (n > PY_SSIZE_T_MAX - outsize) {
            PyErr_NoMemory();
            goto error;
        }
        if (_PyBytes_Resize(outbytes, n + outsize) < 0)
            goto error;
        out = PyBytes_AS_STRING(*outbytes) + n;
    }

    /* Encode the string character per character */
    while (pos < endin)
    {
        Py_UCS4 ch = PyUnicode_READ_CHAR(unicode, pos);
        wchar_t chars[2];
        int charsize;
        /* Characters from 0x10000 up are passed to the Windows API as a
           surrogate pair. */
        if (ch < 0x10000) {
            chars[0] = (wchar_t)ch;
            charsize = 1;
        }
        else {
            chars[0] = Py_UNICODE_HIGH_SURROGATE(ch);
            chars[1] = Py_UNICODE_LOW_SURROGATE(ch);
            charsize = 2;
        }

        outsize = WideCharToMultiByte(code_page, flags,
                                      chars, charsize,
                                      buffer, Py_ARRAY_LENGTH(buffer),
                                      NULL, pusedDefaultChar);
        if (outsize > 0) {
            /* Success only counts if no default character was
               substituted; otherwise fall through to the error handler. */
            if (pusedDefaultChar == NULL || !(*pusedDefaultChar))
            {
                pos++;
                memcpy(out, buffer, outsize);
                out += outsize;
                continue;
            }
        }
        else if (GetLastError() != ERROR_NO_UNICODE_TRANSLATION) {
            PyErr_SetFromWindowsErr(0);
            goto error;
        }

        /* The character at `pos` could not be encoded: ask the error
           handler for a replacement and the position to resume at. */
        rep = unicode_encode_call_errorhandler(
                  errors, &errorHandler, encoding, reason,
                  unicode, &exc,
                  pos, pos + 1, &newpos);
        if (rep == NULL)
            goto error;
        pos = newpos;

        if (PyBytes_Check(rep)) {
            /* Bytes replacement: copied to the output as-is. */
            outsize = PyBytes_GET_SIZE(rep);
            if (outsize != 1) {
                /* Adjust the output object by the difference to a
                   one-byte replacement; `out` is re-derived from its
                   offset because the resize may move the buffer. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            memcpy(out, PyBytes_AS_STRING(rep), outsize);
            out += outsize;
        }
        else {
            /* Otherwise the replacement is handled as a str object whose
               characters must all be below 128. */
            Py_ssize_t i;
            enum PyUnicode_Kind kind;
            void *data;

            if (PyUnicode_READY(rep) == -1) {
                Py_DECREF(rep);
                goto error;
            }

            outsize = PyUnicode_GET_LENGTH(rep);
            if (outsize != 1) {
                /* Same size adjustment as for a bytes replacement. */
                Py_ssize_t offset = out - PyBytes_AS_STRING(*outbytes);
                newoutsize = PyBytes_GET_SIZE(*outbytes) + (outsize - 1);
                if (_PyBytes_Resize(outbytes, newoutsize) < 0) {
                    Py_DECREF(rep);
                    goto error;
                }
                out = PyBytes_AS_STRING(*outbytes) + offset;
            }
            kind = PyUnicode_KIND(rep);
            data = PyUnicode_DATA(rep);
            for (i=0; i < outsize; i++) {
                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
                if (ch > 127) {
                    /* NOTE(review): `pos` was already set to `newpos`
                       above, so the range reported here starts at the
                       resume position -- confirm this is intended. */
                    raise_encode_exception(&exc,
                        encoding, unicode,
                        pos, pos + 1,
                        "unable to encode error handler result to ASCII");
                    Py_DECREF(rep);
                    goto error;
                }
                *out = (unsigned char)ch;
                out++;
            }
        }
        Py_DECREF(rep);
    }
    /* write a NUL byte */
    *out = 0;
    /* Trim the output object to the number of bytes written so far
       (measured from the start of the object's buffer). */
    outsize = out - PyBytes_AS_STRING(*outbytes);
    assert(outsize <= PyBytes_GET_SIZE(*outbytes));
    if (_PyBytes_Resize(outbytes, outsize) < 0)
        goto error;
    ret = 0;

error:
    Py_XDECREF(encoding_obj);
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return ret;
}
7477
7478static PyObject *
7479encode_code_page(int code_page,
7480                 PyObject *unicode,
7481                 const char *errors)
7482{
7483    Py_ssize_t len;
7484    PyObject *outbytes = NULL;
7485    Py_ssize_t offset;
7486    int chunk_len, ret, done;
7487
7488    if (PyUnicode_READY(unicode) == -1)
7489        return NULL;
7490    len = PyUnicode_GET_LENGTH(unicode);
7491
7492    if (code_page < 0) {
7493        PyErr_SetString(PyExc_ValueError, "invalid code page number");
7494        return NULL;
7495    }
7496
7497    if (len == 0)
7498        return PyBytes_FromStringAndSize(NULL, 0);
7499
7500    offset = 0;
7501    do
7502    {
7503#ifdef NEED_RETRY
7504        /* UTF-16 encoding may double the size, so use only INT_MAX/2
7505           chunks. */
7506        if (len > INT_MAX/2) {
7507            chunk_len = INT_MAX/2;
7508            done = 0;
7509        }
7510        else
7511#endif
7512        {
7513            chunk_len = (int)len;
7514            done = 1;
7515        }
7516
7517        ret = encode_code_page_strict(code_page, &outbytes,
7518                                      unicode, offset, chunk_len,
7519                                      errors);
7520        if (ret == -2)
7521            ret = encode_code_page_errors(code_page, &outbytes,
7522                                          unicode, offset,
7523                                          chunk_len, errors);
7524        if (ret < 0) {
7525            Py_XDECREF(outbytes);
7526            return NULL;
7527        }
7528
7529        offset += chunk_len;
7530        len -= chunk_len;
7531    } while (!done);
7532
7533    return outbytes;
7534}
7535
7536PyObject *
7537PyUnicode_EncodeMBCS(const Py_UNICODE *p,
7538                     Py_ssize_t size,
7539                     const char *errors)
7540{
7541    PyObject *unicode, *res;
7542    unicode = PyUnicode_FromUnicode(p, size);
7543    if (unicode == NULL)
7544        return NULL;
7545    res = encode_code_page(CP_ACP, unicode, errors);
7546    Py_DECREF(unicode);
7547    return res;
7548}
7549
7550PyObject *
7551PyUnicode_EncodeCodePage(int code_page,
7552                         PyObject *unicode,
7553                         const char *errors)
7554{
7555    return encode_code_page(code_page, unicode, errors);
7556}
7557
7558PyObject *
7559PyUnicode_AsMBCSString(PyObject *unicode)
7560{
7561    if (!PyUnicode_Check(unicode)) {
7562        PyErr_BadArgument();
7563        return NULL;
7564    }
7565    return PyUnicode_EncodeCodePage(CP_ACP, unicode, NULL);
7566}
7567
7568#undef NEED_RETRY
7569
7570#endif /* HAVE_MBCS */
7571
7572/* --- Character Mapping Codec -------------------------------------------- */
7573
7574static int
7575charmap_decode_string(const char *s,
7576                      Py_ssize_t size,
7577                      PyObject *mapping,
7578                      const char *errors,
7579                      _PyUnicodeWriter *writer)
7580{
7581    const char *starts = s;
7582    const char *e;
7583    Py_ssize_t startinpos, endinpos;
7584    PyObject *errorHandler = NULL, *exc = NULL;
7585    Py_ssize_t maplen;
7586    enum PyUnicode_Kind mapkind;
7587    void *mapdata;
7588    Py_UCS4 x;
7589    unsigned char ch;
7590
7591    if (PyUnicode_READY(mapping) == -1)
7592        return -1;
7593
7594    maplen = PyUnicode_GET_LENGTH(mapping);
7595    mapdata = PyUnicode_DATA(mapping);
7596    mapkind = PyUnicode_KIND(mapping);
7597
7598    e = s + size;
7599
7600    if (mapkind == PyUnicode_1BYTE_KIND && maplen >= 256) {
7601        /* fast-path for cp037, cp500 and iso8859_1 encodings. iso8859_1
7602         * is disabled in encoding aliases, latin1 is preferred because
7603         * its implementation is faster. */
7604        Py_UCS1 *mapdata_ucs1 = (Py_UCS1 *)mapdata;
7605        Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7606        Py_UCS4 maxchar = writer->maxchar;
7607
7608        assert (writer->kind == PyUnicode_1BYTE_KIND);
7609        while (s < e) {
7610            ch = *s;
7611            x = mapdata_ucs1[ch];
7612            if (x > maxchar) {
7613                if (_PyUnicodeWriter_Prepare(writer, 1, 0xff) == -1)
7614                    goto onError;
7615                maxchar = writer->maxchar;
7616                outdata = (Py_UCS1 *)writer->data;
7617            }
7618            outdata[writer->pos] = x;
7619            writer->pos++;
7620            ++s;
7621        }
7622        return 0;
7623    }
7624
7625    while (s < e) {
7626        if (mapkind == PyUnicode_2BYTE_KIND && maplen >= 256) {
7627            enum PyUnicode_Kind outkind = writer->kind;
7628            Py_UCS2 *mapdata_ucs2 = (Py_UCS2 *)mapdata;
7629            if (outkind == PyUnicode_1BYTE_KIND) {
7630                Py_UCS1 *outdata = (Py_UCS1 *)writer->data;
7631                Py_UCS4 maxchar = writer->maxchar;
7632                while (s < e) {
7633                    ch = *s;
7634                    x = mapdata_ucs2[ch];
7635                    if (x > maxchar)
7636                        goto Error;
7637                    outdata[writer->pos] = x;
7638                    writer->pos++;
7639                    ++s;
7640                }
7641                break;
7642            }
7643            else if (outkind == PyUnicode_2BYTE_KIND) {
7644                Py_UCS2 *outdata = (Py_UCS2 *)writer->data;
7645                while (s < e) {
7646                    ch = *s;
7647                    x = mapdata_ucs2[ch];
7648                    if (x == 0xFFFE)
7649                        goto Error;
7650                    outdata[writer->pos] = x;
7651                    writer->pos++;
7652                    ++s;
7653                }
7654                break;
7655            }
7656        }
7657        ch = *s;
7658
7659        if (ch < maplen)
7660            x = PyUnicode_READ(mapkind, mapdata, ch);
7661        else
7662            x = 0xfffe; /* invalid value */
7663Error:
7664        if (x == 0xfffe)
7665        {
7666            /* undefined mapping */
7667            startinpos = s-starts;
7668            endinpos = startinpos+1;
7669            if (unicode_decode_call_errorhandler_writer(
7670                    errors, &errorHandler,
7671                    "charmap", "character maps to <undefined>",
7672                    &starts, &e, &startinpos, &endinpos, &exc, &s,
7673                    writer)) {
7674                goto onError;
7675            }
7676            continue;
7677        }
7678
7679        if (_PyUnicodeWriter_WriteCharInline(writer, x) < 0)
7680            goto onError;
7681        ++s;
7682    }
7683    Py_XDECREF(errorHandler);
7684    Py_XDECREF(exc);
7685    return 0;
7686
7687onError:
7688    Py_XDECREF(errorHandler);
7689    Py_XDECREF(exc);
7690    return -1;
7691}
7692
7693static int
7694charmap_decode_mapping(const char *s,
7695                       Py_ssize_t size,
7696                       PyObject *mapping,
7697                       const char *errors,
7698                       _PyUnicodeWriter *writer)
7699{
7700    const char *starts = s;
7701    const char *e;
7702    Py_ssize_t startinpos, endinpos;
7703    PyObject *errorHandler = NULL, *exc = NULL;
7704    unsigned char ch;
7705    PyObject *key, *item = NULL;
7706
7707    e = s + size;
7708
7709    while (s < e) {
7710        ch = *s;
7711
7712        /* Get mapping (char ordinal -> integer, Unicode char or None) */
7713        key = PyLong_FromLong((long)ch);
7714        if (key == NULL)
7715            goto onError;
7716
7717        item = PyObject_GetItem(mapping, key);
7718        Py_DECREF(key);
7719        if (item == NULL) {
7720            if (PyErr_ExceptionMatches(PyExc_LookupError)) {
7721                /* No mapping found means: mapping is undefined. */
7722                PyErr_Clear();
7723                goto Undefined;
7724            } else
7725                goto onError;
7726        }
7727
7728        /* Apply mapping */
7729        if (item == Py_None)
7730            goto Undefined;
7731        if (PyLong_Check(item)) {
7732            long value = PyLong_AS_LONG(item);
7733            if (value == 0xFFFE)
7734                goto Undefined;
7735            if (value < 0 || value > MAX_UNICODE) {
7736                PyErr_Format(PyExc_TypeError,
7737                             "character mapping must be in range(0x%lx)",
7738                             (unsigned long)MAX_UNICODE + 1);
7739                goto onError;
7740            }
7741
7742            if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7743                goto onError;
7744        }
7745        else if (PyUnicode_Check(item)) {
7746            if (PyUnicode_READY(item) == -1)
7747                goto onError;
7748            if (PyUnicode_GET_LENGTH(item) == 1) {
7749                Py_UCS4 value = PyUnicode_READ_CHAR(item, 0);
7750                if (value == 0xFFFE)
7751                    goto Undefined;
7752                if (_PyUnicodeWriter_WriteCharInline(writer, value) < 0)
7753                    goto onError;
7754            }
7755            else {
7756                writer->overallocate = 1;
7757                if (_PyUnicodeWriter_WriteStr(writer, item) == -1)
7758                    goto onError;
7759            }
7760        }
7761        else {
7762            /* wrong return value */
7763            PyErr_SetString(PyExc_TypeError,
7764                            "character mapping must return integer, None or str");
7765            goto onError;
7766        }
7767        Py_CLEAR(item);
7768        ++s;
7769        continue;
7770
7771Undefined:
7772        /* undefined mapping */
7773        Py_CLEAR(item);
7774        startinpos = s-starts;
7775        endinpos = startinpos+1;
7776        if (unicode_decode_call_errorhandler_writer(
7777                errors, &errorHandler,
7778                "charmap", "character maps to <undefined>",
7779                &starts, &e, &startinpos, &endinpos, &exc, &s,
7780                writer)) {
7781            goto onError;
7782        }
7783    }
7784    Py_XDECREF(errorHandler);
7785    Py_XDECREF(exc);
7786    return 0;
7787
7788onError:
7789    Py_XDECREF(item);
7790    Py_XDECREF(errorHandler);
7791    Py_XDECREF(exc);
7792    return -1;
7793}
7794
7795PyObject *
7796PyUnicode_DecodeCharmap(const char *s,
7797                        Py_ssize_t size,
7798                        PyObject *mapping,
7799                        const char *errors)
7800{
7801    _PyUnicodeWriter writer;
7802
7803    /* Default to Latin-1 */
7804    if (mapping == NULL)
7805        return PyUnicode_DecodeLatin1(s, size, errors);
7806
7807    if (size == 0)
7808        _Py_RETURN_UNICODE_EMPTY();
7809    _PyUnicodeWriter_Init(&writer);
7810    writer.min_length = size;
7811    if (_PyUnicodeWriter_Prepare(&writer, writer.min_length, 127) == -1)
7812        goto onError;
7813
7814    if (PyUnicode_CheckExact(mapping)) {
7815        if (charmap_decode_string(s, size, mapping, errors, &writer) < 0)
7816            goto onError;
7817    }
7818    else {
7819        if (charmap_decode_mapping(s, size, mapping, errors, &writer) < 0)
7820            goto onError;
7821    }
7822    return _PyUnicodeWriter_Finish(&writer);
7823
7824  onError:
7825    _PyUnicodeWriter_Dealloc(&writer);
7826    return NULL;
7827}
7828
7829/* Charmap encoding: the lookup table */
7830
7831struct encoding_map {
7832    PyObject_HEAD
7833    unsigned char level1[32];
7834    int count2, count3;
7835    unsigned char level23[1];
7836};
7837
7838static PyObject*
7839encoding_map_size(PyObject *obj, PyObject* args)
7840{
7841    struct encoding_map *map = (struct encoding_map*)obj;
7842    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
7843                           128*map->count3);
7844}
7845
7846static PyMethodDef encoding_map_methods[] = {
7847    {"size", encoding_map_size, METH_NOARGS,
7848     PyDoc_STR("Return the size (in bytes) of this object") },
7849    { 0 }
7850};
7851
7852static void
7853encoding_map_dealloc(PyObject* o)
7854{
7855    PyObject_FREE(o);
7856}
7857
7858static PyTypeObject EncodingMapType = {
7859    PyVarObject_HEAD_INIT(NULL, 0)
7860    "EncodingMap",          /*tp_name*/
7861    sizeof(struct encoding_map),   /*tp_basicsize*/
7862    0,                      /*tp_itemsize*/
7863    /* methods */
7864    encoding_map_dealloc,   /*tp_dealloc*/
7865    0,                      /*tp_print*/
7866    0,                      /*tp_getattr*/
7867    0,                      /*tp_setattr*/
7868    0,                      /*tp_reserved*/
7869    0,                      /*tp_repr*/
7870    0,                      /*tp_as_number*/
7871    0,                      /*tp_as_sequence*/
7872    0,                      /*tp_as_mapping*/
7873    0,                      /*tp_hash*/
7874    0,                      /*tp_call*/
7875    0,                      /*tp_str*/
7876    0,                      /*tp_getattro*/
7877    0,                      /*tp_setattro*/
7878    0,                      /*tp_as_buffer*/
7879    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
7880    0,                      /*tp_doc*/
7881    0,                      /*tp_traverse*/
7882    0,                      /*tp_clear*/
7883    0,                      /*tp_richcompare*/
7884    0,                      /*tp_weaklistoffset*/
7885    0,                      /*tp_iter*/
7886    0,                      /*tp_iternext*/
7887    encoding_map_methods,   /*tp_methods*/
7888    0,                      /*tp_members*/
7889    0,                      /*tp_getset*/
7890    0,                      /*tp_base*/
7891    0,                      /*tp_dict*/
7892    0,                      /*tp_descr_get*/
7893    0,                      /*tp_descr_set*/
7894    0,                      /*tp_dictoffset*/
7895    0,                      /*tp_init*/
7896    0,                      /*tp_alloc*/
7897    0,                      /*tp_new*/
7898    0,                      /*tp_free*/
7899    0,                      /*tp_is_gc*/
7900};
7901
7902PyObject*
7903PyUnicode_BuildEncodingMap(PyObject* string)
7904{
7905    PyObject *result;
7906    struct encoding_map *mresult;
7907    int i;
7908    int need_dict = 0;
7909    unsigned char level1[32];
7910    unsigned char level2[512];
7911    unsigned char *mlevel1, *mlevel2, *mlevel3;
7912    int count2 = 0, count3 = 0;
7913    int kind;
7914    void *data;
7915    Py_ssize_t length;
7916    Py_UCS4 ch;
7917
7918    if (!PyUnicode_Check(string) || !PyUnicode_GET_LENGTH(string)) {
7919        PyErr_BadArgument();
7920        return NULL;
7921    }
7922    kind = PyUnicode_KIND(string);
7923    data = PyUnicode_DATA(string);
7924    length = PyUnicode_GET_LENGTH(string);
7925    length = Py_MIN(length, 256);
7926    memset(level1, 0xFF, sizeof level1);
7927    memset(level2, 0xFF, sizeof level2);
7928
7929    /* If there isn't a one-to-one mapping of NULL to \0,
7930       or if there are non-BMP characters, we need to use
7931       a mapping dictionary. */
7932    if (PyUnicode_READ(kind, data, 0) != 0)
7933        need_dict = 1;
7934    for (i = 1; i < length; i++) {
7935        int l1, l2;
7936        ch = PyUnicode_READ(kind, data, i);
7937        if (ch == 0 || ch > 0xFFFF) {
7938            need_dict = 1;
7939            break;
7940        }
7941        if (ch == 0xFFFE)
7942            /* unmapped character */
7943            continue;
7944        l1 = ch >> 11;
7945        l2 = ch >> 7;
7946        if (level1[l1] == 0xFF)
7947            level1[l1] = count2++;
7948        if (level2[l2] == 0xFF)
7949            level2[l2] = count3++;
7950    }
7951
7952    if (count2 >= 0xFF || count3 >= 0xFF)
7953        need_dict = 1;
7954
7955    if (need_dict) {
7956        PyObject *result = PyDict_New();
7957        PyObject *key, *value;
7958        if (!result)
7959            return NULL;
7960        for (i = 0; i < length; i++) {
7961            key = PyLong_FromLong(PyUnicode_READ(kind, data, i));
7962            value = PyLong_FromLong(i);
7963            if (!key || !value)
7964                goto failed1;
7965            if (PyDict_SetItem(result, key, value) == -1)
7966                goto failed1;
7967            Py_DECREF(key);
7968            Py_DECREF(value);
7969        }
7970        return result;
7971      failed1:
7972        Py_XDECREF(key);
7973        Py_XDECREF(value);
7974        Py_DECREF(result);
7975        return NULL;
7976    }
7977
7978    /* Create a three-level trie */
7979    result = PyObject_MALLOC(sizeof(struct encoding_map) +
7980                             16*count2 + 128*count3 - 1);
7981    if (!result)
7982        return PyErr_NoMemory();
7983    PyObject_Init(result, &EncodingMapType);
7984    mresult = (struct encoding_map*)result;
7985    mresult->count2 = count2;
7986    mresult->count3 = count3;
7987    mlevel1 = mresult->level1;
7988    mlevel2 = mresult->level23;
7989    mlevel3 = mresult->level23 + 16*count2;
7990    memcpy(mlevel1, level1, 32);
7991    memset(mlevel2, 0xFF, 16*count2);
7992    memset(mlevel3, 0, 128*count3);
7993    count3 = 0;
7994    for (i = 1; i < length; i++) {
7995        int o1, o2, o3, i2, i3;
7996        Py_UCS4 ch = PyUnicode_READ(kind, data, i);
7997        if (ch == 0xFFFE)
7998            /* unmapped character */
7999            continue;
8000        o1 = ch>>11;
8001        o2 = (ch>>7) & 0xF;
8002        i2 = 16*mlevel1[o1] + o2;
8003        if (mlevel2[i2] == 0xFF)
8004            mlevel2[i2] = count3++;
8005        o3 = ch & 0x7F;
8006        i3 = 128*mlevel2[i2] + o3;
8007        mlevel3[i3] = i;
8008    }
8009    return result;
8010}
8011
8012static int
8013encoding_map_lookup(Py_UCS4 c, PyObject *mapping)
8014{
8015    struct encoding_map *map = (struct encoding_map*)mapping;
8016    int l1 = c>>11;
8017    int l2 = (c>>7) & 0xF;
8018    int l3 = c & 0x7F;
8019    int i;
8020
8021    if (c > 0xFFFF)
8022        return -1;
8023    if (c == 0)
8024        return 0;
8025    /* level 1*/
8026    i = map->level1[l1];
8027    if (i == 0xFF) {
8028        return -1;
8029    }
8030    /* level 2*/
8031    i = map->level23[16*i+l2];
8032    if (i == 0xFF) {
8033        return -1;
8034    }
8035    /* level 3 */
8036    i = map->level23[16*map->count2 + 128*i + l3];
8037    if (i == 0) {
8038        return -1;
8039    }
8040    return i;
8041}
8042
8043/* Lookup the character ch in the mapping. If the character
8044   can't be found, Py_None is returned (or NULL, if another
8045   error occurred). */
8046static PyObject *
8047charmapencode_lookup(Py_UCS4 c, PyObject *mapping)
8048{
8049    PyObject *w = PyLong_FromLong((long)c);
8050    PyObject *x;
8051
8052    if (w == NULL)
8053        return NULL;
8054    x = PyObject_GetItem(mapping, w);
8055    Py_DECREF(w);
8056    if (x == NULL) {
8057        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8058            /* No mapping found means: mapping is undefined. */
8059            PyErr_Clear();
8060            x = Py_None;
8061            Py_INCREF(x);
8062            return x;
8063        } else
8064            return NULL;
8065    }
8066    else if (x == Py_None)
8067        return x;
8068    else if (PyLong_Check(x)) {
8069        long value = PyLong_AS_LONG(x);
8070        if (value < 0 || value > 255) {
8071            PyErr_SetString(PyExc_TypeError,
8072                            "character mapping must be in range(256)");
8073            Py_DECREF(x);
8074            return NULL;
8075        }
8076        return x;
8077    }
8078    else if (PyBytes_Check(x))
8079        return x;
8080    else {
8081        /* wrong return value */
8082        PyErr_Format(PyExc_TypeError,
8083                     "character mapping must return integer, bytes or None, not %.400s",
8084                     x->ob_type->tp_name);
8085        Py_DECREF(x);
8086        return NULL;
8087    }
8088}
8089
8090static int
8091charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
8092{
8093    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8094    /* exponentially overallocate to minimize reallocations */
8095    if (requiredsize < 2*outsize)
8096        requiredsize = 2*outsize;
8097    if (_PyBytes_Resize(outobj, requiredsize))
8098        return -1;
8099    return 0;
8100}
8101
/* Outcome of charmapencode_output(). */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was encoded and written */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred */
} charmapencode_result;
8105/* lookup the character, put the result in the output string and adjust
8106   various state variables. Resize the output bytes object if not enough
8107   space is available. Return a new reference to the object that
8108   was put in the output buffer, or Py_None, if the mapping was undefined
8109   (in which case no character was written) or NULL, if a
8110   reallocation error occurred. The caller must decref the result */
8111static charmapencode_result
8112charmapencode_output(Py_UCS4 c, PyObject *mapping,
8113                     PyObject **outobj, Py_ssize_t *outpos)
8114{
8115    PyObject *rep;
8116    char *outstart;
8117    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
8118
8119    if (Py_TYPE(mapping) == &EncodingMapType) {
8120        int res = encoding_map_lookup(c, mapping);
8121        Py_ssize_t requiredsize = *outpos+1;
8122        if (res == -1)
8123            return enc_FAILED;
8124        if (outsize<requiredsize)
8125            if (charmapencode_resize(outobj, outpos, requiredsize))
8126                return enc_EXCEPTION;
8127        outstart = PyBytes_AS_STRING(*outobj);
8128        outstart[(*outpos)++] = (char)res;
8129        return enc_SUCCESS;
8130    }
8131
8132    rep = charmapencode_lookup(c, mapping);
8133    if (rep==NULL)
8134        return enc_EXCEPTION;
8135    else if (rep==Py_None) {
8136        Py_DECREF(rep);
8137        return enc_FAILED;
8138    } else {
8139        if (PyLong_Check(rep)) {
8140            Py_ssize_t requiredsize = *outpos+1;
8141            if (outsize<requiredsize)
8142                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8143                    Py_DECREF(rep);
8144                    return enc_EXCEPTION;
8145                }
8146            outstart = PyBytes_AS_STRING(*outobj);
8147            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
8148        }
8149        else {
8150            const char *repchars = PyBytes_AS_STRING(rep);
8151            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
8152            Py_ssize_t requiredsize = *outpos+repsize;
8153            if (outsize<requiredsize)
8154                if (charmapencode_resize(outobj, outpos, requiredsize)) {
8155                    Py_DECREF(rep);
8156                    return enc_EXCEPTION;
8157                }
8158            outstart = PyBytes_AS_STRING(*outobj);
8159            memcpy(outstart + *outpos, repchars, repsize);
8160            *outpos += repsize;
8161        }
8162    }
8163    Py_DECREF(rep);
8164    return enc_SUCCESS;
8165}
8166
8167/* handle an error in PyUnicode_EncodeCharmap
8168   Return 0 on success, -1 on error */
8169static int
8170charmap_encoding_error(
8171    PyObject *unicode, Py_ssize_t *inpos, PyObject *mapping,
8172    PyObject **exceptionObject,
8173    int *known_errorHandler, PyObject **errorHandler, const char *errors,
8174    PyObject **res, Py_ssize_t *respos)
8175{
8176    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8177    Py_ssize_t size, repsize;
8178    Py_ssize_t newpos;
8179    enum PyUnicode_Kind kind;
8180    void *data;
8181    Py_ssize_t index;
8182    /* startpos for collecting unencodable chars */
8183    Py_ssize_t collstartpos = *inpos;
8184    Py_ssize_t collendpos = *inpos+1;
8185    Py_ssize_t collpos;
8186    char *encoding = "charmap";
8187    char *reason = "character maps to <undefined>";
8188    charmapencode_result x;
8189    Py_UCS4 ch;
8190    int val;
8191
8192    if (PyUnicode_READY(unicode) == -1)
8193        return -1;
8194    size = PyUnicode_GET_LENGTH(unicode);
8195    /* find all unencodable characters */
8196    while (collendpos < size) {
8197        PyObject *rep;
8198        if (Py_TYPE(mapping) == &EncodingMapType) {
8199            ch = PyUnicode_READ_CHAR(unicode, collendpos);
8200            val = encoding_map_lookup(ch, mapping);
8201            if (val != -1)
8202                break;
8203            ++collendpos;
8204            continue;
8205        }
8206
8207        ch = PyUnicode_READ_CHAR(unicode, collendpos);
8208        rep = charmapencode_lookup(ch, mapping);
8209        if (rep==NULL)
8210            return -1;
8211        else if (rep!=Py_None) {
8212            Py_DECREF(rep);
8213            break;
8214        }
8215        Py_DECREF(rep);
8216        ++collendpos;
8217    }
8218    /* cache callback name lookup
8219     * (if not done yet, i.e. it's the first error) */
8220    if (*known_errorHandler==-1) {
8221        if ((errors==NULL) || (!strcmp(errors, "strict")))
8222            *known_errorHandler = 1;
8223        else if (!strcmp(errors, "replace"))
8224            *known_errorHandler = 2;
8225        else if (!strcmp(errors, "ignore"))
8226            *known_errorHandler = 3;
8227        else if (!strcmp(errors, "xmlcharrefreplace"))
8228            *known_errorHandler = 4;
8229        else
8230            *known_errorHandler = 0;
8231    }
8232    switch (*known_errorHandler) {
8233    case 1: /* strict */
8234        raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8235        return -1;
8236    case 2: /* replace */
8237        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
8238            x = charmapencode_output('?', mapping, res, respos);
8239            if (x==enc_EXCEPTION) {
8240                return -1;
8241            }
8242            else if (x==enc_FAILED) {
8243                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8244                return -1;
8245            }
8246        }
8247        /* fall through */
8248    case 3: /* ignore */
8249        *inpos = collendpos;
8250        break;
8251    case 4: /* xmlcharrefreplace */
8252        /* generate replacement (temporarily (mis)uses p) */
8253        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
8254            char buffer[2+29+1+1];
8255            char *cp;
8256            sprintf(buffer, "&#%d;", (int)PyUnicode_READ_CHAR(unicode, collpos));
8257            for (cp = buffer; *cp; ++cp) {
8258                x = charmapencode_output(*cp, mapping, res, respos);
8259                if (x==enc_EXCEPTION)
8260                    return -1;
8261                else if (x==enc_FAILED) {
8262                    raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8263                    return -1;
8264                }
8265            }
8266        }
8267        *inpos = collendpos;
8268        break;
8269    default:
8270        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
8271                                                      encoding, reason, unicode, exceptionObject,
8272                                                      collstartpos, collendpos, &newpos);
8273        if (repunicode == NULL)
8274            return -1;
8275        if (PyBytes_Check(repunicode)) {
8276            /* Directly copy bytes result to output. */
8277            Py_ssize_t outsize = PyBytes_Size(*res);
8278            Py_ssize_t requiredsize;
8279            repsize = PyBytes_Size(repunicode);
8280            requiredsize = *respos + repsize;
8281            if (requiredsize > outsize)
8282                /* Make room for all additional bytes. */
8283                if (charmapencode_resize(res, respos, requiredsize)) {
8284                    Py_DECREF(repunicode);
8285                    return -1;
8286                }
8287            memcpy(PyBytes_AsString(*res) + *respos,
8288                   PyBytes_AsString(repunicode),  repsize);
8289            *respos += repsize;
8290            *inpos = newpos;
8291            Py_DECREF(repunicode);
8292            break;
8293        }
8294        /* generate replacement  */
8295        if (PyUnicode_READY(repunicode) == -1) {
8296            Py_DECREF(repunicode);
8297            return -1;
8298        }
8299        repsize = PyUnicode_GET_LENGTH(repunicode);
8300        data = PyUnicode_DATA(repunicode);
8301        kind = PyUnicode_KIND(repunicode);
8302        for (index = 0; index < repsize; index++) {
8303            Py_UCS4 repch = PyUnicode_READ(kind, data, index);
8304            x = charmapencode_output(repch, mapping, res, respos);
8305            if (x==enc_EXCEPTION) {
8306                Py_DECREF(repunicode);
8307                return -1;
8308            }
8309            else if (x==enc_FAILED) {
8310                Py_DECREF(repunicode);
8311                raise_encode_exception(exceptionObject, encoding, unicode, collstartpos, collendpos, reason);
8312                return -1;
8313            }
8314        }
8315        *inpos = newpos;
8316        Py_DECREF(repunicode);
8317    }
8318    return 0;
8319}
8320
8321PyObject *
8322_PyUnicode_EncodeCharmap(PyObject *unicode,
8323                         PyObject *mapping,
8324                         const char *errors)
8325{
8326    /* output object */
8327    PyObject *res = NULL;
8328    /* current input position */
8329    Py_ssize_t inpos = 0;
8330    Py_ssize_t size;
8331    /* current output position */
8332    Py_ssize_t respos = 0;
8333    PyObject *errorHandler = NULL;
8334    PyObject *exc = NULL;
8335    /* the following variable is used for caching string comparisons
8336     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8337     * 3=ignore, 4=xmlcharrefreplace */
8338    int known_errorHandler = -1;
8339    void *data;
8340    int kind;
8341
8342    if (PyUnicode_READY(unicode) == -1)
8343        return NULL;
8344    size = PyUnicode_GET_LENGTH(unicode);
8345    data = PyUnicode_DATA(unicode);
8346    kind = PyUnicode_KIND(unicode);
8347
8348    /* Default to Latin-1 */
8349    if (mapping == NULL)
8350        return unicode_encode_ucs1(unicode, errors, 256);
8351
8352    /* allocate enough for a simple encoding without
8353       replacements, if we need more, we'll resize */
8354    res = PyBytes_FromStringAndSize(NULL, size);
8355    if (res == NULL)
8356        goto onError;
8357    if (size == 0)
8358        return res;
8359
8360    while (inpos<size) {
8361        Py_UCS4 ch = PyUnicode_READ(kind, data, inpos);
8362        /* try to encode it */
8363        charmapencode_result x = charmapencode_output(ch, mapping, &res, &respos);
8364        if (x==enc_EXCEPTION) /* error */
8365            goto onError;
8366        if (x==enc_FAILED) { /* unencodable character */
8367            if (charmap_encoding_error(unicode, &inpos, mapping,
8368                                       &exc,
8369                                       &known_errorHandler, &errorHandler, errors,
8370                                       &res, &respos)) {
8371                goto onError;
8372            }
8373        }
8374        else
8375            /* done with this character => adjust input position */
8376            ++inpos;
8377    }
8378
8379    /* Resize if we allocated to much */
8380    if (respos<PyBytes_GET_SIZE(res))
8381        if (_PyBytes_Resize(&res, respos) < 0)
8382            goto onError;
8383
8384    Py_XDECREF(exc);
8385    Py_XDECREF(errorHandler);
8386    return res;
8387
8388  onError:
8389    Py_XDECREF(res);
8390    Py_XDECREF(exc);
8391    Py_XDECREF(errorHandler);
8392    return NULL;
8393}
8394
8395/* Deprecated */
8396PyObject *
8397PyUnicode_EncodeCharmap(const Py_UNICODE *p,
8398                        Py_ssize_t size,
8399                        PyObject *mapping,
8400                        const char *errors)
8401{
8402    PyObject *result;
8403    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8404    if (unicode == NULL)
8405        return NULL;
8406    result = _PyUnicode_EncodeCharmap(unicode, mapping, errors);
8407    Py_DECREF(unicode);
8408    return result;
8409}
8410
8411PyObject *
8412PyUnicode_AsCharmapString(PyObject *unicode,
8413                          PyObject *mapping)
8414{
8415    if (!PyUnicode_Check(unicode) || mapping == NULL) {
8416        PyErr_BadArgument();
8417        return NULL;
8418    }
8419    return _PyUnicode_EncodeCharmap(unicode, mapping, NULL);
8420}
8421
8422/* create or adjust a UnicodeTranslateError */
8423static void
8424make_translate_exception(PyObject **exceptionObject,
8425                         PyObject *unicode,
8426                         Py_ssize_t startpos, Py_ssize_t endpos,
8427                         const char *reason)
8428{
8429    if (*exceptionObject == NULL) {
8430        *exceptionObject = _PyUnicodeTranslateError_Create(
8431            unicode, startpos, endpos, reason);
8432    }
8433    else {
8434        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
8435            goto onError;
8436        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
8437            goto onError;
8438        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
8439            goto onError;
8440        return;
8441      onError:
8442        Py_CLEAR(*exceptionObject);
8443    }
8444}
8445
8446/* error handling callback helper:
8447   build arguments, call the callback and check the arguments,
8448   put the result into newpos and return the replacement string, which
8449   has to be freed by the caller */
8450static PyObject *
8451unicode_translate_call_errorhandler(const char *errors,
8452                                    PyObject **errorHandler,
8453                                    const char *reason,
8454                                    PyObject *unicode, PyObject **exceptionObject,
8455                                    Py_ssize_t startpos, Py_ssize_t endpos,
8456                                    Py_ssize_t *newpos)
8457{
8458    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
8459
8460    Py_ssize_t i_newpos;
8461    PyObject *restuple;
8462    PyObject *resunicode;
8463
8464    if (*errorHandler == NULL) {
8465        *errorHandler = PyCodec_LookupError(errors);
8466        if (*errorHandler == NULL)
8467            return NULL;
8468    }
8469
8470    make_translate_exception(exceptionObject,
8471                             unicode, startpos, endpos, reason);
8472    if (*exceptionObject == NULL)
8473        return NULL;
8474
8475    restuple = PyObject_CallFunctionObjArgs(
8476        *errorHandler, *exceptionObject, NULL);
8477    if (restuple == NULL)
8478        return NULL;
8479    if (!PyTuple_Check(restuple)) {
8480        PyErr_SetString(PyExc_TypeError, &argparse[4]);
8481        Py_DECREF(restuple);
8482        return NULL;
8483    }
8484    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
8485                          &resunicode, &i_newpos)) {
8486        Py_DECREF(restuple);
8487        return NULL;
8488    }
8489    if (i_newpos<0)
8490        *newpos = PyUnicode_GET_LENGTH(unicode)+i_newpos;
8491    else
8492        *newpos = i_newpos;
8493    if (*newpos<0 || *newpos>PyUnicode_GET_LENGTH(unicode)) {
8494        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
8495        Py_DECREF(restuple);
8496        return NULL;
8497    }
8498    Py_INCREF(resunicode);
8499    Py_DECREF(restuple);
8500    return resunicode;
8501}
8502
8503/* Lookup the character ch in the mapping and put the result in result,
8504   which must be decrefed by the caller.
8505   Return 0 on success, -1 on error */
8506static int
8507charmaptranslate_lookup(Py_UCS4 c, PyObject *mapping, PyObject **result)
8508{
8509    PyObject *w = PyLong_FromLong((long)c);
8510    PyObject *x;
8511
8512    if (w == NULL)
8513        return -1;
8514    x = PyObject_GetItem(mapping, w);
8515    Py_DECREF(w);
8516    if (x == NULL) {
8517        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
8518            /* No mapping found means: use 1:1 mapping. */
8519            PyErr_Clear();
8520            *result = NULL;
8521            return 0;
8522        } else
8523            return -1;
8524    }
8525    else if (x == Py_None) {
8526        *result = x;
8527        return 0;
8528    }
8529    else if (PyLong_Check(x)) {
8530        long value = PyLong_AS_LONG(x);
8531        long max = PyUnicode_GetMax();
8532        if (value < 0 || value > max) {
8533            PyErr_Format(PyExc_TypeError,
8534                         "character mapping must be in range(0x%x)", max+1);
8535            Py_DECREF(x);
8536            return -1;
8537        }
8538        *result = x;
8539        return 0;
8540    }
8541    else if (PyUnicode_Check(x)) {
8542        *result = x;
8543        return 0;
8544    }
8545    else {
8546        /* wrong return value */
8547        PyErr_SetString(PyExc_TypeError,
8548                        "character mapping must return integer, None or str");
8549        Py_DECREF(x);
8550        return -1;
8551    }
8552}
8553/* ensure that *outobj is at least requiredsize characters long,
8554   if not reallocate and adjust various state variables.
8555   Return 0 on success, -1 on error */
8556static int
8557charmaptranslate_makespace(Py_UCS4 **outobj, Py_ssize_t *psize,
8558                               Py_ssize_t requiredsize)
8559{
8560    Py_ssize_t oldsize = *psize;
8561    Py_UCS4 *new_outobj;
8562    if (requiredsize > oldsize) {
8563        /* exponentially overallocate to minimize reallocations */
8564        if (requiredsize < 2 * oldsize)
8565            requiredsize = 2 * oldsize;
8566        new_outobj = PyMem_Realloc(*outobj, requiredsize * sizeof(Py_UCS4));
8567        if (new_outobj == 0)
8568            return -1;
8569        *outobj = new_outobj;
8570        *psize = requiredsize;
8571    }
8572    return 0;
8573}
8574/* lookup the character, put the result in the output string and adjust
8575   various state variables. Return a new reference to the object that
8576   was put in the output buffer in *result, or Py_None, if the mapping was
8577   undefined (in which case no character was written).
8578   The called must decref result.
8579   Return 0 on success, -1 on error. */
8580static int
8581charmaptranslate_output(PyObject *input, Py_ssize_t ipos,
8582                        PyObject *mapping, Py_UCS4 **output,
8583                        Py_ssize_t *osize, Py_ssize_t *opos,
8584                        PyObject **res)
8585{
8586    Py_UCS4 curinp = PyUnicode_READ_CHAR(input, ipos);
8587    if (charmaptranslate_lookup(curinp, mapping, res))
8588        return -1;
8589    if (*res==NULL) {
8590        /* not found => default to 1:1 mapping */
8591        (*output)[(*opos)++] = curinp;
8592    }
8593    else if (*res==Py_None)
8594        ;
8595    else if (PyLong_Check(*res)) {
8596        /* no overflow check, because we know that the space is enough */
8597        (*output)[(*opos)++] = (Py_UCS4)PyLong_AS_LONG(*res);
8598    }
8599    else if (PyUnicode_Check(*res)) {
8600        Py_ssize_t repsize;
8601        if (PyUnicode_READY(*res) == -1)
8602            return -1;
8603        repsize = PyUnicode_GET_LENGTH(*res);
8604        if (repsize==1) {
8605            /* no overflow check, because we know that the space is enough */
8606            (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, 0);
8607        }
8608        else if (repsize!=0) {
8609            /* more than one character */
8610            Py_ssize_t requiredsize = *opos +
8611                (PyUnicode_GET_LENGTH(input) - ipos) +
8612                repsize - 1;
8613            Py_ssize_t i;
8614            if (charmaptranslate_makespace(output, osize, requiredsize))
8615                return -1;
8616            for(i = 0; i < repsize; i++)
8617                (*output)[(*opos)++] = PyUnicode_READ_CHAR(*res, i);
8618        }
8619    }
8620    else
8621        return -1;
8622    return 0;
8623}
8624
8625PyObject *
8626_PyUnicode_TranslateCharmap(PyObject *input,
8627                            PyObject *mapping,
8628                            const char *errors)
8629{
8630    /* input object */
8631    char *idata;
8632    Py_ssize_t size, i;
8633    int kind;
8634    /* output buffer */
8635    Py_UCS4 *output = NULL;
8636    Py_ssize_t osize;
8637    PyObject *res;
8638    /* current output position */
8639    Py_ssize_t opos;
8640    char *reason = "character maps to <undefined>";
8641    PyObject *errorHandler = NULL;
8642    PyObject *exc = NULL;
8643    /* the following variable is used for caching string comparisons
8644     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
8645     * 3=ignore, 4=xmlcharrefreplace */
8646    int known_errorHandler = -1;
8647
8648    if (mapping == NULL) {
8649        PyErr_BadArgument();
8650        return NULL;
8651    }
8652
8653    if (PyUnicode_READY(input) == -1)
8654        return NULL;
8655    idata = (char*)PyUnicode_DATA(input);
8656    kind = PyUnicode_KIND(input);
8657    size = PyUnicode_GET_LENGTH(input);
8658    i = 0;
8659
8660    if (size == 0) {
8661        Py_INCREF(input);
8662        return input;
8663    }
8664
8665    /* allocate enough for a simple 1:1 translation without
8666       replacements, if we need more, we'll resize */
8667    osize = size;
8668    output = PyMem_NEW(Py_UCS4, osize);
8669    opos = 0;
8670    if (output == NULL) {
8671        PyErr_NoMemory();
8672        goto onError;
8673    }
8674
8675    while (i<size) {
8676        /* try to encode it */
8677        PyObject *x = NULL;
8678        if (charmaptranslate_output(input, i, mapping,
8679                                    &output, &osize, &opos, &x)) {
8680            Py_XDECREF(x);
8681            goto onError;
8682        }
8683        Py_XDECREF(x);
8684        if (x!=Py_None) /* it worked => adjust input pointer */
8685            ++i;
8686        else { /* untranslatable character */
8687            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
8688            Py_ssize_t repsize;
8689            Py_ssize_t newpos;
8690            Py_ssize_t uni2;
8691            /* startpos for collecting untranslatable chars */
8692            Py_ssize_t collstart = i;
8693            Py_ssize_t collend = i+1;
8694            Py_ssize_t coll;
8695
8696            /* find all untranslatable characters */
8697            while (collend < size) {
8698                if (charmaptranslate_lookup(PyUnicode_READ(kind,idata, collend), mapping, &x))
8699                    goto onError;
8700                Py_XDECREF(x);
8701                if (x!=Py_None)
8702                    break;
8703                ++collend;
8704            }
8705            /* cache callback name lookup
8706             * (if not done yet, i.e. it's the first error) */
8707            if (known_errorHandler==-1) {
8708                if ((errors==NULL) || (!strcmp(errors, "strict")))
8709                    known_errorHandler = 1;
8710                else if (!strcmp(errors, "replace"))
8711                    known_errorHandler = 2;
8712                else if (!strcmp(errors, "ignore"))
8713                    known_errorHandler = 3;
8714                else if (!strcmp(errors, "xmlcharrefreplace"))
8715                    known_errorHandler = 4;
8716                else
8717                    known_errorHandler = 0;
8718            }
8719            switch (known_errorHandler) {
8720            case 1: /* strict */
8721                make_translate_exception(&exc,
8722                                         input, collstart, collend, reason);
8723                if (exc != NULL)
8724                    PyCodec_StrictErrors(exc);
8725                goto onError;
8726            case 2: /* replace */
8727                /* No need to check for space, this is a 1:1 replacement */
8728                for (coll = collstart; coll<collend; coll++)
8729                    output[opos++] = '?';
8730                /* fall through */
8731            case 3: /* ignore */
8732                i = collend;
8733                break;
8734            case 4: /* xmlcharrefreplace */
8735                /* generate replacement (temporarily (mis)uses i) */
8736                for (i = collstart; i < collend; ++i) {
8737                    char buffer[2+29+1+1];
8738                    char *cp;
8739                    sprintf(buffer, "&#%d;", PyUnicode_READ(kind, idata, i));
8740                    if (charmaptranslate_makespace(&output, &osize,
8741                                                   opos+strlen(buffer)+(size-collend)))
8742                        goto onError;
8743                    for (cp = buffer; *cp; ++cp)
8744                        output[opos++] = *cp;
8745                }
8746                i = collend;
8747                break;
8748            default:
8749                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
8750                                                                 reason, input, &exc,
8751                                                                 collstart, collend, &newpos);
8752                if (repunicode == NULL)
8753                    goto onError;
8754                if (PyUnicode_READY(repunicode) == -1) {
8755                    Py_DECREF(repunicode);
8756                    goto onError;
8757                }
8758                /* generate replacement  */
8759                repsize = PyUnicode_GET_LENGTH(repunicode);
8760                if (charmaptranslate_makespace(&output, &osize,
8761                                               opos+repsize+(size-collend))) {
8762                    Py_DECREF(repunicode);
8763                    goto onError;
8764                }
8765                for (uni2 = 0; repsize-->0; ++uni2)
8766                    output[opos++] = PyUnicode_READ_CHAR(repunicode, uni2);
8767                i = newpos;
8768                Py_DECREF(repunicode);
8769            }
8770        }
8771    }
8772    res = PyUnicode_FromKindAndData(PyUnicode_4BYTE_KIND, output, opos);
8773    if (!res)
8774        goto onError;
8775    PyMem_Free(output);
8776    Py_XDECREF(exc);
8777    Py_XDECREF(errorHandler);
8778    return res;
8779
8780  onError:
8781    PyMem_Free(output);
8782    Py_XDECREF(exc);
8783    Py_XDECREF(errorHandler);
8784    return NULL;
8785}
8786
8787/* Deprecated. Use PyUnicode_Translate instead. */
8788PyObject *
8789PyUnicode_TranslateCharmap(const Py_UNICODE *p,
8790                           Py_ssize_t size,
8791                           PyObject *mapping,
8792                           const char *errors)
8793{
8794    PyObject *result;
8795    PyObject *unicode = PyUnicode_FromUnicode(p, size);
8796    if (!unicode)
8797        return NULL;
8798    result = _PyUnicode_TranslateCharmap(unicode, mapping, errors);
8799    Py_DECREF(unicode);
8800    return result;
8801}
8802
8803PyObject *
8804PyUnicode_Translate(PyObject *str,
8805                    PyObject *mapping,
8806                    const char *errors)
8807{
8808    PyObject *result;
8809
8810    str = PyUnicode_FromObject(str);
8811    if (str == NULL)
8812        return NULL;
8813    result = _PyUnicode_TranslateCharmap(str, mapping, errors);
8814    Py_DECREF(str);
8815    return result;
8816}
8817
8818static Py_UCS4
8819fix_decimal_and_space_to_ascii(PyObject *self)
8820{
8821    /* No need to call PyUnicode_READY(self) because this function is only
8822       called as a callback from fixup() which does it already. */
8823    const Py_ssize_t len = PyUnicode_GET_LENGTH(self);
8824    const int kind = PyUnicode_KIND(self);
8825    void *data = PyUnicode_DATA(self);
8826    Py_UCS4 maxchar = 127, ch, fixed;
8827    int modified = 0;
8828    Py_ssize_t i;
8829
8830    for (i = 0; i < len; ++i) {
8831        ch = PyUnicode_READ(kind, data, i);
8832        fixed = 0;
8833        if (ch > 127) {
8834            if (Py_UNICODE_ISSPACE(ch))
8835                fixed = ' ';
8836            else {
8837                const int decimal = Py_UNICODE_TODECIMAL(ch);
8838                if (decimal >= 0)
8839                    fixed = '0' + decimal;
8840            }
8841            if (fixed != 0) {
8842                modified = 1;
8843                maxchar = Py_MAX(maxchar, fixed);
8844                PyUnicode_WRITE(kind, data, i, fixed);
8845            }
8846            else
8847                maxchar = Py_MAX(maxchar, ch);
8848        }
8849    }
8850
8851    return (modified) ? maxchar : 0;
8852}
8853
8854PyObject *
8855_PyUnicode_TransformDecimalAndSpaceToASCII(PyObject *unicode)
8856{
8857    if (!PyUnicode_Check(unicode)) {
8858        PyErr_BadInternalCall();
8859        return NULL;
8860    }
8861    if (PyUnicode_READY(unicode) == -1)
8862        return NULL;
8863    if (PyUnicode_MAX_CHAR_VALUE(unicode) <= 127) {
8864        /* If the string is already ASCII, just return the same string */
8865        Py_INCREF(unicode);
8866        return unicode;
8867    }
8868    return fixup(unicode, fix_decimal_and_space_to_ascii);
8869}
8870
8871PyObject *
8872PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
8873                                  Py_ssize_t length)
8874{
8875    PyObject *decimal;
8876    Py_ssize_t i;
8877    Py_UCS4 maxchar;
8878    enum PyUnicode_Kind kind;
8879    void *data;
8880
8881    maxchar = 127;
8882    for (i = 0; i < length; i++) {
8883        Py_UNICODE ch = s[i];
8884        if (ch > 127) {
8885            int decimal = Py_UNICODE_TODECIMAL(ch);
8886            if (decimal >= 0)
8887                ch = '0' + decimal;
8888            maxchar = Py_MAX(maxchar, ch);
8889        }
8890    }
8891
8892    /* Copy to a new string */
8893    decimal = PyUnicode_New(length, maxchar);
8894    if (decimal == NULL)
8895        return decimal;
8896    kind = PyUnicode_KIND(decimal);
8897    data = PyUnicode_DATA(decimal);
8898    /* Iterate over code points */
8899    for (i = 0; i < length; i++) {
8900        Py_UNICODE ch = s[i];
8901        if (ch > 127) {
8902            int decimal = Py_UNICODE_TODECIMAL(ch);
8903            if (decimal >= 0)
8904                ch = '0' + decimal;
8905        }
8906        PyUnicode_WRITE(kind, data, i, ch);
8907    }
8908    return unicode_result(decimal);
8909}
8910/* --- Decimal Encoder ---------------------------------------------------- */
8911
8912int
8913PyUnicode_EncodeDecimal(Py_UNICODE *s,
8914                        Py_ssize_t length,
8915                        char *output,
8916                        const char *errors)
8917{
8918    PyObject *unicode;
8919    Py_ssize_t i;
8920    enum PyUnicode_Kind kind;
8921    void *data;
8922
8923    if (output == NULL) {
8924        PyErr_BadArgument();
8925        return -1;
8926    }
8927
8928    unicode = PyUnicode_FromUnicode(s, length);
8929    if (unicode == NULL)
8930        return -1;
8931
8932    if (PyUnicode_READY(unicode) == -1) {
8933        Py_DECREF(unicode);
8934        return -1;
8935    }
8936    kind = PyUnicode_KIND(unicode);
8937    data = PyUnicode_DATA(unicode);
8938
8939    for (i=0; i < length; ) {
8940        PyObject *exc;
8941        Py_UCS4 ch;
8942        int decimal;
8943        Py_ssize_t startpos;
8944
8945        ch = PyUnicode_READ(kind, data, i);
8946
8947        if (Py_UNICODE_ISSPACE(ch)) {
8948            *output++ = ' ';
8949            i++;
8950            continue;
8951        }
8952        decimal = Py_UNICODE_TODECIMAL(ch);
8953        if (decimal >= 0) {
8954            *output++ = '0' + decimal;
8955            i++;
8956            continue;
8957        }
8958        if (0 < ch && ch < 256) {
8959            *output++ = (char)ch;
8960            i++;
8961            continue;
8962        }
8963
8964        startpos = i;
8965        exc = NULL;
8966        raise_encode_exception(&exc, "decimal", unicode,
8967                               startpos, startpos+1,
8968                               "invalid decimal Unicode string");
8969        Py_XDECREF(exc);
8970        Py_DECREF(unicode);
8971        return -1;
8972    }
8973    /* 0-terminate the output string */
8974    *output++ = '\0';
8975    Py_DECREF(unicode);
8976    return 0;
8977}
8978
8979/* --- Helpers ------------------------------------------------------------ */
8980
8981static Py_ssize_t
8982any_find_slice(int direction, PyObject* s1, PyObject* s2,
8983               Py_ssize_t start,
8984               Py_ssize_t end)
8985{
8986    int kind1, kind2, kind;
8987    void *buf1, *buf2;
8988    Py_ssize_t len1, len2, result;
8989
8990    kind1 = PyUnicode_KIND(s1);
8991    kind2 = PyUnicode_KIND(s2);
8992    kind = kind1 > kind2 ? kind1 : kind2;
8993    buf1 = PyUnicode_DATA(s1);
8994    buf2 = PyUnicode_DATA(s2);
8995    if (kind1 != kind)
8996        buf1 = _PyUnicode_AsKind(s1, kind);
8997    if (!buf1)
8998        return -2;
8999    if (kind2 != kind)
9000        buf2 = _PyUnicode_AsKind(s2, kind);
9001    if (!buf2) {
9002        if (kind1 != kind) PyMem_Free(buf1);
9003        return -2;
9004    }
9005    len1 = PyUnicode_GET_LENGTH(s1);
9006    len2 = PyUnicode_GET_LENGTH(s2);
9007
9008    if (direction > 0) {
9009        switch (kind) {
9010        case PyUnicode_1BYTE_KIND:
9011            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9012                result = asciilib_find_slice(buf1, len1, buf2, len2, start, end);
9013            else
9014                result = ucs1lib_find_slice(buf1, len1, buf2, len2, start, end);
9015            break;
9016        case PyUnicode_2BYTE_KIND:
9017            result = ucs2lib_find_slice(buf1, len1, buf2, len2, start, end);
9018            break;
9019        case PyUnicode_4BYTE_KIND:
9020            result = ucs4lib_find_slice(buf1, len1, buf2, len2, start, end);
9021            break;
9022        default:
9023            assert(0); result = -2;
9024        }
9025    }
9026    else {
9027        switch (kind) {
9028        case PyUnicode_1BYTE_KIND:
9029            if (PyUnicode_IS_ASCII(s1) && PyUnicode_IS_ASCII(s2))
9030                result = asciilib_rfind_slice(buf1, len1, buf2, len2, start, end);
9031            else
9032                result = ucs1lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9033            break;
9034        case PyUnicode_2BYTE_KIND:
9035            result = ucs2lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9036            break;
9037        case PyUnicode_4BYTE_KIND:
9038            result = ucs4lib_rfind_slice(buf1, len1, buf2, len2, start, end);
9039            break;
9040        default:
9041            assert(0); result = -2;
9042        }
9043    }
9044
9045    if (kind1 != kind)
9046        PyMem_Free(buf1);
9047    if (kind2 != kind)
9048        PyMem_Free(buf2);
9049
9050    return result;
9051}
9052
9053Py_ssize_t
9054_PyUnicode_InsertThousandsGrouping(
9055    PyObject *unicode, Py_ssize_t index,
9056    Py_ssize_t n_buffer,
9057    void *digits, Py_ssize_t n_digits,
9058    Py_ssize_t min_width,
9059    const char *grouping, PyObject *thousands_sep,
9060    Py_UCS4 *maxchar)
9061{
9062    unsigned int kind, thousands_sep_kind;
9063    char *data, *thousands_sep_data;
9064    Py_ssize_t thousands_sep_len;
9065    Py_ssize_t len;
9066
9067    if (unicode != NULL) {
9068        kind = PyUnicode_KIND(unicode);
9069        data = (char *) PyUnicode_DATA(unicode) + index * kind;
9070    }
9071    else {
9072        kind = PyUnicode_1BYTE_KIND;
9073        data = NULL;
9074    }
9075    thousands_sep_kind = PyUnicode_KIND(thousands_sep);
9076    thousands_sep_data = PyUnicode_DATA(thousands_sep);
9077    thousands_sep_len = PyUnicode_GET_LENGTH(thousands_sep);
9078    if (unicode != NULL && thousands_sep_kind != kind) {
9079        if (thousands_sep_kind < kind) {
9080            thousands_sep_data = _PyUnicode_AsKind(thousands_sep, kind);
9081            if (!thousands_sep_data)
9082                return -1;
9083        }
9084        else {
9085            data = _PyUnicode_AsKind(unicode, thousands_sep_kind);
9086            if (!data)
9087                return -1;
9088        }
9089    }
9090
9091    switch (kind) {
9092    case PyUnicode_1BYTE_KIND:
9093        if (unicode != NULL && PyUnicode_IS_ASCII(unicode))
9094            len = asciilib_InsertThousandsGrouping(
9095                (Py_UCS1 *) data, n_buffer, (Py_UCS1 *) digits, n_digits,
9096                min_width, grouping,
9097                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9098        else
9099            len = ucs1lib_InsertThousandsGrouping(
9100                (Py_UCS1*)data, n_buffer, (Py_UCS1*)digits, n_digits,
9101                min_width, grouping,
9102                (Py_UCS1 *) thousands_sep_data, thousands_sep_len);
9103        break;
9104    case PyUnicode_2BYTE_KIND:
9105        len = ucs2lib_InsertThousandsGrouping(
9106            (Py_UCS2 *) data, n_buffer, (Py_UCS2 *) digits, n_digits,
9107            min_width, grouping,
9108            (Py_UCS2 *) thousands_sep_data, thousands_sep_len);
9109        break;
9110    case PyUnicode_4BYTE_KIND:
9111        len = ucs4lib_InsertThousandsGrouping(
9112            (Py_UCS4 *) data, n_buffer, (Py_UCS4 *) digits, n_digits,
9113            min_width, grouping,
9114            (Py_UCS4 *) thousands_sep_data, thousands_sep_len);
9115        break;
9116    default:
9117        assert(0);
9118        return -1;
9119    }
9120    if (unicode != NULL && thousands_sep_kind != kind) {
9121        if (thousands_sep_kind < kind)
9122            PyMem_Free(thousands_sep_data);
9123        else
9124            PyMem_Free(data);
9125    }
9126    if (unicode == NULL) {
9127        *maxchar = 127;
9128        if (len != n_digits) {
9129            *maxchar = Py_MAX(*maxchar,
9130                                   PyUnicode_MAX_CHAR_VALUE(thousands_sep));
9131        }
9132    }
9133    return len;
9134}
9135
9136
9137/* Helper macro to fix up start/end slice values, in place.
   - end:   values greater than len become len; negative values have len
            added and are then raised to 0 if still negative.
   - start: negative values have len added and are then raised to 0 if
            still negative.  start is NOT limited to len, so start may
            remain greater than end afterwards.
   `start` and `end` must be modifiable lvalues (they are assigned to);
   every argument is evaluated more than once, so it should have no side
   effects.
   NOTE(review): the expansion is two consecutive if statements that are
   not wrapped in do { } while (0); use it only as a complete statement
   inside a braced block, never as the unbraced body of an if/else or
   loop. */
9138#define ADJUST_INDICES(start, end, len)         \
9139    if (end > len)                              \
9140        end = len;                              \
9141    else if (end < 0) {                         \
9142        end += len;                             \
9143        if (end < 0)                            \
9144            end = 0;                            \
9145    }                                           \
9146    if (start < 0) {                            \
9147        start += len;                           \
9148        if (start < 0)                          \
9149            start = 0;                          \
9150    }
9151
9152Py_ssize_t
9153PyUnicode_Count(PyObject *str,
9154                PyObject *substr,
9155                Py_ssize_t start,
9156                Py_ssize_t end)
9157{
9158    Py_ssize_t result;
9159    PyObject* str_obj;
9160    PyObject* sub_obj;
9161    int kind1, kind2, kind;
9162    void *buf1 = NULL, *buf2 = NULL;
9163    Py_ssize_t len1, len2;
9164
9165    str_obj = PyUnicode_FromObject(str);
9166    if (!str_obj)
9167        return -1;
9168    sub_obj = PyUnicode_FromObject(substr);
9169    if (!sub_obj) {
9170        Py_DECREF(str_obj);
9171        return -1;
9172    }
9173    if (PyUnicode_READY(sub_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
9174        Py_DECREF(sub_obj);
9175        Py_DECREF(str_obj);
9176        return -1;
9177    }
9178
9179    kind1 = PyUnicode_KIND(str_obj);
9180    kind2 = PyUnicode_KIND(sub_obj);
9181    kind = kind1;
9182    buf1 = PyUnicode_DATA(str_obj);
9183    buf2 = PyUnicode_DATA(sub_obj);
9184    if (kind2 != kind) {
9185        if (kind2 > kind) {
9186            Py_DECREF(sub_obj);
9187            Py_DECREF(str_obj);
9188            return 0;
9189        }
9190        buf2 = _PyUnicode_AsKind(sub_obj, kind);
9191    }
9192    if (!buf2)
9193        goto onError;
9194    len1 = PyUnicode_GET_LENGTH(str_obj);
9195    len2 = PyUnicode_GET_LENGTH(sub_obj);
9196
9197    ADJUST_INDICES(start, end, len1);
9198    switch (kind) {
9199    case PyUnicode_1BYTE_KIND:
9200        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sub_obj))
9201            result = asciilib_count(
9202                ((Py_UCS1*)buf1) + start, end - start,
9203                buf2, len2, PY_SSIZE_T_MAX
9204                );
9205        else
9206            result = ucs1lib_count(
9207                ((Py_UCS1*)buf1) + start, end - start,
9208                buf2, len2, PY_SSIZE_T_MAX
9209                );
9210        break;
9211    case PyUnicode_2BYTE_KIND:
9212        result = ucs2lib_count(
9213            ((Py_UCS2*)buf1) + start, end - start,
9214            buf2, len2, PY_SSIZE_T_MAX
9215            );
9216        break;
9217    case PyUnicode_4BYTE_KIND:
9218        result = ucs4lib_count(
9219            ((Py_UCS4*)buf1) + start, end - start,
9220            buf2, len2, PY_SSIZE_T_MAX
9221            );
9222        break;
9223    default:
9224        assert(0); result = 0;
9225    }
9226
9227    Py_DECREF(sub_obj);
9228    Py_DECREF(str_obj);
9229
9230    if (kind2 != kind)
9231        PyMem_Free(buf2);
9232
9233    return result;
9234  onError:
9235    Py_DECREF(sub_obj);
9236    Py_DECREF(str_obj);
9237    if (kind2 != kind && buf2)
9238        PyMem_Free(buf2);
9239    return -1;
9240}
9241
9242Py_ssize_t
9243PyUnicode_Find(PyObject *str,
9244               PyObject *sub,
9245               Py_ssize_t start,
9246               Py_ssize_t end,
9247               int direction)
9248{
9249    Py_ssize_t result;
9250
9251    str = PyUnicode_FromObject(str);
9252    if (!str)
9253        return -2;
9254    sub = PyUnicode_FromObject(sub);
9255    if (!sub) {
9256        Py_DECREF(str);
9257        return -2;
9258    }
9259    if (PyUnicode_READY(sub) == -1 || PyUnicode_READY(str) == -1) {
9260        Py_DECREF(sub);
9261        Py_DECREF(str);
9262        return -2;
9263    }
9264
9265    result = any_find_slice(direction,
9266        str, sub, start, end
9267        );
9268
9269    Py_DECREF(str);
9270    Py_DECREF(sub);
9271
9272    return result;
9273}
9274
9275Py_ssize_t
9276PyUnicode_FindChar(PyObject *str, Py_UCS4 ch,
9277                   Py_ssize_t start, Py_ssize_t end,
9278                   int direction)
9279{
9280    int kind;
9281    Py_ssize_t result;
9282    if (PyUnicode_READY(str) == -1)
9283        return -2;
9284    if (start < 0 || end < 0) {
9285        PyErr_SetString(PyExc_IndexError, "string index out of range");
9286        return -2;
9287    }
9288    if (end > PyUnicode_GET_LENGTH(str))
9289        end = PyUnicode_GET_LENGTH(str);
9290    kind = PyUnicode_KIND(str);
9291    result = findchar(PyUnicode_1BYTE_DATA(str) + kind*start,
9292                      kind, end-start, ch, direction);
9293    if (result == -1)
9294        return -1;
9295    else
9296        return start + result;
9297}
9298
9299static int
9300tailmatch(PyObject *self,
9301          PyObject *substring,
9302          Py_ssize_t start,
9303          Py_ssize_t end,
9304          int direction)
9305{
9306    int kind_self;
9307    int kind_sub;
9308    void *data_self;
9309    void *data_sub;
9310    Py_ssize_t offset;
9311    Py_ssize_t i;
9312    Py_ssize_t end_sub;
9313
9314    if (PyUnicode_READY(self) == -1 ||
9315        PyUnicode_READY(substring) == -1)
9316        return -1;
9317
9318    if (PyUnicode_GET_LENGTH(substring) == 0)
9319        return 1;
9320
9321    ADJUST_INDICES(start, end, PyUnicode_GET_LENGTH(self));
9322    end -= PyUnicode_GET_LENGTH(substring);
9323    if (end < start)
9324        return 0;
9325
9326    kind_self = PyUnicode_KIND(self);
9327    data_self = PyUnicode_DATA(self);
9328    kind_sub = PyUnicode_KIND(substring);
9329    data_sub = PyUnicode_DATA(substring);
9330    end_sub = PyUnicode_GET_LENGTH(substring) - 1;
9331
9332    if (direction > 0)
9333        offset = end;
9334    else
9335        offset = start;
9336
9337    if (PyUnicode_READ(kind_self, data_self, offset) ==
9338        PyUnicode_READ(kind_sub, data_sub, 0) &&
9339        PyUnicode_READ(kind_self, data_self, offset + end_sub) ==
9340        PyUnicode_READ(kind_sub, data_sub, end_sub)) {
9341        /* If both are of the same kind, memcmp is sufficient */
9342        if (kind_self == kind_sub) {
9343            return ! memcmp((char *)data_self +
9344                                (offset * PyUnicode_KIND(substring)),
9345                            data_sub,
9346                            PyUnicode_GET_LENGTH(substring) *
9347                                PyUnicode_KIND(substring));
9348        }
9349        /* otherwise we have to compare each character by first accesing it */
9350        else {
9351            /* We do not need to compare 0 and len(substring)-1 because
9352               the if statement above ensured already that they are equal
9353               when we end up here. */
9354            for (i = 1; i < end_sub; ++i) {
9355                if (PyUnicode_READ(kind_self, data_self, offset + i) !=
9356                    PyUnicode_READ(kind_sub, data_sub, i))
9357                    return 0;
9358            }
9359            return 1;
9360        }
9361    }
9362
9363    return 0;
9364}
9365
9366Py_ssize_t
9367PyUnicode_Tailmatch(PyObject *str,
9368                    PyObject *substr,
9369                    Py_ssize_t start,
9370                    Py_ssize_t end,
9371                    int direction)
9372{
9373    Py_ssize_t result;
9374
9375    str = PyUnicode_FromObject(str);
9376    if (str == NULL)
9377        return -1;
9378    substr = PyUnicode_FromObject(substr);
9379    if (substr == NULL) {
9380        Py_DECREF(str);
9381        return -1;
9382    }
9383
9384    result = tailmatch(str, substr,
9385                       start, end, direction);
9386    Py_DECREF(str);
9387    Py_DECREF(substr);
9388    return result;
9389}
9390
9391/* Apply fixfct filter to the Unicode object self and return a
9392   reference to the modified object */
9393
9394static PyObject *
9395fixup(PyObject *self,
9396      Py_UCS4 (*fixfct)(PyObject *s))
9397{
9398    PyObject *u;
9399    Py_UCS4 maxchar_old, maxchar_new = 0;
9400    PyObject *v;
9401
9402    u = _PyUnicode_Copy(self);
9403    if (u == NULL)
9404        return NULL;
9405    maxchar_old = PyUnicode_MAX_CHAR_VALUE(u);
9406
9407    /* fix functions return the new maximum character in a string,
9408       if the kind of the resulting unicode object does not change,
9409       everything is fine.  Otherwise we need to change the string kind
9410       and re-run the fix function. */
9411    maxchar_new = fixfct(u);
9412
9413    if (maxchar_new == 0) {
9414        /* no changes */;
9415        if (PyUnicode_CheckExact(self)) {
9416            Py_DECREF(u);
9417            Py_INCREF(self);
9418            return self;
9419        }
9420        else
9421            return u;
9422    }
9423
9424    maxchar_new = align_maxchar(maxchar_new);
9425
9426    if (maxchar_new == maxchar_old)
9427        return u;
9428
9429    /* In case the maximum character changed, we need to
9430       convert the string to the new category. */
9431    v = PyUnicode_New(PyUnicode_GET_LENGTH(self), maxchar_new);
9432    if (v == NULL) {
9433        Py_DECREF(u);
9434        return NULL;
9435    }
9436    if (maxchar_new > maxchar_old) {
9437        /* If the maxchar increased so that the kind changed, not all
9438           characters are representable anymore and we need to fix the
9439           string again. This only happens in very few cases. */
9440        _PyUnicode_FastCopyCharacters(v, 0,
9441                                      self, 0, PyUnicode_GET_LENGTH(self));
9442        maxchar_old = fixfct(v);
9443        assert(maxchar_old > 0 && maxchar_old <= maxchar_new);
9444    }
9445    else {
9446        _PyUnicode_FastCopyCharacters(v, 0,
9447                                      u, 0, PyUnicode_GET_LENGTH(self));
9448    }
9449    Py_DECREF(u);
9450    assert(_PyUnicode_CheckConsistency(v, 1));
9451    return v;
9452}
9453
9454static PyObject *
9455ascii_upper_or_lower(PyObject *self, int lower)
9456{
9457    Py_ssize_t len = PyUnicode_GET_LENGTH(self);
9458    char *resdata, *data = PyUnicode_DATA(self);
9459    PyObject *res;
9460
9461    res = PyUnicode_New(len, 127);
9462    if (res == NULL)
9463        return NULL;
9464    resdata = PyUnicode_DATA(res);
9465    if (lower)
9466        _Py_bytes_lower(resdata, data, len);
9467    else
9468        _Py_bytes_upper(resdata, data, len);
9469    return res;
9470}
9471
9472static Py_UCS4
9473handle_capital_sigma(int kind, void *data, Py_ssize_t length, Py_ssize_t i)
9474{
9475    Py_ssize_t j;
9476    int final_sigma;
9477    Py_UCS4 c = 0;
9478    /* U+03A3 is in the Final_Sigma context when, it is found like this:
9479
9480     \p{cased}\p{case-ignorable}*U+03A3!(\p{case-ignorable}*\p{cased})
9481
9482    where ! is a negation and \p{xxx} is a character with property xxx.
9483    */
9484    for (j = i - 1; j >= 0; j--) {
9485        c = PyUnicode_READ(kind, data, j);
9486        if (!_PyUnicode_IsCaseIgnorable(c))
9487            break;
9488    }
9489    final_sigma = j >= 0 && _PyUnicode_IsCased(c);
9490    if (final_sigma) {
9491        for (j = i + 1; j < length; j++) {
9492            c = PyUnicode_READ(kind, data, j);
9493            if (!_PyUnicode_IsCaseIgnorable(c))
9494                break;
9495        }
9496        final_sigma = j == length || !_PyUnicode_IsCased(c);
9497    }
9498    return (final_sigma) ? 0x3C2 : 0x3C3;
9499}
9500
9501static int
9502lower_ucs4(int kind, void *data, Py_ssize_t length, Py_ssize_t i,
9503           Py_UCS4 c, Py_UCS4 *mapped)
9504{
9505    /* Obscure special case. */
9506    if (c == 0x3A3) {
9507        mapped[0] = handle_capital_sigma(kind, data, length, i);
9508        return 1;
9509    }
9510    return _PyUnicode_ToLowerFull(c, mapped);
9511}
9512
9513static Py_ssize_t
9514do_capitalize(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9515{
9516    Py_ssize_t i, k = 0;
9517    int n_res, j;
9518    Py_UCS4 c, mapped[3];
9519
9520    c = PyUnicode_READ(kind, data, 0);
9521    n_res = _PyUnicode_ToUpperFull(c, mapped);
9522    for (j = 0; j < n_res; j++) {
9523        *maxchar = Py_MAX(*maxchar, mapped[j]);
9524        res[k++] = mapped[j];
9525    }
9526    for (i = 1; i < length; i++) {
9527        c = PyUnicode_READ(kind, data, i);
9528        n_res = lower_ucs4(kind, data, length, i, c, mapped);
9529        for (j = 0; j < n_res; j++) {
9530            *maxchar = Py_MAX(*maxchar, mapped[j]);
9531            res[k++] = mapped[j];
9532        }
9533    }
9534    return k;
9535}
9536
9537static Py_ssize_t
9538do_swapcase(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar) {
9539    Py_ssize_t i, k = 0;
9540
9541    for (i = 0; i < length; i++) {
9542        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9543        int n_res, j;
9544        if (Py_UNICODE_ISUPPER(c)) {
9545            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9546        }
9547        else if (Py_UNICODE_ISLOWER(c)) {
9548            n_res = _PyUnicode_ToUpperFull(c, mapped);
9549        }
9550        else {
9551            n_res = 1;
9552            mapped[0] = c;
9553        }
9554        for (j = 0; j < n_res; j++) {
9555            *maxchar = Py_MAX(*maxchar, mapped[j]);
9556            res[k++] = mapped[j];
9557        }
9558    }
9559    return k;
9560}
9561
9562static Py_ssize_t
9563do_upper_or_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res,
9564                  Py_UCS4 *maxchar, int lower)
9565{
9566    Py_ssize_t i, k = 0;
9567
9568    for (i = 0; i < length; i++) {
9569        Py_UCS4 c = PyUnicode_READ(kind, data, i), mapped[3];
9570        int n_res, j;
9571        if (lower)
9572            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9573        else
9574            n_res = _PyUnicode_ToUpperFull(c, mapped);
9575        for (j = 0; j < n_res; j++) {
9576            *maxchar = Py_MAX(*maxchar, mapped[j]);
9577            res[k++] = mapped[j];
9578        }
9579    }
9580    return k;
9581}
9582
9583static Py_ssize_t
9584do_upper(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9585{
9586    return do_upper_or_lower(kind, data, length, res, maxchar, 0);
9587}
9588
9589static Py_ssize_t
9590do_lower(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9591{
9592    return do_upper_or_lower(kind, data, length, res, maxchar, 1);
9593}
9594
9595static Py_ssize_t
9596do_casefold(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9597{
9598    Py_ssize_t i, k = 0;
9599
9600    for (i = 0; i < length; i++) {
9601        Py_UCS4 c = PyUnicode_READ(kind, data, i);
9602        Py_UCS4 mapped[3];
9603        int j, n_res = _PyUnicode_ToFoldedFull(c, mapped);
9604        for (j = 0; j < n_res; j++) {
9605            *maxchar = Py_MAX(*maxchar, mapped[j]);
9606            res[k++] = mapped[j];
9607        }
9608    }
9609    return k;
9610}
9611
9612static Py_ssize_t
9613do_title(int kind, void *data, Py_ssize_t length, Py_UCS4 *res, Py_UCS4 *maxchar)
9614{
9615    Py_ssize_t i, k = 0;
9616    int previous_is_cased;
9617
9618    previous_is_cased = 0;
9619    for (i = 0; i < length; i++) {
9620        const Py_UCS4 c = PyUnicode_READ(kind, data, i);
9621        Py_UCS4 mapped[3];
9622        int n_res, j;
9623
9624        if (previous_is_cased)
9625            n_res = lower_ucs4(kind, data, length, i, c, mapped);
9626        else
9627            n_res = _PyUnicode_ToTitleFull(c, mapped);
9628
9629        for (j = 0; j < n_res; j++) {
9630            *maxchar = Py_MAX(*maxchar, mapped[j]);
9631            res[k++] = mapped[j];
9632        }
9633
9634        previous_is_cased = _PyUnicode_IsCased(c);
9635    }
9636    return k;
9637}
9638
9639static PyObject *
9640case_operation(PyObject *self,
9641               Py_ssize_t (*perform)(int, void *, Py_ssize_t, Py_UCS4 *, Py_UCS4 *))
9642{
9643    PyObject *res = NULL;
9644    Py_ssize_t length, newlength = 0;
9645    int kind, outkind;
9646    void *data, *outdata;
9647    Py_UCS4 maxchar = 0, *tmp, *tmpend;
9648
9649    assert(PyUnicode_IS_READY(self));
9650
9651    kind = PyUnicode_KIND(self);
9652    data = PyUnicode_DATA(self);
9653    length = PyUnicode_GET_LENGTH(self);
9654    if ((size_t) length > PY_SSIZE_T_MAX / (3 * sizeof(Py_UCS4))) {
9655        PyErr_SetString(PyExc_OverflowError, "string is too long");
9656        return NULL;
9657    }
9658    tmp = PyMem_MALLOC(sizeof(Py_UCS4) * 3 * length);
9659    if (tmp == NULL)
9660        return PyErr_NoMemory();
9661    newlength = perform(kind, data, length, tmp, &maxchar);
9662    res = PyUnicode_New(newlength, maxchar);
9663    if (res == NULL)
9664        goto leave;
9665    tmpend = tmp + newlength;
9666    outdata = PyUnicode_DATA(res);
9667    outkind = PyUnicode_KIND(res);
9668    switch (outkind) {
9669    case PyUnicode_1BYTE_KIND:
9670        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS1, tmp, tmpend, outdata);
9671        break;
9672    case PyUnicode_2BYTE_KIND:
9673        _PyUnicode_CONVERT_BYTES(Py_UCS4, Py_UCS2, tmp, tmpend, outdata);
9674        break;
9675    case PyUnicode_4BYTE_KIND:
9676        memcpy(outdata, tmp, sizeof(Py_UCS4) * newlength);
9677        break;
9678    default:
9679        assert(0);
9680        break;
9681    }
9682  leave:
9683    PyMem_FREE(tmp);
9684    return res;
9685}
9686
9687PyObject *
9688PyUnicode_Join(PyObject *separator, PyObject *seq)
9689{
9690    PyObject *sep = NULL;
9691    Py_ssize_t seplen;
9692    PyObject *res = NULL; /* the result */
9693    PyObject *fseq;          /* PySequence_Fast(seq) */
9694    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
9695    PyObject **items;
9696    PyObject *item;
9697    Py_ssize_t sz, i, res_offset;
9698    Py_UCS4 maxchar;
9699    Py_UCS4 item_maxchar;
9700    int use_memcpy;
9701    unsigned char *res_data = NULL, *sep_data = NULL;
9702    PyObject *last_obj;
9703    unsigned int kind = 0;
9704
9705    fseq = PySequence_Fast(seq, "can only join an iterable");
9706    if (fseq == NULL) {
9707        return NULL;
9708    }
9709
9710    /* NOTE: the following code can't call back into Python code,
9711     * so we are sure that fseq won't be mutated.
9712     */
9713
9714    seqlen = PySequence_Fast_GET_SIZE(fseq);
9715    /* If empty sequence, return u"". */
9716    if (seqlen == 0) {
9717        Py_DECREF(fseq);
9718        _Py_RETURN_UNICODE_EMPTY();
9719    }
9720
9721    /* If singleton sequence with an exact Unicode, return that. */
9722    last_obj = NULL;
9723    items = PySequence_Fast_ITEMS(fseq);
9724    if (seqlen == 1) {
9725        if (PyUnicode_CheckExact(items[0])) {
9726            res = items[0];
9727            Py_INCREF(res);
9728            Py_DECREF(fseq);
9729            return res;
9730        }
9731        seplen = 0;
9732        maxchar = 0;
9733    }
9734    else {
9735        /* Set up sep and seplen */
9736        if (separator == NULL) {
9737            /* fall back to a blank space separator */
9738            sep = PyUnicode_FromOrdinal(' ');
9739            if (!sep)
9740                goto onError;
9741            seplen = 1;
9742            maxchar = 32;
9743        }
9744        else {
9745            if (!PyUnicode_Check(separator)) {
9746                PyErr_Format(PyExc_TypeError,
9747                             "separator: expected str instance,"
9748                             " %.80s found",
9749                             Py_TYPE(separator)->tp_name);
9750                goto onError;
9751            }
9752            if (PyUnicode_READY(separator))
9753                goto onError;
9754            sep = separator;
9755            seplen = PyUnicode_GET_LENGTH(separator);
9756            maxchar = PyUnicode_MAX_CHAR_VALUE(separator);
9757            /* inc refcount to keep this code path symmetric with the
9758               above case of a blank separator */
9759            Py_INCREF(sep);
9760        }
9761        last_obj = sep;
9762    }
9763
9764    /* There are at least two things to join, or else we have a subclass
9765     * of str in the sequence.
9766     * Do a pre-pass to figure out the total amount of space we'll
9767     * need (sz), and see whether all argument are strings.
9768     */
9769    sz = 0;
9770#ifdef Py_DEBUG
9771    use_memcpy = 0;
9772#else
9773    use_memcpy = 1;
9774#endif
9775    for (i = 0; i < seqlen; i++) {
9776        const Py_ssize_t old_sz = sz;
9777        item = items[i];
9778        if (!PyUnicode_Check(item)) {
9779            PyErr_Format(PyExc_TypeError,
9780                         "sequence item %zd: expected str instance,"
9781                         " %.80s found",
9782                         i, Py_TYPE(item)->tp_name);
9783            goto onError;
9784        }
9785        if (PyUnicode_READY(item) == -1)
9786            goto onError;
9787        sz += PyUnicode_GET_LENGTH(item);
9788        item_maxchar = PyUnicode_MAX_CHAR_VALUE(item);
9789        maxchar = Py_MAX(maxchar, item_maxchar);
9790        if (i != 0)
9791            sz += seplen;
9792        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
9793            PyErr_SetString(PyExc_OverflowError,
9794                            "join() result is too long for a Python string");
9795            goto onError;
9796        }
9797        if (use_memcpy && last_obj != NULL) {
9798            if (PyUnicode_KIND(last_obj) != PyUnicode_KIND(item))
9799                use_memcpy = 0;
9800        }
9801        last_obj = item;
9802    }
9803
9804    res = PyUnicode_New(sz, maxchar);
9805    if (res == NULL)
9806        goto onError;
9807
9808    /* Catenate everything. */
9809#ifdef Py_DEBUG
9810    use_memcpy = 0;
9811#else
9812    if (use_memcpy) {
9813        res_data = PyUnicode_1BYTE_DATA(res);
9814        kind = PyUnicode_KIND(res);
9815        if (seplen != 0)
9816            sep_data = PyUnicode_1BYTE_DATA(sep);
9817    }
9818#endif
9819    if (use_memcpy) {
9820        for (i = 0; i < seqlen; ++i) {
9821            Py_ssize_t itemlen;
9822            item = items[i];
9823
9824            /* Copy item, and maybe the separator. */
9825            if (i && seplen != 0) {
9826                Py_MEMCPY(res_data,
9827                          sep_data,
9828                          kind * seplen);
9829                res_data += kind * seplen;
9830            }
9831
9832            itemlen = PyUnicode_GET_LENGTH(item);
9833            if (itemlen != 0) {
9834                Py_MEMCPY(res_data,
9835                          PyUnicode_DATA(item),
9836                          kind * itemlen);
9837                res_data += kind * itemlen;
9838            }
9839        }
9840        assert(res_data == PyUnicode_1BYTE_DATA(res)
9841                           + kind * PyUnicode_GET_LENGTH(res));
9842    }
9843    else {
9844        for (i = 0, res_offset = 0; i < seqlen; ++i) {
9845            Py_ssize_t itemlen;
9846            item = items[i];
9847
9848            /* Copy item, and maybe the separator. */
9849            if (i && seplen != 0) {
9850                _PyUnicode_FastCopyCharacters(res, res_offset, sep, 0, seplen);
9851                res_offset += seplen;
9852            }
9853
9854            itemlen = PyUnicode_GET_LENGTH(item);
9855            if (itemlen != 0) {
9856                _PyUnicode_FastCopyCharacters(res, res_offset, item, 0, itemlen);
9857                res_offset += itemlen;
9858            }
9859        }
9860        assert(res_offset == PyUnicode_GET_LENGTH(res));
9861    }
9862
9863    Py_DECREF(fseq);
9864    Py_XDECREF(sep);
9865    assert(_PyUnicode_CheckConsistency(res, 1));
9866    return res;
9867
9868  onError:
9869    Py_DECREF(fseq);
9870    Py_XDECREF(sep);
9871    Py_XDECREF(res);
9872    return NULL;
9873}
9874
/* FILL(kind, data, value, start, length): store `value` into `length`
   consecutive characters of the buffer `data`, beginning at character
   index `start`.  `kind` selects the character width (1, 2 or 4 byte
   kind) and must not be PyUnicode_WCHAR_KIND.

   Every argument is parenthesised in the expansion so that expression
   arguments keep their meaning; note that `kind`, `value` and `length`
   may still be evaluated more than once. */
#define FILL(kind, data, value, start, length) \
    do { \
        Py_ssize_t i_ = 0; \
        assert((kind) != PyUnicode_WCHAR_KIND); \
        switch ((kind)) { \
        case PyUnicode_1BYTE_KIND: { \
            unsigned char * to_ = (unsigned char *)((data)) + (start); \
            memset(to_, (unsigned char)(value), (length)); \
            break; \
        } \
        case PyUnicode_2BYTE_KIND: { \
            Py_UCS2 * to_ = (Py_UCS2 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        case PyUnicode_4BYTE_KIND: { \
            Py_UCS4 * to_ = (Py_UCS4 *)((data)) + (start); \
            for (; i_ < (length); ++i_, ++to_) *to_ = (value); \
            break; \
        } \
        default: assert(0); \
        } \
    } while (0)
9898
9899void
9900_PyUnicode_FastFill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9901                    Py_UCS4 fill_char)
9902{
9903    const enum PyUnicode_Kind kind = PyUnicode_KIND(unicode);
9904    const void *data = PyUnicode_DATA(unicode);
9905    assert(PyUnicode_IS_READY(unicode));
9906    assert(unicode_modifiable(unicode));
9907    assert(fill_char <= PyUnicode_MAX_CHAR_VALUE(unicode));
9908    assert(start >= 0);
9909    assert(start + length <= PyUnicode_GET_LENGTH(unicode));
9910    FILL(kind, data, fill_char, start, length);
9911}
9912
9913Py_ssize_t
9914PyUnicode_Fill(PyObject *unicode, Py_ssize_t start, Py_ssize_t length,
9915               Py_UCS4 fill_char)
9916{
9917    Py_ssize_t maxlen;
9918
9919    if (!PyUnicode_Check(unicode)) {
9920        PyErr_BadInternalCall();
9921        return -1;
9922    }
9923    if (PyUnicode_READY(unicode) == -1)
9924        return -1;
9925    if (unicode_check_modifiable(unicode))
9926        return -1;
9927
9928    if (start < 0) {
9929        PyErr_SetString(PyExc_IndexError, "string index out of range");
9930        return -1;
9931    }
9932    if (fill_char > PyUnicode_MAX_CHAR_VALUE(unicode)) {
9933        PyErr_SetString(PyExc_ValueError,
9934                         "fill character is bigger than "
9935                         "the string maximum character");
9936        return -1;
9937    }
9938
9939    maxlen = PyUnicode_GET_LENGTH(unicode) - start;
9940    length = Py_MIN(maxlen, length);
9941    if (length <= 0)
9942        return 0;
9943
9944    _PyUnicode_FastFill(unicode, start, length, fill_char);
9945    return length;
9946}
9947
9948static PyObject *
9949pad(PyObject *self,
9950    Py_ssize_t left,
9951    Py_ssize_t right,
9952    Py_UCS4 fill)
9953{
9954    PyObject *u;
9955    Py_UCS4 maxchar;
9956    int kind;
9957    void *data;
9958
9959    if (left < 0)
9960        left = 0;
9961    if (right < 0)
9962        right = 0;
9963
9964    if (left == 0 && right == 0)
9965        return unicode_result_unchanged(self);
9966
9967    if (left > PY_SSIZE_T_MAX - _PyUnicode_LENGTH(self) ||
9968        right > PY_SSIZE_T_MAX - (left + _PyUnicode_LENGTH(self))) {
9969        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
9970        return NULL;
9971    }
9972    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
9973    maxchar = Py_MAX(maxchar, fill);
9974    u = PyUnicode_New(left + _PyUnicode_LENGTH(self) + right, maxchar);
9975    if (!u)
9976        return NULL;
9977
9978    kind = PyUnicode_KIND(u);
9979    data = PyUnicode_DATA(u);
9980    if (left)
9981        FILL(kind, data, fill, 0, left);
9982    if (right)
9983        FILL(kind, data, fill, left + _PyUnicode_LENGTH(self), right);
9984    _PyUnicode_FastCopyCharacters(u, left, self, 0, _PyUnicode_LENGTH(self));
9985    assert(_PyUnicode_CheckConsistency(u, 1));
9986    return u;
9987}
9988
9989PyObject *
9990PyUnicode_Splitlines(PyObject *string, int keepends)
9991{
9992    PyObject *list;
9993
9994    string = PyUnicode_FromObject(string);
9995    if (string == NULL)
9996        return NULL;
9997    if (PyUnicode_READY(string) == -1) {
9998        Py_DECREF(string);
9999        return NULL;
10000    }
10001
10002    switch (PyUnicode_KIND(string)) {
10003    case PyUnicode_1BYTE_KIND:
10004        if (PyUnicode_IS_ASCII(string))
10005            list = asciilib_splitlines(
10006                string, PyUnicode_1BYTE_DATA(string),
10007                PyUnicode_GET_LENGTH(string), keepends);
10008        else
10009            list = ucs1lib_splitlines(
10010                string, PyUnicode_1BYTE_DATA(string),
10011                PyUnicode_GET_LENGTH(string), keepends);
10012        break;
10013    case PyUnicode_2BYTE_KIND:
10014        list = ucs2lib_splitlines(
10015            string, PyUnicode_2BYTE_DATA(string),
10016            PyUnicode_GET_LENGTH(string), keepends);
10017        break;
10018    case PyUnicode_4BYTE_KIND:
10019        list = ucs4lib_splitlines(
10020            string, PyUnicode_4BYTE_DATA(string),
10021            PyUnicode_GET_LENGTH(string), keepends);
10022        break;
10023    default:
10024        assert(0);
10025        list = 0;
10026    }
10027    Py_DECREF(string);
10028    return list;
10029}
10030
10031static PyObject *
10032split(PyObject *self,
10033      PyObject *substring,
10034      Py_ssize_t maxcount)
10035{
10036    int kind1, kind2, kind;
10037    void *buf1, *buf2;
10038    Py_ssize_t len1, len2;
10039    PyObject* out;
10040
10041    if (maxcount < 0)
10042        maxcount = PY_SSIZE_T_MAX;
10043
10044    if (PyUnicode_READY(self) == -1)
10045        return NULL;
10046
10047    if (substring == NULL)
10048        switch (PyUnicode_KIND(self)) {
10049        case PyUnicode_1BYTE_KIND:
10050            if (PyUnicode_IS_ASCII(self))
10051                return asciilib_split_whitespace(
10052                    self,  PyUnicode_1BYTE_DATA(self),
10053                    PyUnicode_GET_LENGTH(self), maxcount
10054                    );
10055            else
10056                return ucs1lib_split_whitespace(
10057                    self,  PyUnicode_1BYTE_DATA(self),
10058                    PyUnicode_GET_LENGTH(self), maxcount
10059                    );
10060        case PyUnicode_2BYTE_KIND:
10061            return ucs2lib_split_whitespace(
10062                self,  PyUnicode_2BYTE_DATA(self),
10063                PyUnicode_GET_LENGTH(self), maxcount
10064                );
10065        case PyUnicode_4BYTE_KIND:
10066            return ucs4lib_split_whitespace(
10067                self,  PyUnicode_4BYTE_DATA(self),
10068                PyUnicode_GET_LENGTH(self), maxcount
10069                );
10070        default:
10071            assert(0);
10072            return NULL;
10073        }
10074
10075    if (PyUnicode_READY(substring) == -1)
10076        return NULL;
10077
10078    kind1 = PyUnicode_KIND(self);
10079    kind2 = PyUnicode_KIND(substring);
10080    kind = kind1 > kind2 ? kind1 : kind2;
10081    buf1 = PyUnicode_DATA(self);
10082    buf2 = PyUnicode_DATA(substring);
10083    if (kind1 != kind)
10084        buf1 = _PyUnicode_AsKind(self, kind);
10085    if (!buf1)
10086        return NULL;
10087    if (kind2 != kind)
10088        buf2 = _PyUnicode_AsKind(substring, kind);
10089    if (!buf2) {
10090        if (kind1 != kind) PyMem_Free(buf1);
10091        return NULL;
10092    }
10093    len1 = PyUnicode_GET_LENGTH(self);
10094    len2 = PyUnicode_GET_LENGTH(substring);
10095
10096    switch (kind) {
10097    case PyUnicode_1BYTE_KIND:
10098        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10099            out = asciilib_split(
10100                self,  buf1, len1, buf2, len2, maxcount);
10101        else
10102            out = ucs1lib_split(
10103                self,  buf1, len1, buf2, len2, maxcount);
10104        break;
10105    case PyUnicode_2BYTE_KIND:
10106        out = ucs2lib_split(
10107            self,  buf1, len1, buf2, len2, maxcount);
10108        break;
10109    case PyUnicode_4BYTE_KIND:
10110        out = ucs4lib_split(
10111            self,  buf1, len1, buf2, len2, maxcount);
10112        break;
10113    default:
10114        out = NULL;
10115    }
10116    if (kind1 != kind)
10117        PyMem_Free(buf1);
10118    if (kind2 != kind)
10119        PyMem_Free(buf2);
10120    return out;
10121}
10122
10123static PyObject *
10124rsplit(PyObject *self,
10125       PyObject *substring,
10126       Py_ssize_t maxcount)
10127{
10128    int kind1, kind2, kind;
10129    void *buf1, *buf2;
10130    Py_ssize_t len1, len2;
10131    PyObject* out;
10132
10133    if (maxcount < 0)
10134        maxcount = PY_SSIZE_T_MAX;
10135
10136    if (PyUnicode_READY(self) == -1)
10137        return NULL;
10138
10139    if (substring == NULL)
10140        switch (PyUnicode_KIND(self)) {
10141        case PyUnicode_1BYTE_KIND:
10142            if (PyUnicode_IS_ASCII(self))
10143                return asciilib_rsplit_whitespace(
10144                    self,  PyUnicode_1BYTE_DATA(self),
10145                    PyUnicode_GET_LENGTH(self), maxcount
10146                    );
10147            else
10148                return ucs1lib_rsplit_whitespace(
10149                    self,  PyUnicode_1BYTE_DATA(self),
10150                    PyUnicode_GET_LENGTH(self), maxcount
10151                    );
10152        case PyUnicode_2BYTE_KIND:
10153            return ucs2lib_rsplit_whitespace(
10154                self,  PyUnicode_2BYTE_DATA(self),
10155                PyUnicode_GET_LENGTH(self), maxcount
10156                );
10157        case PyUnicode_4BYTE_KIND:
10158            return ucs4lib_rsplit_whitespace(
10159                self,  PyUnicode_4BYTE_DATA(self),
10160                PyUnicode_GET_LENGTH(self), maxcount
10161                );
10162        default:
10163            assert(0);
10164            return NULL;
10165        }
10166
10167    if (PyUnicode_READY(substring) == -1)
10168        return NULL;
10169
10170    kind1 = PyUnicode_KIND(self);
10171    kind2 = PyUnicode_KIND(substring);
10172    kind = kind1 > kind2 ? kind1 : kind2;
10173    buf1 = PyUnicode_DATA(self);
10174    buf2 = PyUnicode_DATA(substring);
10175    if (kind1 != kind)
10176        buf1 = _PyUnicode_AsKind(self, kind);
10177    if (!buf1)
10178        return NULL;
10179    if (kind2 != kind)
10180        buf2 = _PyUnicode_AsKind(substring, kind);
10181    if (!buf2) {
10182        if (kind1 != kind) PyMem_Free(buf1);
10183        return NULL;
10184    }
10185    len1 = PyUnicode_GET_LENGTH(self);
10186    len2 = PyUnicode_GET_LENGTH(substring);
10187
10188    switch (kind) {
10189    case PyUnicode_1BYTE_KIND:
10190        if (PyUnicode_IS_ASCII(self) && PyUnicode_IS_ASCII(substring))
10191            out = asciilib_rsplit(
10192                self,  buf1, len1, buf2, len2, maxcount);
10193        else
10194            out = ucs1lib_rsplit(
10195                self,  buf1, len1, buf2, len2, maxcount);
10196        break;
10197    case PyUnicode_2BYTE_KIND:
10198        out = ucs2lib_rsplit(
10199            self,  buf1, len1, buf2, len2, maxcount);
10200        break;
10201    case PyUnicode_4BYTE_KIND:
10202        out = ucs4lib_rsplit(
10203            self,  buf1, len1, buf2, len2, maxcount);
10204        break;
10205    default:
10206        out = NULL;
10207    }
10208    if (kind1 != kind)
10209        PyMem_Free(buf1);
10210    if (kind2 != kind)
10211        PyMem_Free(buf2);
10212    return out;
10213}
10214
10215static Py_ssize_t
10216anylib_find(int kind, PyObject *str1, void *buf1, Py_ssize_t len1,
10217            PyObject *str2, void *buf2, Py_ssize_t len2, Py_ssize_t offset)
10218{
10219    switch (kind) {
10220    case PyUnicode_1BYTE_KIND:
10221        if (PyUnicode_IS_ASCII(str1) && PyUnicode_IS_ASCII(str2))
10222            return asciilib_find(buf1, len1, buf2, len2, offset);
10223        else
10224            return ucs1lib_find(buf1, len1, buf2, len2, offset);
10225    case PyUnicode_2BYTE_KIND:
10226        return ucs2lib_find(buf1, len1, buf2, len2, offset);
10227    case PyUnicode_4BYTE_KIND:
10228        return ucs4lib_find(buf1, len1, buf2, len2, offset);
10229    }
10230    assert(0);
10231    return -1;
10232}
10233
10234static Py_ssize_t
10235anylib_count(int kind, PyObject *sstr, void* sbuf, Py_ssize_t slen,
10236             PyObject *str1, void *buf1, Py_ssize_t len1, Py_ssize_t maxcount)
10237{
10238    switch (kind) {
10239    case PyUnicode_1BYTE_KIND:
10240        if (PyUnicode_IS_ASCII(sstr) && PyUnicode_IS_ASCII(str1))
10241            return asciilib_count(sbuf, slen, buf1, len1, maxcount);
10242        else
10243            return ucs1lib_count(sbuf, slen, buf1, len1, maxcount);
10244    case PyUnicode_2BYTE_KIND:
10245        return ucs2lib_count(sbuf, slen, buf1, len1, maxcount);
10246    case PyUnicode_4BYTE_KIND:
10247        return ucs4lib_count(sbuf, slen, buf1, len1, maxcount);
10248    }
10249    assert(0);
10250    return 0;
10251}
10252
10253static void
10254replace_1char_inplace(PyObject *u, Py_ssize_t pos,
10255                      Py_UCS4 u1, Py_UCS4 u2, Py_ssize_t maxcount)
10256{
10257    int kind = PyUnicode_KIND(u);
10258    void *data = PyUnicode_DATA(u);
10259    Py_ssize_t len = PyUnicode_GET_LENGTH(u);
10260    if (kind == PyUnicode_1BYTE_KIND) {
10261        ucs1lib_replace_1char_inplace((Py_UCS1 *)data + pos,
10262                                      (Py_UCS1 *)data + len,
10263                                      u1, u2, maxcount);
10264    }
10265    else if (kind == PyUnicode_2BYTE_KIND) {
10266        ucs2lib_replace_1char_inplace((Py_UCS2 *)data + pos,
10267                                      (Py_UCS2 *)data + len,
10268                                      u1, u2, maxcount);
10269    }
10270    else {
10271        assert(kind == PyUnicode_4BYTE_KIND);
10272        ucs4lib_replace_1char_inplace((Py_UCS4 *)data + pos,
10273                                      (Py_UCS4 *)data + len,
10274                                      u1, u2, maxcount);
10275    }
10276}
10277
10278static PyObject *
10279replace(PyObject *self, PyObject *str1,
10280        PyObject *str2, Py_ssize_t maxcount)
10281{
10282    PyObject *u;
10283    char *sbuf = PyUnicode_DATA(self);
10284    char *buf1 = PyUnicode_DATA(str1);
10285    char *buf2 = PyUnicode_DATA(str2);
10286    int srelease = 0, release1 = 0, release2 = 0;
10287    int skind = PyUnicode_KIND(self);
10288    int kind1 = PyUnicode_KIND(str1);
10289    int kind2 = PyUnicode_KIND(str2);
10290    Py_ssize_t slen = PyUnicode_GET_LENGTH(self);
10291    Py_ssize_t len1 = PyUnicode_GET_LENGTH(str1);
10292    Py_ssize_t len2 = PyUnicode_GET_LENGTH(str2);
10293    int mayshrink;
10294    Py_UCS4 maxchar, maxchar_str1, maxchar_str2;
10295
10296    if (maxcount < 0)
10297        maxcount = PY_SSIZE_T_MAX;
10298    else if (maxcount == 0 || slen == 0)
10299        goto nothing;
10300
10301    if (str1 == str2)
10302        goto nothing;
10303
10304    maxchar = PyUnicode_MAX_CHAR_VALUE(self);
10305    maxchar_str1 = PyUnicode_MAX_CHAR_VALUE(str1);
10306    if (maxchar < maxchar_str1)
10307        /* substring too wide to be present */
10308        goto nothing;
10309    maxchar_str2 = PyUnicode_MAX_CHAR_VALUE(str2);
10310    /* Replacing str1 with str2 may cause a maxchar reduction in the
10311       result string. */
10312    mayshrink = (maxchar_str2 < maxchar_str1) && (maxchar == maxchar_str1);
10313    maxchar = Py_MAX(maxchar, maxchar_str2);
10314
10315    if (len1 == len2) {
10316        /* same length */
10317        if (len1 == 0)
10318            goto nothing;
10319        if (len1 == 1) {
10320            /* replace characters */
10321            Py_UCS4 u1, u2;
10322            Py_ssize_t pos;
10323
10324            u1 = PyUnicode_READ(kind1, buf1, 0);
10325            pos = findchar(sbuf, skind, slen, u1, 1);
10326            if (pos < 0)
10327                goto nothing;
10328            u2 = PyUnicode_READ(kind2, buf2, 0);
10329            u = PyUnicode_New(slen, maxchar);
10330            if (!u)
10331                goto error;
10332
10333            _PyUnicode_FastCopyCharacters(u, 0, self, 0, slen);
10334            replace_1char_inplace(u, pos, u1, u2, maxcount);
10335        }
10336        else {
10337            int rkind = skind;
10338            char *res;
10339            Py_ssize_t i;
10340
10341            if (kind1 < rkind) {
10342                /* widen substring */
10343                buf1 = _PyUnicode_AsKind(str1, rkind);
10344                if (!buf1) goto error;
10345                release1 = 1;
10346            }
10347            i = anylib_find(rkind, self, sbuf, slen, str1, buf1, len1, 0);
10348            if (i < 0)
10349                goto nothing;
10350            if (rkind > kind2) {
10351                /* widen replacement */
10352                buf2 = _PyUnicode_AsKind(str2, rkind);
10353                if (!buf2) goto error;
10354                release2 = 1;
10355            }
10356            else if (rkind < kind2) {
10357                /* widen self and buf1 */
10358                rkind = kind2;
10359                if (release1) PyMem_Free(buf1);
10360                release1 = 0;
10361                sbuf = _PyUnicode_AsKind(self, rkind);
10362                if (!sbuf) goto error;
10363                srelease = 1;
10364                buf1 = _PyUnicode_AsKind(str1, rkind);
10365                if (!buf1) goto error;
10366                release1 = 1;
10367            }
10368            u = PyUnicode_New(slen, maxchar);
10369            if (!u)
10370                goto error;
10371            assert(PyUnicode_KIND(u) == rkind);
10372            res = PyUnicode_DATA(u);
10373
10374            memcpy(res, sbuf, rkind * slen);
10375            /* change everything in-place, starting with this one */
10376            memcpy(res + rkind * i,
10377                   buf2,
10378                   rkind * len2);
10379            i += len1;
10380
10381            while ( --maxcount > 0) {
10382                i = anylib_find(rkind, self,
10383                                sbuf+rkind*i, slen-i,
10384                                str1, buf1, len1, i);
10385                if (i == -1)
10386                    break;
10387                memcpy(res + rkind * i,
10388                       buf2,
10389                       rkind * len2);
10390                i += len1;
10391            }
10392        }
10393    }
10394    else {
10395        Py_ssize_t n, i, j, ires;
10396        Py_ssize_t new_size;
10397        int rkind = skind;
10398        char *res;
10399
10400        if (kind1 < rkind) {
10401            /* widen substring */
10402            buf1 = _PyUnicode_AsKind(str1, rkind);
10403            if (!buf1) goto error;
10404            release1 = 1;
10405        }
10406        n = anylib_count(rkind, self, sbuf, slen, str1, buf1, len1, maxcount);
10407        if (n == 0)
10408            goto nothing;
10409        if (kind2 < rkind) {
10410            /* widen replacement */
10411            buf2 = _PyUnicode_AsKind(str2, rkind);
10412            if (!buf2) goto error;
10413            release2 = 1;
10414        }
10415        else if (kind2 > rkind) {
10416            /* widen self and buf1 */
10417            rkind = kind2;
10418            sbuf = _PyUnicode_AsKind(self, rkind);
10419            if (!sbuf) goto error;
10420            srelease = 1;
10421            if (release1) PyMem_Free(buf1);
10422            release1 = 0;
10423            buf1 = _PyUnicode_AsKind(str1, rkind);
10424            if (!buf1) goto error;
10425            release1 = 1;
10426        }
10427        /* new_size = PyUnicode_GET_LENGTH(self) + n * (PyUnicode_GET_LENGTH(str2) -
10428           PyUnicode_GET_LENGTH(str1))); */
10429        if (len2 > len1 && len2 - len1 > (PY_SSIZE_T_MAX - slen) / n) {
10430                PyErr_SetString(PyExc_OverflowError,
10431                                "replace string is too long");
10432                goto error;
10433        }
10434        new_size = slen + n * (len2 - len1);
10435        if (new_size == 0) {
10436            _Py_INCREF_UNICODE_EMPTY();
10437            if (!unicode_empty)
10438                goto error;
10439            u = unicode_empty;
10440            goto done;
10441        }
10442        if (new_size > (PY_SSIZE_T_MAX >> (rkind-1))) {
10443            PyErr_SetString(PyExc_OverflowError,
10444                            "replace string is too long");
10445            goto error;
10446        }
10447        u = PyUnicode_New(new_size, maxchar);
10448        if (!u)
10449            goto error;
10450        assert(PyUnicode_KIND(u) == rkind);
10451        res = PyUnicode_DATA(u);
10452        ires = i = 0;
10453        if (len1 > 0) {
10454            while (n-- > 0) {
10455                /* look for next match */
10456                j = anylib_find(rkind, self,
10457                                sbuf + rkind * i, slen-i,
10458                                str1, buf1, len1, i);
10459                if (j == -1)
10460                    break;
10461                else if (j > i) {
10462                    /* copy unchanged part [i:j] */
10463                    memcpy(res + rkind * ires,
10464                           sbuf + rkind * i,
10465                           rkind * (j-i));
10466                    ires += j - i;
10467                }
10468                /* copy substitution string */
10469                if (len2 > 0) {
10470                    memcpy(res + rkind * ires,
10471                           buf2,
10472                           rkind * len2);
10473                    ires += len2;
10474                }
10475                i = j + len1;
10476            }
10477            if (i < slen)
10478                /* copy tail [i:] */
10479                memcpy(res + rkind * ires,
10480                       sbuf + rkind * i,
10481                       rkind * (slen-i));
10482        }
10483        else {
10484            /* interleave */
10485            while (n > 0) {
10486                memcpy(res + rkind * ires,
10487                       buf2,
10488                       rkind * len2);
10489                ires += len2;
10490                if (--n <= 0)
10491                    break;
10492                memcpy(res + rkind * ires,
10493                       sbuf + rkind * i,
10494                       rkind);
10495                ires++;
10496                i++;
10497            }
10498            memcpy(res + rkind * ires,
10499                   sbuf + rkind * i,
10500                   rkind * (slen-i));
10501        }
10502    }
10503
10504    if (mayshrink) {
10505        unicode_adjust_maxchar(&u);
10506        if (u == NULL)
10507            goto error;
10508    }
10509
10510  done:
10511    if (srelease)
10512        PyMem_FREE(sbuf);
10513    if (release1)
10514        PyMem_FREE(buf1);
10515    if (release2)
10516        PyMem_FREE(buf2);
10517    assert(_PyUnicode_CheckConsistency(u, 1));
10518    return u;
10519
10520  nothing:
10521    /* nothing to replace; return original string (when possible) */
10522    if (srelease)
10523        PyMem_FREE(sbuf);
10524    if (release1)
10525        PyMem_FREE(buf1);
10526    if (release2)
10527        PyMem_FREE(buf2);
10528    return unicode_result_unchanged(self);
10529
10530  error:
10531    if (srelease && sbuf)
10532        PyMem_FREE(sbuf);
10533    if (release1 && buf1)
10534        PyMem_FREE(buf1);
10535    if (release2 && buf2)
10536        PyMem_FREE(buf2);
10537    return NULL;
10538}
10539
10540/* --- Unicode Object Methods --------------------------------------------- */
10541
10542PyDoc_STRVAR(title__doc__,
10543             "S.title() -> str\n\
10544\n\
10545Return a titlecased version of S, i.e. words start with title case\n\
10546characters, all remaining cased characters have lower case.");
10547
10548static PyObject*
10549unicode_title(PyObject *self)
10550{
10551    if (PyUnicode_READY(self) == -1)
10552        return NULL;
10553    return case_operation(self, do_title);
10554}
10555
10556PyDoc_STRVAR(capitalize__doc__,
10557             "S.capitalize() -> str\n\
10558\n\
10559Return a capitalized version of S, i.e. make the first character\n\
10560have upper case and the rest lower case.");
10561
10562static PyObject*
10563unicode_capitalize(PyObject *self)
10564{
10565    if (PyUnicode_READY(self) == -1)
10566        return NULL;
10567    if (PyUnicode_GET_LENGTH(self) == 0)
10568        return unicode_result_unchanged(self);
10569    return case_operation(self, do_capitalize);
10570}
10571
10572PyDoc_STRVAR(casefold__doc__,
10573             "S.casefold() -> str\n\
10574\n\
10575Return a version of S suitable for caseless comparisons.");
10576
10577static PyObject *
10578unicode_casefold(PyObject *self)
10579{
10580    if (PyUnicode_READY(self) == -1)
10581        return NULL;
10582    if (PyUnicode_IS_ASCII(self))
10583        return ascii_upper_or_lower(self, 1);
10584    return case_operation(self, do_casefold);
10585}
10586
10587
10588/* Argument converter.  Coerces to a single unicode character */
10589
10590static int
10591convert_uc(PyObject *obj, void *addr)
10592{
10593    Py_UCS4 *fillcharloc = (Py_UCS4 *)addr;
10594    PyObject *uniobj;
10595
10596    uniobj = PyUnicode_FromObject(obj);
10597    if (uniobj == NULL) {
10598        PyErr_SetString(PyExc_TypeError,
10599                        "The fill character cannot be converted to Unicode");
10600        return 0;
10601    }
10602    if (PyUnicode_GET_LENGTH(uniobj) != 1) {
10603        PyErr_SetString(PyExc_TypeError,
10604                        "The fill character must be exactly one character long");
10605        Py_DECREF(uniobj);
10606        return 0;
10607    }
10608    *fillcharloc = PyUnicode_READ_CHAR(uniobj, 0);
10609    Py_DECREF(uniobj);
10610    return 1;
10611}
10612
10613PyDoc_STRVAR(center__doc__,
10614             "S.center(width[, fillchar]) -> str\n\
10615\n\
10616Return S centered in a string of length width. Padding is\n\
10617done using the specified fill character (default is a space)");
10618
10619static PyObject *
10620unicode_center(PyObject *self, PyObject *args)
10621{
10622    Py_ssize_t marg, left;
10623    Py_ssize_t width;
10624    Py_UCS4 fillchar = ' ';
10625
10626    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
10627        return NULL;
10628
10629    if (PyUnicode_READY(self) == -1)
10630        return NULL;
10631
10632    if (PyUnicode_GET_LENGTH(self) >= width)
10633        return unicode_result_unchanged(self);
10634
10635    marg = width - PyUnicode_GET_LENGTH(self);
10636    left = marg / 2 + (marg & width & 1);
10637
10638    return pad(self, left, marg - left, fillchar);
10639}
10640
10641/* This function assumes that str1 and str2 are readied by the caller. */
10642
10643static int
10644unicode_compare(PyObject *str1, PyObject *str2)
10645{
10646#define COMPARE(TYPE1, TYPE2) \
10647    do { \
10648        TYPE1* p1 = (TYPE1 *)data1; \
10649        TYPE2* p2 = (TYPE2 *)data2; \
10650        TYPE1* end = p1 + len; \
10651        Py_UCS4 c1, c2; \
10652        for (; p1 != end; p1++, p2++) { \
10653            c1 = *p1; \
10654            c2 = *p2; \
10655            if (c1 != c2) \
10656                return (c1 < c2) ? -1 : 1; \
10657        } \
10658    } \
10659    while (0)
10660
10661    int kind1, kind2;
10662    void *data1, *data2;
10663    Py_ssize_t len1, len2, len;
10664
10665    kind1 = PyUnicode_KIND(str1);
10666    kind2 = PyUnicode_KIND(str2);
10667    data1 = PyUnicode_DATA(str1);
10668    data2 = PyUnicode_DATA(str2);
10669    len1 = PyUnicode_GET_LENGTH(str1);
10670    len2 = PyUnicode_GET_LENGTH(str2);
10671    len = Py_MIN(len1, len2);
10672
10673    switch(kind1) {
10674    case PyUnicode_1BYTE_KIND:
10675    {
10676        switch(kind2) {
10677        case PyUnicode_1BYTE_KIND:
10678        {
10679            int cmp = memcmp(data1, data2, len);
10680            /* normalize result of memcmp() into the range [-1; 1] */
10681            if (cmp < 0)
10682                return -1;
10683            if (cmp > 0)
10684                return 1;
10685            break;
10686        }
10687        case PyUnicode_2BYTE_KIND:
10688            COMPARE(Py_UCS1, Py_UCS2);
10689            break;
10690        case PyUnicode_4BYTE_KIND:
10691            COMPARE(Py_UCS1, Py_UCS4);
10692            break;
10693        default:
10694            assert(0);
10695        }
10696        break;
10697    }
10698    case PyUnicode_2BYTE_KIND:
10699    {
10700        switch(kind2) {
10701        case PyUnicode_1BYTE_KIND:
10702            COMPARE(Py_UCS2, Py_UCS1);
10703            break;
10704        case PyUnicode_2BYTE_KIND:
10705        {
10706            COMPARE(Py_UCS2, Py_UCS2);
10707            break;
10708        }
10709        case PyUnicode_4BYTE_KIND:
10710            COMPARE(Py_UCS2, Py_UCS4);
10711            break;
10712        default:
10713            assert(0);
10714        }
10715        break;
10716    }
10717    case PyUnicode_4BYTE_KIND:
10718    {
10719        switch(kind2) {
10720        case PyUnicode_1BYTE_KIND:
10721            COMPARE(Py_UCS4, Py_UCS1);
10722            break;
10723        case PyUnicode_2BYTE_KIND:
10724            COMPARE(Py_UCS4, Py_UCS2);
10725            break;
10726        case PyUnicode_4BYTE_KIND:
10727        {
10728#if defined(HAVE_WMEMCMP) && SIZEOF_WCHAR_T == 4
10729            int cmp = wmemcmp((wchar_t *)data1, (wchar_t *)data2, len);
10730            /* normalize result of wmemcmp() into the range [-1; 1] */
10731            if (cmp < 0)
10732                return -1;
10733            if (cmp > 0)
10734                return 1;
10735#else
10736            COMPARE(Py_UCS4, Py_UCS4);
10737#endif
10738            break;
10739        }
10740        default:
10741            assert(0);
10742        }
10743        break;
10744    }
10745    default:
10746        assert(0);
10747    }
10748
10749    if (len1 == len2)
10750        return 0;
10751    if (len1 < len2)
10752        return -1;
10753    else
10754        return 1;
10755
10756#undef COMPARE
10757}
10758
10759Py_LOCAL(int)
10760unicode_compare_eq(PyObject *str1, PyObject *str2)
10761{
10762    int kind;
10763    void *data1, *data2;
10764    Py_ssize_t len;
10765    int cmp;
10766
10767    len = PyUnicode_GET_LENGTH(str1);
10768    if (PyUnicode_GET_LENGTH(str2) != len)
10769        return 0;
10770    kind = PyUnicode_KIND(str1);
10771    if (PyUnicode_KIND(str2) != kind)
10772        return 0;
10773    data1 = PyUnicode_DATA(str1);
10774    data2 = PyUnicode_DATA(str2);
10775
10776    cmp = memcmp(data1, data2, len * kind);
10777    return (cmp == 0);
10778}
10779
10780
10781int
10782PyUnicode_Compare(PyObject *left, PyObject *right)
10783{
10784    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
10785        if (PyUnicode_READY(left) == -1 ||
10786            PyUnicode_READY(right) == -1)
10787            return -1;
10788
10789        /* a string is equal to itself */
10790        if (left == right)
10791            return 0;
10792
10793        return unicode_compare(left, right);
10794    }
10795    PyErr_Format(PyExc_TypeError,
10796                 "Can't compare %.100s and %.100s",
10797                 left->ob_type->tp_name,
10798                 right->ob_type->tp_name);
10799    return -1;
10800}
10801
10802int
10803_PyUnicode_CompareWithId(PyObject *left, _Py_Identifier *right)
10804{
10805    PyObject *right_str = _PyUnicode_FromId(right);   /* borrowed */
10806    if (right_str == NULL)
10807        return -1;
10808    return PyUnicode_Compare(left, right_str);
10809}
10810
10811int
10812PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
10813{
10814    Py_ssize_t i;
10815    int kind;
10816    Py_UCS4 chr;
10817
10818    assert(_PyUnicode_CHECK(uni));
10819    if (PyUnicode_READY(uni) == -1)
10820        return -1;
10821    kind = PyUnicode_KIND(uni);
10822    if (kind == PyUnicode_1BYTE_KIND) {
10823        const void *data = PyUnicode_1BYTE_DATA(uni);
10824        size_t len1 = (size_t)PyUnicode_GET_LENGTH(uni);
10825        size_t len, len2 = strlen(str);
10826        int cmp;
10827
10828        len = Py_MIN(len1, len2);
10829        cmp = memcmp(data, str, len);
10830        if (cmp != 0) {
10831            if (cmp < 0)
10832                return -1;
10833            else
10834                return 1;
10835        }
10836        if (len1 > len2)
10837            return 1; /* uni is longer */
10838        if (len2 > len1)
10839            return -1; /* str is longer */
10840        return 0;
10841    }
10842    else {
10843        void *data = PyUnicode_DATA(uni);
10844        /* Compare Unicode string and source character set string */
10845        for (i = 0; (chr = PyUnicode_READ(kind, data, i)) && str[i]; i++)
10846            if (chr != str[i])
10847                return (chr < (unsigned char)(str[i])) ? -1 : 1;
10848        /* This check keeps Python strings that end in '\0' from comparing equal
10849         to C strings identical up to that point. */
10850        if (PyUnicode_GET_LENGTH(uni) != i || chr)
10851            return 1; /* uni is longer */
10852        if (str[i])
10853            return -1; /* str is longer */
10854        return 0;
10855    }
10856}
10857
10858
10859#define TEST_COND(cond)                         \
10860    ((cond) ? Py_True : Py_False)
10861
10862PyObject *
10863PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
10864{
10865    int result;
10866    PyObject *v;
10867
10868    if (!PyUnicode_Check(left) || !PyUnicode_Check(right))
10869        Py_RETURN_NOTIMPLEMENTED;
10870
10871    if (PyUnicode_READY(left) == -1 ||
10872        PyUnicode_READY(right) == -1)
10873        return NULL;
10874
10875    if (left == right) {
10876        switch (op) {
10877        case Py_EQ:
10878        case Py_LE:
10879        case Py_GE:
10880            /* a string is equal to itself */
10881            v = Py_True;
10882            break;
10883        case Py_NE:
10884        case Py_LT:
10885        case Py_GT:
10886            v = Py_False;
10887            break;
10888        default:
10889            PyErr_BadArgument();
10890            return NULL;
10891        }
10892    }
10893    else if (op == Py_EQ || op == Py_NE) {
10894        result = unicode_compare_eq(left, right);
10895        result ^= (op == Py_NE);
10896        v = TEST_COND(result);
10897    }
10898    else {
10899        result = unicode_compare(left, right);
10900
10901        /* Convert the return value to a Boolean */
10902        switch (op) {
10903        case Py_LE:
10904            v = TEST_COND(result <= 0);
10905            break;
10906        case Py_GE:
10907            v = TEST_COND(result >= 0);
10908            break;
10909        case Py_LT:
10910            v = TEST_COND(result == -1);
10911            break;
10912        case Py_GT:
10913            v = TEST_COND(result == 1);
10914            break;
10915        default:
10916            PyErr_BadArgument();
10917            return NULL;
10918        }
10919    }
10920    Py_INCREF(v);
10921    return v;
10922}
10923
10924int
10925PyUnicode_Contains(PyObject *container, PyObject *element)
10926{
10927    PyObject *str, *sub;
10928    int kind1, kind2;
10929    void *buf1, *buf2;
10930    Py_ssize_t len1, len2;
10931    int result;
10932
10933    /* Coerce the two arguments */
10934    sub = PyUnicode_FromObject(element);
10935    if (!sub) {
10936        PyErr_Format(PyExc_TypeError,
10937                     "'in <string>' requires string as left operand, not %s",
10938                     element->ob_type->tp_name);
10939        return -1;
10940    }
10941
10942    str = PyUnicode_FromObject(container);
10943    if (!str) {
10944        Py_DECREF(sub);
10945        return -1;
10946    }
10947
10948    kind1 = PyUnicode_KIND(str);
10949    kind2 = PyUnicode_KIND(sub);
10950    buf1 = PyUnicode_DATA(str);
10951    buf2 = PyUnicode_DATA(sub);
10952    if (kind2 != kind1) {
10953        if (kind2 > kind1) {
10954            Py_DECREF(sub);
10955            Py_DECREF(str);
10956            return 0;
10957        }
10958        buf2 = _PyUnicode_AsKind(sub, kind1);
10959    }
10960    if (!buf2) {
10961        Py_DECREF(sub);
10962        Py_DECREF(str);
10963        return -1;
10964    }
10965    len1 = PyUnicode_GET_LENGTH(str);
10966    len2 = PyUnicode_GET_LENGTH(sub);
10967
10968    switch (kind1) {
10969    case PyUnicode_1BYTE_KIND:
10970        result = ucs1lib_find(buf1, len1, buf2, len2, 0) != -1;
10971        break;
10972    case PyUnicode_2BYTE_KIND:
10973        result = ucs2lib_find(buf1, len1, buf2, len2, 0) != -1;
10974        break;
10975    case PyUnicode_4BYTE_KIND:
10976        result = ucs4lib_find(buf1, len1, buf2, len2, 0) != -1;
10977        break;
10978    default:
10979        result = -1;
10980        assert(0);
10981    }
10982
10983    Py_DECREF(str);
10984    Py_DECREF(sub);
10985
10986    if (kind2 != kind1)
10987        PyMem_Free(buf2);
10988
10989    return result;
10990}
10991
10992/* Concat to string or Unicode object giving a new Unicode object. */
10993
10994PyObject *
10995PyUnicode_Concat(PyObject *left, PyObject *right)
10996{
10997    PyObject *u = NULL, *v = NULL, *w;
10998    Py_UCS4 maxchar, maxchar2;
10999    Py_ssize_t u_len, v_len, new_len;
11000
11001    /* Coerce the two arguments */
11002    u = PyUnicode_FromObject(left);
11003    if (u == NULL)
11004        goto onError;
11005    v = PyUnicode_FromObject(right);
11006    if (v == NULL)
11007        goto onError;
11008
11009    /* Shortcuts */
11010    if (v == unicode_empty) {
11011        Py_DECREF(v);
11012        return u;
11013    }
11014    if (u == unicode_empty) {
11015        Py_DECREF(u);
11016        return v;
11017    }
11018
11019    u_len = PyUnicode_GET_LENGTH(u);
11020    v_len = PyUnicode_GET_LENGTH(v);
11021    if (u_len > PY_SSIZE_T_MAX - v_len) {
11022        PyErr_SetString(PyExc_OverflowError,
11023                        "strings are too large to concat");
11024        goto onError;
11025    }
11026    new_len = u_len + v_len;
11027
11028    maxchar = PyUnicode_MAX_CHAR_VALUE(u);
11029    maxchar2 = PyUnicode_MAX_CHAR_VALUE(v);
11030    maxchar = Py_MAX(maxchar, maxchar2);
11031
11032    /* Concat the two Unicode strings */
11033    w = PyUnicode_New(new_len, maxchar);
11034    if (w == NULL)
11035        goto onError;
11036    _PyUnicode_FastCopyCharacters(w, 0, u, 0, u_len);
11037    _PyUnicode_FastCopyCharacters(w, u_len, v, 0, v_len);
11038    Py_DECREF(u);
11039    Py_DECREF(v);
11040    assert(_PyUnicode_CheckConsistency(w, 1));
11041    return w;
11042
11043  onError:
11044    Py_XDECREF(u);
11045    Py_XDECREF(v);
11046    return NULL;
11047}
11048
11049void
11050PyUnicode_Append(PyObject **p_left, PyObject *right)
11051{
11052    PyObject *left, *res;
11053    Py_UCS4 maxchar, maxchar2;
11054    Py_ssize_t left_len, right_len, new_len;
11055
11056    if (p_left == NULL) {
11057        if (!PyErr_Occurred())
11058            PyErr_BadInternalCall();
11059        return;
11060    }
11061    left = *p_left;
11062    if (right == NULL || left == NULL
11063        || !PyUnicode_Check(left) || !PyUnicode_Check(right)) {
11064        if (!PyErr_Occurred())
11065            PyErr_BadInternalCall();
11066        goto error;
11067    }
11068
11069    if (PyUnicode_READY(left) == -1)
11070        goto error;
11071    if (PyUnicode_READY(right) == -1)
11072        goto error;
11073
11074    /* Shortcuts */
11075    if (left == unicode_empty) {
11076        Py_DECREF(left);
11077        Py_INCREF(right);
11078        *p_left = right;
11079        return;
11080    }
11081    if (right == unicode_empty)
11082        return;
11083
11084    left_len = PyUnicode_GET_LENGTH(left);
11085    right_len = PyUnicode_GET_LENGTH(right);
11086    if (left_len > PY_SSIZE_T_MAX - right_len) {
11087        PyErr_SetString(PyExc_OverflowError,
11088                        "strings are too large to concat");
11089        goto error;
11090    }
11091    new_len = left_len + right_len;
11092
11093    if (unicode_modifiable(left)
11094        && PyUnicode_CheckExact(right)
11095        && PyUnicode_KIND(right) <= PyUnicode_KIND(left)
11096        /* Don't resize for ascii += latin1. Convert ascii to latin1 requires
11097           to change the structure size, but characters are stored just after
11098           the structure, and so it requires to move all characters which is
11099           not so different than duplicating the string. */
11100        && !(PyUnicode_IS_ASCII(left) && !PyUnicode_IS_ASCII(right)))
11101    {
11102        /* append inplace */
11103        if (unicode_resize(p_left, new_len) != 0)
11104            goto error;
11105
11106        /* copy 'right' into the newly allocated area of 'left' */
11107        _PyUnicode_FastCopyCharacters(*p_left, left_len, right, 0, right_len);
11108    }
11109    else {
11110        maxchar = PyUnicode_MAX_CHAR_VALUE(left);
11111        maxchar2 = PyUnicode_MAX_CHAR_VALUE(right);
11112        maxchar = Py_MAX(maxchar, maxchar2);
11113
11114        /* Concat the two Unicode strings */
11115        res = PyUnicode_New(new_len, maxchar);
11116        if (res == NULL)
11117            goto error;
11118        _PyUnicode_FastCopyCharacters(res, 0, left, 0, left_len);
11119        _PyUnicode_FastCopyCharacters(res, left_len, right, 0, right_len);
11120        Py_DECREF(left);
11121        *p_left = res;
11122    }
11123    assert(_PyUnicode_CheckConsistency(*p_left, 1));
11124    return;
11125
11126error:
11127    Py_CLEAR(*p_left);
11128}
11129
11130void
11131PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
11132{
11133    PyUnicode_Append(pleft, right);
11134    Py_XDECREF(right);
11135}
11136
11137PyDoc_STRVAR(count__doc__,
11138             "S.count(sub[, start[, end]]) -> int\n\
11139\n\
11140Return the number of non-overlapping occurrences of substring sub in\n\
11141string S[start:end].  Optional arguments start and end are\n\
11142interpreted as in slice notation.");
11143
11144static PyObject *
11145unicode_count(PyObject *self, PyObject *args)
11146{
11147    PyObject *substring = NULL;
11148    Py_ssize_t start = 0;
11149    Py_ssize_t end = PY_SSIZE_T_MAX;
11150    PyObject *result;
11151    int kind1, kind2, kind;
11152    void *buf1, *buf2;
11153    Py_ssize_t len1, len2, iresult;
11154
11155    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
11156                                            &start, &end))
11157        return NULL;
11158
11159    kind1 = PyUnicode_KIND(self);
11160    kind2 = PyUnicode_KIND(substring);
11161    if (kind2 > kind1) {
11162        Py_DECREF(substring);
11163        return PyLong_FromLong(0);
11164    }
11165    kind = kind1;
11166    buf1 = PyUnicode_DATA(self);
11167    buf2 = PyUnicode_DATA(substring);
11168    if (kind2 != kind)
11169        buf2 = _PyUnicode_AsKind(substring, kind);
11170    if (!buf2) {
11171        Py_DECREF(substring);
11172        return NULL;
11173    }
11174    len1 = PyUnicode_GET_LENGTH(self);
11175    len2 = PyUnicode_GET_LENGTH(substring);
11176
11177    ADJUST_INDICES(start, end, len1);
11178    switch (kind) {
11179    case PyUnicode_1BYTE_KIND:
11180        iresult = ucs1lib_count(
11181            ((Py_UCS1*)buf1) + start, end - start,
11182            buf2, len2, PY_SSIZE_T_MAX
11183            );
11184        break;
11185    case PyUnicode_2BYTE_KIND:
11186        iresult = ucs2lib_count(
11187            ((Py_UCS2*)buf1) + start, end - start,
11188            buf2, len2, PY_SSIZE_T_MAX
11189            );
11190        break;
11191    case PyUnicode_4BYTE_KIND:
11192        iresult = ucs4lib_count(
11193            ((Py_UCS4*)buf1) + start, end - start,
11194            buf2, len2, PY_SSIZE_T_MAX
11195            );
11196        break;
11197    default:
11198        assert(0); iresult = 0;
11199    }
11200
11201    result = PyLong_FromSsize_t(iresult);
11202
11203    if (kind2 != kind)
11204        PyMem_Free(buf2);
11205
11206    Py_DECREF(substring);
11207
11208    return result;
11209}
11210
11211PyDoc_STRVAR(encode__doc__,
11212             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
11213\n\
11214Encode S using the codec registered for encoding. Default encoding\n\
11215is 'utf-8'. errors may be given to set a different error\n\
11216handling scheme. Default is 'strict' meaning that encoding errors raise\n\
11217a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
11218'xmlcharrefreplace' as well as any other name registered with\n\
11219codecs.register_error that can handle UnicodeEncodeErrors.");
11220
11221static PyObject *
11222unicode_encode(PyObject *self, PyObject *args, PyObject *kwargs)
11223{
11224    static char *kwlist[] = {"encoding", "errors", 0};
11225    char *encoding = NULL;
11226    char *errors = NULL;
11227
11228    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
11229                                     kwlist, &encoding, &errors))
11230        return NULL;
11231    return PyUnicode_AsEncodedString(self, encoding, errors);
11232}
11233
11234PyDoc_STRVAR(expandtabs__doc__,
11235             "S.expandtabs(tabsize=8) -> str\n\
11236\n\
11237Return a copy of S where all tab characters are expanded using spaces.\n\
11238If tabsize is not given, a tab size of 8 characters is assumed.");
11239
11240static PyObject*
11241unicode_expandtabs(PyObject *self, PyObject *args, PyObject *kwds)
11242{
11243    Py_ssize_t i, j, line_pos, src_len, incr;
11244    Py_UCS4 ch;
11245    PyObject *u;
11246    void *src_data, *dest_data;
11247    static char *kwlist[] = {"tabsize", 0};
11248    int tabsize = 8;
11249    int kind;
11250    int found;
11251
11252    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:expandtabs",
11253                                     kwlist, &tabsize))
11254        return NULL;
11255
11256    if (PyUnicode_READY(self) == -1)
11257        return NULL;
11258
11259    /* First pass: determine size of output string */
11260    src_len = PyUnicode_GET_LENGTH(self);
11261    i = j = line_pos = 0;
11262    kind = PyUnicode_KIND(self);
11263    src_data = PyUnicode_DATA(self);
11264    found = 0;
11265    for (; i < src_len; i++) {
11266        ch = PyUnicode_READ(kind, src_data, i);
11267        if (ch == '\t') {
11268            found = 1;
11269            if (tabsize > 0) {
11270                incr = tabsize - (line_pos % tabsize); /* cannot overflow */
11271                if (j > PY_SSIZE_T_MAX - incr)
11272                    goto overflow;
11273                line_pos += incr;
11274                j += incr;
11275            }
11276        }
11277        else {
11278            if (j > PY_SSIZE_T_MAX - 1)
11279                goto overflow;
11280            line_pos++;
11281            j++;
11282            if (ch == '\n' || ch == '\r')
11283                line_pos = 0;
11284        }
11285    }
11286    if (!found)
11287        return unicode_result_unchanged(self);
11288
11289    /* Second pass: create output string and fill it */
11290    u = PyUnicode_New(j, PyUnicode_MAX_CHAR_VALUE(self));
11291    if (!u)
11292        return NULL;
11293    dest_data = PyUnicode_DATA(u);
11294
11295    i = j = line_pos = 0;
11296
11297    for (; i < src_len; i++) {
11298        ch = PyUnicode_READ(kind, src_data, i);
11299        if (ch == '\t') {
11300            if (tabsize > 0) {
11301                incr = tabsize - (line_pos % tabsize);
11302                line_pos += incr;
11303                FILL(kind, dest_data, ' ', j, incr);
11304                j += incr;
11305            }
11306        }
11307        else {
11308            line_pos++;
11309            PyUnicode_WRITE(kind, dest_data, j, ch);
11310            j++;
11311            if (ch == '\n' || ch == '\r')
11312                line_pos = 0;
11313        }
11314    }
11315    assert (j == PyUnicode_GET_LENGTH(u));
11316    return unicode_result(u);
11317
11318  overflow:
11319    PyErr_SetString(PyExc_OverflowError, "new string is too long");
11320    return NULL;
11321}
11322
11323PyDoc_STRVAR(find__doc__,
11324             "S.find(sub[, start[, end]]) -> int\n\
11325\n\
11326Return the lowest index in S where substring sub is found,\n\
11327such that sub is contained within S[start:end].  Optional\n\
11328arguments start and end are interpreted as in slice notation.\n\
11329\n\
11330Return -1 on failure.");
11331
11332static PyObject *
11333unicode_find(PyObject *self, PyObject *args)
11334{
11335    PyObject *substring = NULL;
11336    Py_ssize_t start = 0;
11337    Py_ssize_t end = 0;
11338    Py_ssize_t result;
11339
11340    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
11341                                            &start, &end))
11342        return NULL;
11343
11344    if (PyUnicode_READY(self) == -1) {
11345        Py_DECREF(substring);
11346        return NULL;
11347    }
11348    if (PyUnicode_READY(substring) == -1) {
11349        Py_DECREF(substring);
11350        return NULL;
11351    }
11352
11353    result = any_find_slice(1, self, substring, start, end);
11354
11355    Py_DECREF(substring);
11356
11357    if (result == -2)
11358        return NULL;
11359
11360    return PyLong_FromSsize_t(result);
11361}
11362
11363static PyObject *
11364unicode_getitem(PyObject *self, Py_ssize_t index)
11365{
11366    void *data;
11367    enum PyUnicode_Kind kind;
11368    Py_UCS4 ch;
11369
11370    if (!PyUnicode_Check(self) || PyUnicode_READY(self) == -1) {
11371        PyErr_BadArgument();
11372        return NULL;
11373    }
11374    if (index < 0 || index >= PyUnicode_GET_LENGTH(self)) {
11375        PyErr_SetString(PyExc_IndexError, "string index out of range");
11376        return NULL;
11377    }
11378    kind = PyUnicode_KIND(self);
11379    data = PyUnicode_DATA(self);
11380    ch = PyUnicode_READ(kind, data, index);
11381    return unicode_char(ch);
11382}
11383
11384/* Believe it or not, this produces the same value for ASCII strings
11385   as bytes_hash(). */
11386static Py_hash_t
11387unicode_hash(PyObject *self)
11388{
11389    Py_ssize_t len;
11390    Py_uhash_t x;  /* Unsigned for defined overflow behavior. */
11391
11392#ifdef Py_DEBUG
11393    assert(_Py_HashSecret_Initialized);
11394#endif
11395    if (_PyUnicode_HASH(self) != -1)
11396        return _PyUnicode_HASH(self);
11397    if (PyUnicode_READY(self) == -1)
11398        return -1;
11399    len = PyUnicode_GET_LENGTH(self);
11400    /*
11401      We make the hash of the empty string be 0, rather than using
11402      (prefix ^ suffix), since this slightly obfuscates the hash secret
11403    */
11404    if (len == 0) {
11405        _PyUnicode_HASH(self) = 0;
11406        return 0;
11407    }
11408    x = _Py_HashBytes(PyUnicode_DATA(self),
11409                      PyUnicode_GET_LENGTH(self) * PyUnicode_KIND(self));
11410    _PyUnicode_HASH(self) = x;
11411    return x;
11412}
11413
11414PyDoc_STRVAR(index__doc__,
11415             "S.index(sub[, start[, end]]) -> int\n\
11416\n\
11417Like S.find() but raise ValueError when the substring is not found.");
11418
11419static PyObject *
11420unicode_index(PyObject *self, PyObject *args)
11421{
11422    Py_ssize_t result;
11423    PyObject *substring = NULL;
11424    Py_ssize_t start = 0;
11425    Py_ssize_t end = 0;
11426
11427    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
11428                                            &start, &end))
11429        return NULL;
11430
11431    if (PyUnicode_READY(self) == -1) {
11432        Py_DECREF(substring);
11433        return NULL;
11434    }
11435    if (PyUnicode_READY(substring) == -1) {
11436        Py_DECREF(substring);
11437        return NULL;
11438    }
11439
11440    result = any_find_slice(1, self, substring, start, end);
11441
11442    Py_DECREF(substring);
11443
11444    if (result == -2)
11445        return NULL;
11446
11447    if (result < 0) {
11448        PyErr_SetString(PyExc_ValueError, "substring not found");
11449        return NULL;
11450    }
11451
11452    return PyLong_FromSsize_t(result);
11453}
11454
11455PyDoc_STRVAR(islower__doc__,
11456             "S.islower() -> bool\n\
11457\n\
11458Return True if all cased characters in S are lowercase and there is\n\
11459at least one cased character in S, False otherwise.");
11460
11461static PyObject*
11462unicode_islower(PyObject *self)
11463{
11464    Py_ssize_t i, length;
11465    int kind;
11466    void *data;
11467    int cased;
11468
11469    if (PyUnicode_READY(self) == -1)
11470        return NULL;
11471    length = PyUnicode_GET_LENGTH(self);
11472    kind = PyUnicode_KIND(self);
11473    data = PyUnicode_DATA(self);
11474
11475    /* Shortcut for single character strings */
11476    if (length == 1)
11477        return PyBool_FromLong(
11478            Py_UNICODE_ISLOWER(PyUnicode_READ(kind, data, 0)));
11479
11480    /* Special case for empty strings */
11481    if (length == 0)
11482        return PyBool_FromLong(0);
11483
11484    cased = 0;
11485    for (i = 0; i < length; i++) {
11486        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11487
11488        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
11489            return PyBool_FromLong(0);
11490        else if (!cased && Py_UNICODE_ISLOWER(ch))
11491            cased = 1;
11492    }
11493    return PyBool_FromLong(cased);
11494}
11495
11496PyDoc_STRVAR(isupper__doc__,
11497             "S.isupper() -> bool\n\
11498\n\
11499Return True if all cased characters in S are uppercase and there is\n\
11500at least one cased character in S, False otherwise.");
11501
11502static PyObject*
11503unicode_isupper(PyObject *self)
11504{
11505    Py_ssize_t i, length;
11506    int kind;
11507    void *data;
11508    int cased;
11509
11510    if (PyUnicode_READY(self) == -1)
11511        return NULL;
11512    length = PyUnicode_GET_LENGTH(self);
11513    kind = PyUnicode_KIND(self);
11514    data = PyUnicode_DATA(self);
11515
11516    /* Shortcut for single character strings */
11517    if (length == 1)
11518        return PyBool_FromLong(
11519            Py_UNICODE_ISUPPER(PyUnicode_READ(kind, data, 0)) != 0);
11520
11521    /* Special case for empty strings */
11522    if (length == 0)
11523        return PyBool_FromLong(0);
11524
11525    cased = 0;
11526    for (i = 0; i < length; i++) {
11527        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11528
11529        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
11530            return PyBool_FromLong(0);
11531        else if (!cased && Py_UNICODE_ISUPPER(ch))
11532            cased = 1;
11533    }
11534    return PyBool_FromLong(cased);
11535}
11536
11537PyDoc_STRVAR(istitle__doc__,
11538             "S.istitle() -> bool\n\
11539\n\
11540Return True if S is a titlecased string and there is at least one\n\
11541character in S, i.e. upper- and titlecase characters may only\n\
11542follow uncased characters and lowercase characters only cased ones.\n\
11543Return False otherwise.");
11544
11545static PyObject*
11546unicode_istitle(PyObject *self)
11547{
11548    Py_ssize_t i, length;
11549    int kind;
11550    void *data;
11551    int cased, previous_is_cased;
11552
11553    if (PyUnicode_READY(self) == -1)
11554        return NULL;
11555    length = PyUnicode_GET_LENGTH(self);
11556    kind = PyUnicode_KIND(self);
11557    data = PyUnicode_DATA(self);
11558
11559    /* Shortcut for single character strings */
11560    if (length == 1) {
11561        Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11562        return PyBool_FromLong((Py_UNICODE_ISTITLE(ch) != 0) ||
11563                               (Py_UNICODE_ISUPPER(ch) != 0));
11564    }
11565
11566    /* Special case for empty strings */
11567    if (length == 0)
11568        return PyBool_FromLong(0);
11569
11570    cased = 0;
11571    previous_is_cased = 0;
11572    for (i = 0; i < length; i++) {
11573        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11574
11575        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
11576            if (previous_is_cased)
11577                return PyBool_FromLong(0);
11578            previous_is_cased = 1;
11579            cased = 1;
11580        }
11581        else if (Py_UNICODE_ISLOWER(ch)) {
11582            if (!previous_is_cased)
11583                return PyBool_FromLong(0);
11584            previous_is_cased = 1;
11585            cased = 1;
11586        }
11587        else
11588            previous_is_cased = 0;
11589    }
11590    return PyBool_FromLong(cased);
11591}
11592
11593PyDoc_STRVAR(isspace__doc__,
11594             "S.isspace() -> bool\n\
11595\n\
11596Return True if all characters in S are whitespace\n\
11597and there is at least one character in S, False otherwise.");
11598
11599static PyObject*
11600unicode_isspace(PyObject *self)
11601{
11602    Py_ssize_t i, length;
11603    int kind;
11604    void *data;
11605
11606    if (PyUnicode_READY(self) == -1)
11607        return NULL;
11608    length = PyUnicode_GET_LENGTH(self);
11609    kind = PyUnicode_KIND(self);
11610    data = PyUnicode_DATA(self);
11611
11612    /* Shortcut for single character strings */
11613    if (length == 1)
11614        return PyBool_FromLong(
11615            Py_UNICODE_ISSPACE(PyUnicode_READ(kind, data, 0)));
11616
11617    /* Special case for empty strings */
11618    if (length == 0)
11619        return PyBool_FromLong(0);
11620
11621    for (i = 0; i < length; i++) {
11622        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11623        if (!Py_UNICODE_ISSPACE(ch))
11624            return PyBool_FromLong(0);
11625    }
11626    return PyBool_FromLong(1);
11627}
11628
11629PyDoc_STRVAR(isalpha__doc__,
11630             "S.isalpha() -> bool\n\
11631\n\
11632Return True if all characters in S are alphabetic\n\
11633and there is at least one character in S, False otherwise.");
11634
11635static PyObject*
11636unicode_isalpha(PyObject *self)
11637{
11638    Py_ssize_t i, length;
11639    int kind;
11640    void *data;
11641
11642    if (PyUnicode_READY(self) == -1)
11643        return NULL;
11644    length = PyUnicode_GET_LENGTH(self);
11645    kind = PyUnicode_KIND(self);
11646    data = PyUnicode_DATA(self);
11647
11648    /* Shortcut for single character strings */
11649    if (length == 1)
11650        return PyBool_FromLong(
11651            Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, 0)));
11652
11653    /* Special case for empty strings */
11654    if (length == 0)
11655        return PyBool_FromLong(0);
11656
11657    for (i = 0; i < length; i++) {
11658        if (!Py_UNICODE_ISALPHA(PyUnicode_READ(kind, data, i)))
11659            return PyBool_FromLong(0);
11660    }
11661    return PyBool_FromLong(1);
11662}
11663
11664PyDoc_STRVAR(isalnum__doc__,
11665             "S.isalnum() -> bool\n\
11666\n\
11667Return True if all characters in S are alphanumeric\n\
11668and there is at least one character in S, False otherwise.");
11669
11670static PyObject*
11671unicode_isalnum(PyObject *self)
11672{
11673    int kind;
11674    void *data;
11675    Py_ssize_t len, i;
11676
11677    if (PyUnicode_READY(self) == -1)
11678        return NULL;
11679
11680    kind = PyUnicode_KIND(self);
11681    data = PyUnicode_DATA(self);
11682    len = PyUnicode_GET_LENGTH(self);
11683
11684    /* Shortcut for single character strings */
11685    if (len == 1) {
11686        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11687        return PyBool_FromLong(Py_UNICODE_ISALNUM(ch));
11688    }
11689
11690    /* Special case for empty strings */
11691    if (len == 0)
11692        return PyBool_FromLong(0);
11693
11694    for (i = 0; i < len; i++) {
11695        const Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11696        if (!Py_UNICODE_ISALNUM(ch))
11697            return PyBool_FromLong(0);
11698    }
11699    return PyBool_FromLong(1);
11700}
11701
11702PyDoc_STRVAR(isdecimal__doc__,
11703             "S.isdecimal() -> bool\n\
11704\n\
11705Return True if there are only decimal characters in S,\n\
11706False otherwise.");
11707
11708static PyObject*
11709unicode_isdecimal(PyObject *self)
11710{
11711    Py_ssize_t i, length;
11712    int kind;
11713    void *data;
11714
11715    if (PyUnicode_READY(self) == -1)
11716        return NULL;
11717    length = PyUnicode_GET_LENGTH(self);
11718    kind = PyUnicode_KIND(self);
11719    data = PyUnicode_DATA(self);
11720
11721    /* Shortcut for single character strings */
11722    if (length == 1)
11723        return PyBool_FromLong(
11724            Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, 0)));
11725
11726    /* Special case for empty strings */
11727    if (length == 0)
11728        return PyBool_FromLong(0);
11729
11730    for (i = 0; i < length; i++) {
11731        if (!Py_UNICODE_ISDECIMAL(PyUnicode_READ(kind, data, i)))
11732            return PyBool_FromLong(0);
11733    }
11734    return PyBool_FromLong(1);
11735}
11736
11737PyDoc_STRVAR(isdigit__doc__,
11738             "S.isdigit() -> bool\n\
11739\n\
11740Return True if all characters in S are digits\n\
11741and there is at least one character in S, False otherwise.");
11742
11743static PyObject*
11744unicode_isdigit(PyObject *self)
11745{
11746    Py_ssize_t i, length;
11747    int kind;
11748    void *data;
11749
11750    if (PyUnicode_READY(self) == -1)
11751        return NULL;
11752    length = PyUnicode_GET_LENGTH(self);
11753    kind = PyUnicode_KIND(self);
11754    data = PyUnicode_DATA(self);
11755
11756    /* Shortcut for single character strings */
11757    if (length == 1) {
11758        const Py_UCS4 ch = PyUnicode_READ(kind, data, 0);
11759        return PyBool_FromLong(Py_UNICODE_ISDIGIT(ch));
11760    }
11761
11762    /* Special case for empty strings */
11763    if (length == 0)
11764        return PyBool_FromLong(0);
11765
11766    for (i = 0; i < length; i++) {
11767        if (!Py_UNICODE_ISDIGIT(PyUnicode_READ(kind, data, i)))
11768            return PyBool_FromLong(0);
11769    }
11770    return PyBool_FromLong(1);
11771}
11772
11773PyDoc_STRVAR(isnumeric__doc__,
11774             "S.isnumeric() -> bool\n\
11775\n\
11776Return True if there are only numeric characters in S,\n\
11777False otherwise.");
11778
11779static PyObject*
11780unicode_isnumeric(PyObject *self)
11781{
11782    Py_ssize_t i, length;
11783    int kind;
11784    void *data;
11785
11786    if (PyUnicode_READY(self) == -1)
11787        return NULL;
11788    length = PyUnicode_GET_LENGTH(self);
11789    kind = PyUnicode_KIND(self);
11790    data = PyUnicode_DATA(self);
11791
11792    /* Shortcut for single character strings */
11793    if (length == 1)
11794        return PyBool_FromLong(
11795            Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, 0)));
11796
11797    /* Special case for empty strings */
11798    if (length == 0)
11799        return PyBool_FromLong(0);
11800
11801    for (i = 0; i < length; i++) {
11802        if (!Py_UNICODE_ISNUMERIC(PyUnicode_READ(kind, data, i)))
11803            return PyBool_FromLong(0);
11804    }
11805    return PyBool_FromLong(1);
11806}
11807
11808int
11809PyUnicode_IsIdentifier(PyObject *self)
11810{
11811    int kind;
11812    void *data;
11813    Py_ssize_t i;
11814    Py_UCS4 first;
11815
11816    if (PyUnicode_READY(self) == -1) {
11817        Py_FatalError("identifier not ready");
11818        return 0;
11819    }
11820
11821    /* Special case for empty strings */
11822    if (PyUnicode_GET_LENGTH(self) == 0)
11823        return 0;
11824    kind = PyUnicode_KIND(self);
11825    data = PyUnicode_DATA(self);
11826
11827    /* PEP 3131 says that the first character must be in
11828       XID_Start and subsequent characters in XID_Continue,
11829       and for the ASCII range, the 2.x rules apply (i.e
11830       start with letters and underscore, continue with
11831       letters, digits, underscore). However, given the current
11832       definition of XID_Start and XID_Continue, it is sufficient
11833       to check just for these, except that _ must be allowed
11834       as starting an identifier.  */
11835    first = PyUnicode_READ(kind, data, 0);
11836    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
11837        return 0;
11838
11839    for (i = 1; i < PyUnicode_GET_LENGTH(self); i++)
11840        if (!_PyUnicode_IsXidContinue(PyUnicode_READ(kind, data, i)))
11841            return 0;
11842    return 1;
11843}
11844
11845PyDoc_STRVAR(isidentifier__doc__,
11846             "S.isidentifier() -> bool\n\
11847\n\
11848Return True if S is a valid identifier according\n\
11849to the language definition.\n\
11850\n\
11851Use keyword.iskeyword() to test for reserved identifiers\n\
11852such as \"def\" and \"class\".\n");
11853
11854static PyObject*
11855unicode_isidentifier(PyObject *self)
11856{
11857    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
11858}
11859
11860PyDoc_STRVAR(isprintable__doc__,
11861             "S.isprintable() -> bool\n\
11862\n\
11863Return True if all characters in S are considered\n\
11864printable in repr() or S is empty, False otherwise.");
11865
11866static PyObject*
11867unicode_isprintable(PyObject *self)
11868{
11869    Py_ssize_t i, length;
11870    int kind;
11871    void *data;
11872
11873    if (PyUnicode_READY(self) == -1)
11874        return NULL;
11875    length = PyUnicode_GET_LENGTH(self);
11876    kind = PyUnicode_KIND(self);
11877    data = PyUnicode_DATA(self);
11878
11879    /* Shortcut for single character strings */
11880    if (length == 1)
11881        return PyBool_FromLong(
11882            Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, 0)));
11883
11884    for (i = 0; i < length; i++) {
11885        if (!Py_UNICODE_ISPRINTABLE(PyUnicode_READ(kind, data, i))) {
11886            Py_RETURN_FALSE;
11887        }
11888    }
11889    Py_RETURN_TRUE;
11890}
11891
11892PyDoc_STRVAR(join__doc__,
11893             "S.join(iterable) -> str\n\
11894\n\
11895Return a string which is the concatenation of the strings in the\n\
11896iterable.  The separator between elements is S.");
11897
11898static PyObject*
11899unicode_join(PyObject *self, PyObject *data)
11900{
11901    return PyUnicode_Join(self, data);
11902}
11903
11904static Py_ssize_t
11905unicode_length(PyObject *self)
11906{
11907    if (PyUnicode_READY(self) == -1)
11908        return -1;
11909    return PyUnicode_GET_LENGTH(self);
11910}
11911
11912PyDoc_STRVAR(ljust__doc__,
11913             "S.ljust(width[, fillchar]) -> str\n\
11914\n\
11915Return S left-justified in a Unicode string of length width. Padding is\n\
11916done using the specified fill character (default is a space).");
11917
11918static PyObject *
11919unicode_ljust(PyObject *self, PyObject *args)
11920{
11921    Py_ssize_t width;
11922    Py_UCS4 fillchar = ' ';
11923
11924    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
11925        return NULL;
11926
11927    if (PyUnicode_READY(self) == -1)
11928        return NULL;
11929
11930    if (PyUnicode_GET_LENGTH(self) >= width)
11931        return unicode_result_unchanged(self);
11932
11933    return pad(self, 0, width - PyUnicode_GET_LENGTH(self), fillchar);
11934}
11935
11936PyDoc_STRVAR(lower__doc__,
11937             "S.lower() -> str\n\
11938\n\
11939Return a copy of the string S converted to lowercase.");
11940
11941static PyObject*
11942unicode_lower(PyObject *self)
11943{
11944    if (PyUnicode_READY(self) == -1)
11945        return NULL;
11946    if (PyUnicode_IS_ASCII(self))
11947        return ascii_upper_or_lower(self, 1);
11948    return case_operation(self, do_lower);
11949}
11950
/* Selector values naming which end(s) of the string a strip operation
   works on. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format strings, indexed by the selector values above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for selector i: its format string with the leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (&stripformat[i][3])
11959
11960/* externally visible for str.strip(unicode) */
11961PyObject *
11962_PyUnicode_XStrip(PyObject *self, int striptype, PyObject *sepobj)
11963{
11964    void *data;
11965    int kind;
11966    Py_ssize_t i, j, len;
11967    BLOOM_MASK sepmask;
11968    Py_ssize_t seplen;
11969
11970    if (PyUnicode_READY(self) == -1 || PyUnicode_READY(sepobj) == -1)
11971        return NULL;
11972
11973    kind = PyUnicode_KIND(self);
11974    data = PyUnicode_DATA(self);
11975    len = PyUnicode_GET_LENGTH(self);
11976    seplen = PyUnicode_GET_LENGTH(sepobj);
11977    sepmask = make_bloom_mask(PyUnicode_KIND(sepobj),
11978                              PyUnicode_DATA(sepobj),
11979                              seplen);
11980
11981    i = 0;
11982    if (striptype != RIGHTSTRIP) {
11983        while (i < len) {
11984            Py_UCS4 ch = PyUnicode_READ(kind, data, i);
11985            if (!BLOOM(sepmask, ch))
11986                break;
11987            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
11988                break;
11989            i++;
11990        }
11991    }
11992
11993    j = len;
11994    if (striptype != LEFTSTRIP) {
11995        j--;
11996        while (j >= i) {
11997            Py_UCS4 ch = PyUnicode_READ(kind, data, j);
11998            if (!BLOOM(sepmask, ch))
11999                break;
12000            if (PyUnicode_FindChar(sepobj, ch, 0, seplen, 1) < 0)
12001                break;
12002            j--;
12003        }
12004
12005        j++;
12006    }
12007
12008    return PyUnicode_Substring(self, i, j);
12009}
12010
12011PyObject*
12012PyUnicode_Substring(PyObject *self, Py_ssize_t start, Py_ssize_t end)
12013{
12014    unsigned char *data;
12015    int kind;
12016    Py_ssize_t length;
12017
12018    if (PyUnicode_READY(self) == -1)
12019        return NULL;
12020
12021    length = PyUnicode_GET_LENGTH(self);
12022    end = Py_MIN(end, length);
12023
12024    if (start == 0 && end == length)
12025        return unicode_result_unchanged(self);
12026
12027    if (start < 0 || end < 0) {
12028        PyErr_SetString(PyExc_IndexError, "string index out of range");
12029        return NULL;
12030    }
12031    if (start >= length || end < start)
12032        _Py_RETURN_UNICODE_EMPTY();
12033
12034    length = end - start;
12035    if (PyUnicode_IS_ASCII(self)) {
12036        data = PyUnicode_1BYTE_DATA(self);
12037        return _PyUnicode_FromASCII((char*)(data + start), length);
12038    }
12039    else {
12040        kind = PyUnicode_KIND(self);
12041        data = PyUnicode_1BYTE_DATA(self);
12042        return PyUnicode_FromKindAndData(kind,
12043                                         data + kind * start,
12044                                         length);
12045    }
12046}
12047
12048static PyObject *
12049do_strip(PyObject *self, int striptype)
12050{
12051    Py_ssize_t len, i, j;
12052
12053    if (PyUnicode_READY(self) == -1)
12054        return NULL;
12055
12056    len = PyUnicode_GET_LENGTH(self);
12057
12058    if (PyUnicode_IS_ASCII(self)) {
12059        Py_UCS1 *data = PyUnicode_1BYTE_DATA(self);
12060
12061        i = 0;
12062        if (striptype != RIGHTSTRIP) {
12063            while (i < len) {
12064                Py_UCS1 ch = data[i];
12065                if (!_Py_ascii_whitespace[ch])
12066                    break;
12067                i++;
12068            }
12069        }
12070
12071        j = len;
12072        if (striptype != LEFTSTRIP) {
12073            j--;
12074            while (j >= i) {
12075                Py_UCS1 ch = data[j];
12076                if (!_Py_ascii_whitespace[ch])
12077                    break;
12078                j--;
12079            }
12080            j++;
12081        }
12082    }
12083    else {
12084        int kind = PyUnicode_KIND(self);
12085        void *data = PyUnicode_DATA(self);
12086
12087        i = 0;
12088        if (striptype != RIGHTSTRIP) {
12089            while (i < len) {
12090                Py_UCS4 ch = PyUnicode_READ(kind, data, i);
12091                if (!Py_UNICODE_ISSPACE(ch))
12092                    break;
12093                i++;
12094            }
12095        }
12096
12097        j = len;
12098        if (striptype != LEFTSTRIP) {
12099            j--;
12100            while (j >= i) {
12101                Py_UCS4 ch = PyUnicode_READ(kind, data, j);
12102                if (!Py_UNICODE_ISSPACE(ch))
12103                    break;
12104                j--;
12105            }
12106            j++;
12107        }
12108    }
12109
12110    return PyUnicode_Substring(self, i, j);
12111}
12112
12113
12114static PyObject *
12115do_argstrip(PyObject *self, int striptype, PyObject *args)
12116{
12117    PyObject *sep = NULL;
12118
12119    if (!PyArg_ParseTuple(args, stripformat[striptype], &sep))
12120        return NULL;
12121
12122    if (sep != NULL && sep != Py_None) {
12123        if (PyUnicode_Check(sep))
12124            return _PyUnicode_XStrip(self, striptype, sep);
12125        else {
12126            PyErr_Format(PyExc_TypeError,
12127                         "%s arg must be None or str",
12128                         STRIPNAME(striptype));
12129            return NULL;
12130        }
12131    }
12132
12133    return do_strip(self, striptype);
12134}
12135
12136
12137PyDoc_STRVAR(strip__doc__,
12138             "S.strip([chars]) -> str\n\
12139\n\
12140Return a copy of the string S with leading and trailing\n\
12141whitespace removed.\n\
12142If chars is given and not None, remove characters in chars instead.");
12143
12144static PyObject *
12145unicode_strip(PyObject *self, PyObject *args)
12146{
12147    if (PyTuple_GET_SIZE(args) == 0)
12148        return do_strip(self, BOTHSTRIP); /* Common case */
12149    else
12150        return do_argstrip(self, BOTHSTRIP, args);
12151}
12152
12153
12154PyDoc_STRVAR(lstrip__doc__,
12155             "S.lstrip([chars]) -> str\n\
12156\n\
12157Return a copy of the string S with leading whitespace removed.\n\
12158If chars is given and not None, remove characters in chars instead.");
12159
12160static PyObject *
12161unicode_lstrip(PyObject *self, PyObject *args)
12162{
12163    if (PyTuple_GET_SIZE(args) == 0)
12164        return do_strip(self, LEFTSTRIP); /* Common case */
12165    else
12166        return do_argstrip(self, LEFTSTRIP, args);
12167}
12168
12169
12170PyDoc_STRVAR(rstrip__doc__,
12171             "S.rstrip([chars]) -> str\n\
12172\n\
12173Return a copy of the string S with trailing whitespace removed.\n\
12174If chars is given and not None, remove characters in chars instead.");
12175
12176static PyObject *
12177unicode_rstrip(PyObject *self, PyObject *args)
12178{
12179    if (PyTuple_GET_SIZE(args) == 0)
12180        return do_strip(self, RIGHTSTRIP); /* Common case */
12181    else
12182        return do_argstrip(self, RIGHTSTRIP, args);
12183}
12184
12185
12186static PyObject*
12187unicode_repeat(PyObject *str, Py_ssize_t len)
12188{
12189    PyObject *u;
12190    Py_ssize_t nchars, n;
12191
12192    if (len < 1)
12193        _Py_RETURN_UNICODE_EMPTY();
12194
12195    /* no repeat, return original string */
12196    if (len == 1)
12197        return unicode_result_unchanged(str);
12198
12199    if (PyUnicode_READY(str) == -1)
12200        return NULL;
12201
12202    if (PyUnicode_GET_LENGTH(str) > PY_SSIZE_T_MAX / len) {
12203        PyErr_SetString(PyExc_OverflowError,
12204                        "repeated string is too long");
12205        return NULL;
12206    }
12207    nchars = len * PyUnicode_GET_LENGTH(str);
12208
12209    u = PyUnicode_New(nchars, PyUnicode_MAX_CHAR_VALUE(str));
12210    if (!u)
12211        return NULL;
12212    assert(PyUnicode_KIND(u) == PyUnicode_KIND(str));
12213
12214    if (PyUnicode_GET_LENGTH(str) == 1) {
12215        const int kind = PyUnicode_KIND(str);
12216        const Py_UCS4 fill_char = PyUnicode_READ(kind, PyUnicode_DATA(str), 0);
12217        if (kind == PyUnicode_1BYTE_KIND) {
12218            void *to = PyUnicode_DATA(u);
12219            memset(to, (unsigned char)fill_char, len);
12220        }
12221        else if (kind == PyUnicode_2BYTE_KIND) {
12222            Py_UCS2 *ucs2 = PyUnicode_2BYTE_DATA(u);
12223            for (n = 0; n < len; ++n)
12224                ucs2[n] = fill_char;
12225        } else {
12226            Py_UCS4 *ucs4 = PyUnicode_4BYTE_DATA(u);
12227            assert(kind == PyUnicode_4BYTE_KIND);
12228            for (n = 0; n < len; ++n)
12229                ucs4[n] = fill_char;
12230        }
12231    }
12232    else {
12233        /* number of characters copied this far */
12234        Py_ssize_t done = PyUnicode_GET_LENGTH(str);
12235        const Py_ssize_t char_size = PyUnicode_KIND(str);
12236        char *to = (char *) PyUnicode_DATA(u);
12237        Py_MEMCPY(to, PyUnicode_DATA(str),
12238                  PyUnicode_GET_LENGTH(str) * char_size);
12239        while (done < nchars) {
12240            n = (done <= nchars-done) ? done : nchars-done;
12241            Py_MEMCPY(to + (done * char_size), to, n * char_size);
12242            done += n;
12243        }
12244    }
12245
12246    assert(_PyUnicode_CheckConsistency(u, 1));
12247    return u;
12248}
12249
12250PyObject *
12251PyUnicode_Replace(PyObject *obj,
12252                  PyObject *subobj,
12253                  PyObject *replobj,
12254                  Py_ssize_t maxcount)
12255{
12256    PyObject *self;
12257    PyObject *str1;
12258    PyObject *str2;
12259    PyObject *result;
12260
12261    self = PyUnicode_FromObject(obj);
12262    if (self == NULL)
12263        return NULL;
12264    str1 = PyUnicode_FromObject(subobj);
12265    if (str1 == NULL) {
12266        Py_DECREF(self);
12267        return NULL;
12268    }
12269    str2 = PyUnicode_FromObject(replobj);
12270    if (str2 == NULL) {
12271        Py_DECREF(self);
12272        Py_DECREF(str1);
12273        return NULL;
12274    }
12275    if (PyUnicode_READY(self) == -1 ||
12276        PyUnicode_READY(str1) == -1 ||
12277        PyUnicode_READY(str2) == -1)
12278        result = NULL;
12279    else
12280        result = replace(self, str1, str2, maxcount);
12281    Py_DECREF(self);
12282    Py_DECREF(str1);
12283    Py_DECREF(str2);
12284    return result;
12285}
12286
12287PyDoc_STRVAR(replace__doc__,
12288             "S.replace(old, new[, count]) -> str\n\
12289\n\
12290Return a copy of S with all occurrences of substring\n\
12291old replaced by new.  If the optional argument count is\n\
12292given, only the first count occurrences are replaced.");
12293
12294static PyObject*
12295unicode_replace(PyObject *self, PyObject *args)
12296{
12297    PyObject *str1;
12298    PyObject *str2;
12299    Py_ssize_t maxcount = -1;
12300    PyObject *result;
12301
12302    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
12303        return NULL;
12304    if (PyUnicode_READY(self) == -1)
12305        return NULL;
12306    str1 = PyUnicode_FromObject(str1);
12307    if (str1 == NULL)
12308        return NULL;
12309    str2 = PyUnicode_FromObject(str2);
12310    if (str2 == NULL) {
12311        Py_DECREF(str1);
12312        return NULL;
12313    }
12314    if (PyUnicode_READY(str1) == -1 || PyUnicode_READY(str2) == -1)
12315        result = NULL;
12316    else
12317        result = replace(self, str1, str2, maxcount);
12318
12319    Py_DECREF(str1);
12320    Py_DECREF(str2);
12321    return result;
12322}
12323
12324static PyObject *
12325unicode_repr(PyObject *unicode)
12326{
12327    PyObject *repr;
12328    Py_ssize_t isize;
12329    Py_ssize_t osize, squote, dquote, i, o;
12330    Py_UCS4 max, quote;
12331    int ikind, okind, unchanged;
12332    void *idata, *odata;
12333
12334    if (PyUnicode_READY(unicode) == -1)
12335        return NULL;
12336
12337    isize = PyUnicode_GET_LENGTH(unicode);
12338    idata = PyUnicode_DATA(unicode);
12339
12340    /* Compute length of output, quote characters, and
12341       maximum character */
12342    osize = 0;
12343    max = 127;
12344    squote = dquote = 0;
12345    ikind = PyUnicode_KIND(unicode);
12346    for (i = 0; i < isize; i++) {
12347        Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12348        Py_ssize_t incr = 1;
12349        switch (ch) {
12350        case '\'': squote++; break;
12351        case '"':  dquote++; break;
12352        case '\\': case '\t': case '\r': case '\n':
12353            incr = 2;
12354            break;
12355        default:
12356            /* Fast-path ASCII */
12357            if (ch < ' ' || ch == 0x7f)
12358                incr = 4; /* \xHH */
12359            else if (ch < 0x7f)
12360                ;
12361            else if (Py_UNICODE_ISPRINTABLE(ch))
12362                max = ch > max ? ch : max;
12363            else if (ch < 0x100)
12364                incr = 4; /* \xHH */
12365            else if (ch < 0x10000)
12366                incr = 6; /* \uHHHH */
12367            else
12368                incr = 10; /* \uHHHHHHHH */
12369        }
12370        if (osize > PY_SSIZE_T_MAX - incr) {
12371            PyErr_SetString(PyExc_OverflowError,
12372                            "string is too long to generate repr");
12373            return NULL;
12374        }
12375        osize += incr;
12376    }
12377
12378    quote = '\'';
12379    unchanged = (osize == isize);
12380    if (squote) {
12381        unchanged = 0;
12382        if (dquote)
12383            /* Both squote and dquote present. Use squote,
12384               and escape them */
12385            osize += squote;
12386        else
12387            quote = '"';
12388    }
12389    osize += 2;   /* quotes */
12390
12391    repr = PyUnicode_New(osize, max);
12392    if (repr == NULL)
12393        return NULL;
12394    okind = PyUnicode_KIND(repr);
12395    odata = PyUnicode_DATA(repr);
12396
12397    PyUnicode_WRITE(okind, odata, 0, quote);
12398    PyUnicode_WRITE(okind, odata, osize-1, quote);
12399    if (unchanged) {
12400        _PyUnicode_FastCopyCharacters(repr, 1,
12401                                      unicode, 0,
12402                                      isize);
12403    }
12404    else {
12405        for (i = 0, o = 1; i < isize; i++) {
12406            Py_UCS4 ch = PyUnicode_READ(ikind, idata, i);
12407
12408            /* Escape quotes and backslashes */
12409            if ((ch == quote) || (ch == '\\')) {
12410                PyUnicode_WRITE(okind, odata, o++, '\\');
12411                PyUnicode_WRITE(okind, odata, o++, ch);
12412                continue;
12413            }
12414
12415            /* Map special whitespace to '\t', \n', '\r' */
12416            if (ch == '\t') {
12417                PyUnicode_WRITE(okind, odata, o++, '\\');
12418                PyUnicode_WRITE(okind, odata, o++, 't');
12419            }
12420            else if (ch == '\n') {
12421                PyUnicode_WRITE(okind, odata, o++, '\\');
12422                PyUnicode_WRITE(okind, odata, o++, 'n');
12423            }
12424            else if (ch == '\r') {
12425                PyUnicode_WRITE(okind, odata, o++, '\\');
12426                PyUnicode_WRITE(okind, odata, o++, 'r');
12427            }
12428
12429            /* Map non-printable US ASCII to '\xhh' */
12430            else if (ch < ' ' || ch == 0x7F) {
12431                PyUnicode_WRITE(okind, odata, o++, '\\');
12432                PyUnicode_WRITE(okind, odata, o++, 'x');
12433                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12434                PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12435            }
12436
12437            /* Copy ASCII characters as-is */
12438            else if (ch < 0x7F) {
12439                PyUnicode_WRITE(okind, odata, o++, ch);
12440            }
12441
12442            /* Non-ASCII characters */
12443            else {
12444                /* Map Unicode whitespace and control characters
12445                   (categories Z* and C* except ASCII space)
12446                */
12447                if (!Py_UNICODE_ISPRINTABLE(ch)) {
12448                    PyUnicode_WRITE(okind, odata, o++, '\\');
12449                    /* Map 8-bit characters to '\xhh' */
12450                    if (ch <= 0xff) {
12451                        PyUnicode_WRITE(okind, odata, o++, 'x');
12452                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0x000F]);
12453                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0x000F]);
12454                    }
12455                    /* Map 16-bit characters to '\uxxxx' */
12456                    else if (ch <= 0xffff) {
12457                        PyUnicode_WRITE(okind, odata, o++, 'u');
12458                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12459                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12460                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12461                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12462                    }
12463                    /* Map 21-bit characters to '\U00xxxxxx' */
12464                    else {
12465                        PyUnicode_WRITE(okind, odata, o++, 'U');
12466                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 28) & 0xF]);
12467                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 24) & 0xF]);
12468                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 20) & 0xF]);
12469                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 16) & 0xF]);
12470                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 12) & 0xF]);
12471                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 8) & 0xF]);
12472                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[(ch >> 4) & 0xF]);
12473                        PyUnicode_WRITE(okind, odata, o++, Py_hexdigits[ch & 0xF]);
12474                    }
12475                }
12476                /* Copy characters as-is */
12477                else {
12478                    PyUnicode_WRITE(okind, odata, o++, ch);
12479                }
12480            }
12481        }
12482    }
12483    /* Closing quote already added at the beginning */
12484    assert(_PyUnicode_CheckConsistency(repr, 1));
12485    return repr;
12486}
12487
12488PyDoc_STRVAR(rfind__doc__,
12489             "S.rfind(sub[, start[, end]]) -> int\n\
12490\n\
12491Return the highest index in S where substring sub is found,\n\
12492such that sub is contained within S[start:end].  Optional\n\
12493arguments start and end are interpreted as in slice notation.\n\
12494\n\
12495Return -1 on failure.");
12496
12497static PyObject *
12498unicode_rfind(PyObject *self, PyObject *args)
12499{
12500    PyObject *substring = NULL;
12501    Py_ssize_t start = 0;
12502    Py_ssize_t end = 0;
12503    Py_ssize_t result;
12504
12505    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
12506                                            &start, &end))
12507        return NULL;
12508
12509    if (PyUnicode_READY(self) == -1) {
12510        Py_DECREF(substring);
12511        return NULL;
12512    }
12513    if (PyUnicode_READY(substring) == -1) {
12514        Py_DECREF(substring);
12515        return NULL;
12516    }
12517
12518    result = any_find_slice(-1, self, substring, start, end);
12519
12520    Py_DECREF(substring);
12521
12522    if (result == -2)
12523        return NULL;
12524
12525    return PyLong_FromSsize_t(result);
12526}
12527
12528PyDoc_STRVAR(rindex__doc__,
12529             "S.rindex(sub[, start[, end]]) -> int\n\
12530\n\
12531Like S.rfind() but raise ValueError when the substring is not found.");
12532
12533static PyObject *
12534unicode_rindex(PyObject *self, PyObject *args)
12535{
12536    PyObject *substring = NULL;
12537    Py_ssize_t start = 0;
12538    Py_ssize_t end = 0;
12539    Py_ssize_t result;
12540
12541    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
12542                                            &start, &end))
12543        return NULL;
12544
12545    if (PyUnicode_READY(self) == -1) {
12546        Py_DECREF(substring);
12547        return NULL;
12548    }
12549    if (PyUnicode_READY(substring) == -1) {
12550        Py_DECREF(substring);
12551        return NULL;
12552    }
12553
12554    result = any_find_slice(-1, self, substring, start, end);
12555
12556    Py_DECREF(substring);
12557
12558    if (result == -2)
12559        return NULL;
12560
12561    if (result < 0) {
12562        PyErr_SetString(PyExc_ValueError, "substring not found");
12563        return NULL;
12564    }
12565
12566    return PyLong_FromSsize_t(result);
12567}
12568
12569PyDoc_STRVAR(rjust__doc__,
12570             "S.rjust(width[, fillchar]) -> str\n\
12571\n\
12572Return S right-justified in a string of length width. Padding is\n\
12573done using the specified fill character (default is a space).");
12574
12575static PyObject *
12576unicode_rjust(PyObject *self, PyObject *args)
12577{
12578    Py_ssize_t width;
12579    Py_UCS4 fillchar = ' ';
12580
12581    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
12582        return NULL;
12583
12584    if (PyUnicode_READY(self) == -1)
12585        return NULL;
12586
12587    if (PyUnicode_GET_LENGTH(self) >= width)
12588        return unicode_result_unchanged(self);
12589
12590    return pad(self, width - PyUnicode_GET_LENGTH(self), 0, fillchar);
12591}
12592
12593PyObject *
12594PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12595{
12596    PyObject *result;
12597
12598    s = PyUnicode_FromObject(s);
12599    if (s == NULL)
12600        return NULL;
12601    if (sep != NULL) {
12602        sep = PyUnicode_FromObject(sep);
12603        if (sep == NULL) {
12604            Py_DECREF(s);
12605            return NULL;
12606        }
12607    }
12608
12609    result = split(s, sep, maxsplit);
12610
12611    Py_DECREF(s);
12612    Py_XDECREF(sep);
12613    return result;
12614}
12615
12616PyDoc_STRVAR(split__doc__,
12617             "S.split(sep=None, maxsplit=-1) -> list of strings\n\
12618\n\
12619Return a list of the words in S, using sep as the\n\
12620delimiter string.  If maxsplit is given, at most maxsplit\n\
12621splits are done. If sep is not specified or is None, any\n\
12622whitespace string is a separator and empty strings are\n\
12623removed from the result.");
12624
12625static PyObject*
12626unicode_split(PyObject *self, PyObject *args, PyObject *kwds)
12627{
12628    static char *kwlist[] = {"sep", "maxsplit", 0};
12629    PyObject *substring = Py_None;
12630    Py_ssize_t maxcount = -1;
12631
12632    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:split",
12633                                     kwlist, &substring, &maxcount))
12634        return NULL;
12635
12636    if (substring == Py_None)
12637        return split(self, NULL, maxcount);
12638    else if (PyUnicode_Check(substring))
12639        return split(self, substring, maxcount);
12640    else
12641        return PyUnicode_Split(self, substring, maxcount);
12642}
12643
12644PyObject *
12645PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
12646{
12647    PyObject* str_obj;
12648    PyObject* sep_obj;
12649    PyObject* out;
12650    int kind1, kind2, kind;
12651    void *buf1 = NULL, *buf2 = NULL;
12652    Py_ssize_t len1, len2;
12653
12654    str_obj = PyUnicode_FromObject(str_in);
12655    if (!str_obj)
12656        return NULL;
12657    sep_obj = PyUnicode_FromObject(sep_in);
12658    if (!sep_obj) {
12659        Py_DECREF(str_obj);
12660        return NULL;
12661    }
12662    if (PyUnicode_READY(sep_obj) == -1 || PyUnicode_READY(str_obj) == -1) {
12663        Py_DECREF(sep_obj);
12664        Py_DECREF(str_obj);
12665        return NULL;
12666    }
12667
12668    kind1 = PyUnicode_KIND(str_obj);
12669    kind2 = PyUnicode_KIND(sep_obj);
12670    kind = Py_MAX(kind1, kind2);
12671    buf1 = PyUnicode_DATA(str_obj);
12672    if (kind1 != kind)
12673        buf1 = _PyUnicode_AsKind(str_obj, kind);
12674    if (!buf1)
12675        goto onError;
12676    buf2 = PyUnicode_DATA(sep_obj);
12677    if (kind2 != kind)
12678        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12679    if (!buf2)
12680        goto onError;
12681    len1 = PyUnicode_GET_LENGTH(str_obj);
12682    len2 = PyUnicode_GET_LENGTH(sep_obj);
12683
12684    switch (kind) {
12685    case PyUnicode_1BYTE_KIND:
12686        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12687            out = asciilib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12688        else
12689            out = ucs1lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12690        break;
12691    case PyUnicode_2BYTE_KIND:
12692        out = ucs2lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12693        break;
12694    case PyUnicode_4BYTE_KIND:
12695        out = ucs4lib_partition(str_obj, buf1, len1, sep_obj, buf2, len2);
12696        break;
12697    default:
12698        assert(0);
12699        out = 0;
12700    }
12701
12702    Py_DECREF(sep_obj);
12703    Py_DECREF(str_obj);
12704    if (kind1 != kind)
12705        PyMem_Free(buf1);
12706    if (kind2 != kind)
12707        PyMem_Free(buf2);
12708
12709    return out;
12710  onError:
12711    Py_DECREF(sep_obj);
12712    Py_DECREF(str_obj);
12713    if (kind1 != kind && buf1)
12714        PyMem_Free(buf1);
12715    if (kind2 != kind && buf2)
12716        PyMem_Free(buf2);
12717    return NULL;
12718}
12719
12720
12721PyObject *
12722PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
12723{
12724    PyObject* str_obj;
12725    PyObject* sep_obj;
12726    PyObject* out;
12727    int kind1, kind2, kind;
12728    void *buf1 = NULL, *buf2 = NULL;
12729    Py_ssize_t len1, len2;
12730
12731    str_obj = PyUnicode_FromObject(str_in);
12732    if (!str_obj)
12733        return NULL;
12734    sep_obj = PyUnicode_FromObject(sep_in);
12735    if (!sep_obj) {
12736        Py_DECREF(str_obj);
12737        return NULL;
12738    }
12739
12740    kind1 = PyUnicode_KIND(str_obj);
12741    kind2 = PyUnicode_KIND(sep_obj);
12742    kind = Py_MAX(kind1, kind2);
12743    buf1 = PyUnicode_DATA(str_obj);
12744    if (kind1 != kind)
12745        buf1 = _PyUnicode_AsKind(str_obj, kind);
12746    if (!buf1)
12747        goto onError;
12748    buf2 = PyUnicode_DATA(sep_obj);
12749    if (kind2 != kind)
12750        buf2 = _PyUnicode_AsKind(sep_obj, kind);
12751    if (!buf2)
12752        goto onError;
12753    len1 = PyUnicode_GET_LENGTH(str_obj);
12754    len2 = PyUnicode_GET_LENGTH(sep_obj);
12755
12756    switch (kind) {
12757    case PyUnicode_1BYTE_KIND:
12758        if (PyUnicode_IS_ASCII(str_obj) && PyUnicode_IS_ASCII(sep_obj))
12759            out = asciilib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12760        else
12761            out = ucs1lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12762        break;
12763    case PyUnicode_2BYTE_KIND:
12764        out = ucs2lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12765        break;
12766    case PyUnicode_4BYTE_KIND:
12767        out = ucs4lib_rpartition(str_obj, buf1, len1, sep_obj, buf2, len2);
12768        break;
12769    default:
12770        assert(0);
12771        out = 0;
12772    }
12773
12774    Py_DECREF(sep_obj);
12775    Py_DECREF(str_obj);
12776    if (kind1 != kind)
12777        PyMem_Free(buf1);
12778    if (kind2 != kind)
12779        PyMem_Free(buf2);
12780
12781    return out;
12782  onError:
12783    Py_DECREF(sep_obj);
12784    Py_DECREF(str_obj);
12785    if (kind1 != kind && buf1)
12786        PyMem_Free(buf1);
12787    if (kind2 != kind && buf2)
12788        PyMem_Free(buf2);
12789    return NULL;
12790}
12791
12792PyDoc_STRVAR(partition__doc__,
12793             "S.partition(sep) -> (head, sep, tail)\n\
12794\n\
12795Search for the separator sep in S, and return the part before it,\n\
12796the separator itself, and the part after it.  If the separator is not\n\
12797found, return S and two empty strings.");
12798
12799static PyObject*
12800unicode_partition(PyObject *self, PyObject *separator)
12801{
12802    return PyUnicode_Partition(self, separator);
12803}
12804
12805PyDoc_STRVAR(rpartition__doc__,
12806             "S.rpartition(sep) -> (head, sep, tail)\n\
12807\n\
12808Search for the separator sep in S, starting at the end of S, and return\n\
12809the part before it, the separator itself, and the part after it.  If the\n\
12810separator is not found, return two empty strings and S.");
12811
12812static PyObject*
12813unicode_rpartition(PyObject *self, PyObject *separator)
12814{
12815    return PyUnicode_RPartition(self, separator);
12816}
12817
12818PyObject *
12819PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
12820{
12821    PyObject *result;
12822
12823    s = PyUnicode_FromObject(s);
12824    if (s == NULL)
12825        return NULL;
12826    if (sep != NULL) {
12827        sep = PyUnicode_FromObject(sep);
12828        if (sep == NULL) {
12829            Py_DECREF(s);
12830            return NULL;
12831        }
12832    }
12833
12834    result = rsplit(s, sep, maxsplit);
12835
12836    Py_DECREF(s);
12837    Py_XDECREF(sep);
12838    return result;
12839}
12840
12841PyDoc_STRVAR(rsplit__doc__,
12842             "S.rsplit(sep=None, maxsplit=-1) -> list of strings\n\
12843\n\
12844Return a list of the words in S, using sep as the\n\
12845delimiter string, starting at the end of the string and\n\
12846working to the front.  If maxsplit is given, at most maxsplit\n\
12847splits are done. If sep is not specified, any whitespace string\n\
12848is a separator.");
12849
12850static PyObject*
12851unicode_rsplit(PyObject *self, PyObject *args, PyObject *kwds)
12852{
12853    static char *kwlist[] = {"sep", "maxsplit", 0};
12854    PyObject *substring = Py_None;
12855    Py_ssize_t maxcount = -1;
12856
12857    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|On:rsplit",
12858                                     kwlist, &substring, &maxcount))
12859        return NULL;
12860
12861    if (substring == Py_None)
12862        return rsplit(self, NULL, maxcount);
12863    else if (PyUnicode_Check(substring))
12864        return rsplit(self, substring, maxcount);
12865    else
12866        return PyUnicode_RSplit(self, substring, maxcount);
12867}
12868
12869PyDoc_STRVAR(splitlines__doc__,
12870             "S.splitlines([keepends]) -> list of strings\n\
12871\n\
12872Return a list of the lines in S, breaking at line boundaries.\n\
12873Line breaks are not included in the resulting list unless keepends\n\
12874is given and true.");
12875
12876static PyObject*
12877unicode_splitlines(PyObject *self, PyObject *args, PyObject *kwds)
12878{
12879    static char *kwlist[] = {"keepends", 0};
12880    int keepends = 0;
12881
12882    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|i:splitlines",
12883                                     kwlist, &keepends))
12884        return NULL;
12885
12886    return PyUnicode_Splitlines(self, keepends);
12887}
12888
12889static
12890PyObject *unicode_str(PyObject *self)
12891{
12892    return unicode_result_unchanged(self);
12893}
12894
12895PyDoc_STRVAR(swapcase__doc__,
12896             "S.swapcase() -> str\n\
12897\n\
12898Return a copy of S with uppercase characters converted to lowercase\n\
12899and vice versa.");
12900
12901static PyObject*
12902unicode_swapcase(PyObject *self)
12903{
12904    if (PyUnicode_READY(self) == -1)
12905        return NULL;
12906    return case_operation(self, do_swapcase);
12907}
12908
12909/*[clinic input]
12910
12911@staticmethod
12912str.maketrans as unicode_maketrans
12913
12914  x: object
12915
12916  y: unicode=NULL
12917
12918  z: unicode=NULL
12919
12920  /
12921
12922Return a translation table usable for str.translate().
12923
12924If there is only one argument, it must be a dictionary mapping Unicode
12925ordinals (integers) or characters to Unicode ordinals, strings or None.
12926Character keys will be then converted to ordinals.
12927If there are two arguments, they must be strings of equal length, and
12928in the resulting dictionary, each character in x will be mapped to the
12929character at the same position in y. If there is a third argument, it
12930must be a string, whose characters will be mapped to None in the result.
12931[clinic start generated code]*/
12932
12933PyDoc_STRVAR(unicode_maketrans__doc__,
12934"maketrans(x, y=None, z=None, /)\n"
12935"--\n"
12936"\n"
12937"Return a translation table usable for str.translate().\n"
12938"\n"
12939"If there is only one argument, it must be a dictionary mapping Unicode\n"
12940"ordinals (integers) or characters to Unicode ordinals, strings or None.\n"
12941"Character keys will be then converted to ordinals.\n"
12942"If there are two arguments, they must be strings of equal length, and\n"
12943"in the resulting dictionary, each character in x will be mapped to the\n"
12944"character at the same position in y. If there is a third argument, it\n"
12945"must be a string, whose characters will be mapped to None in the result.");
12946
12947#define UNICODE_MAKETRANS_METHODDEF    \
12948    {"maketrans", (PyCFunction)unicode_maketrans, METH_VARARGS|METH_STATIC, unicode_maketrans__doc__},
12949
12950static PyObject *
12951unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z);
12952
12953static PyObject *
12954unicode_maketrans(void *null, PyObject *args)
12955{
12956    PyObject *return_value = NULL;
12957    PyObject *x;
12958    PyObject *y = NULL;
12959    PyObject *z = NULL;
12960
12961    if (!PyArg_ParseTuple(args,
12962        "O|UU:maketrans",
12963        &x, &y, &z))
12964        goto exit;
12965    return_value = unicode_maketrans_impl(x, y, z);
12966
12967exit:
12968    return return_value;
12969}
12970
12971static PyObject *
12972unicode_maketrans_impl(PyObject *x, PyObject *y, PyObject *z)
12973/*[clinic end generated code: output=566edf630f77436a input=7bfbf529a293c6c5]*/
12974{
12975    PyObject *new = NULL, *key, *value;
12976    Py_ssize_t i = 0;
12977    int res;
12978
12979    new = PyDict_New();
12980    if (!new)
12981        return NULL;
12982    if (y != NULL) {
12983        int x_kind, y_kind, z_kind;
12984        void *x_data, *y_data, *z_data;
12985
12986        /* x must be a string too, of equal length */
12987        if (!PyUnicode_Check(x)) {
12988            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
12989                            "be a string if there is a second argument");
12990            goto err;
12991        }
12992        if (PyUnicode_GET_LENGTH(x) != PyUnicode_GET_LENGTH(y)) {
12993            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
12994                            "arguments must have equal length");
12995            goto err;
12996        }
12997        /* create entries for translating chars in x to those in y */
12998        x_kind = PyUnicode_KIND(x);
12999        y_kind = PyUnicode_KIND(y);
13000        x_data = PyUnicode_DATA(x);
13001        y_data = PyUnicode_DATA(y);
13002        for (i = 0; i < PyUnicode_GET_LENGTH(x); i++) {
13003            key = PyLong_FromLong(PyUnicode_READ(x_kind, x_data, i));
13004            if (!key)
13005                goto err;
13006            value = PyLong_FromLong(PyUnicode_READ(y_kind, y_data, i));
13007            if (!value) {
13008                Py_DECREF(key);
13009                goto err;
13010            }
13011            res = PyDict_SetItem(new, key, value);
13012            Py_DECREF(key);
13013            Py_DECREF(value);
13014            if (res < 0)
13015                goto err;
13016        }
13017        /* create entries for deleting chars in z */
13018        if (z != NULL) {
13019            z_kind = PyUnicode_KIND(z);
13020            z_data = PyUnicode_DATA(z);
13021            for (i = 0; i < PyUnicode_GET_LENGTH(z); i++) {
13022                key = PyLong_FromLong(PyUnicode_READ(z_kind, z_data, i));
13023                if (!key)
13024                    goto err;
13025                res = PyDict_SetItem(new, key, Py_None);
13026                Py_DECREF(key);
13027                if (res < 0)
13028                    goto err;
13029            }
13030        }
13031    } else {
13032        int kind;
13033        void *data;
13034
13035        /* x must be a dict */
13036        if (!PyDict_CheckExact(x)) {
13037            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
13038                            "to maketrans it must be a dict");
13039            goto err;
13040        }
13041        /* copy entries into the new dict, converting string keys to int keys */
13042        while (PyDict_Next(x, &i, &key, &value)) {
13043            if (PyUnicode_Check(key)) {
13044                /* convert string keys to integer keys */
13045                PyObject *newkey;
13046                if (PyUnicode_GET_LENGTH(key) != 1) {
13047                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
13048                                    "table must be of length 1");
13049                    goto err;
13050                }
13051                kind = PyUnicode_KIND(key);
13052                data = PyUnicode_DATA(key);
13053                newkey = PyLong_FromLong(PyUnicode_READ(kind, data, 0));
13054                if (!newkey)
13055                    goto err;
13056                res = PyDict_SetItem(new, newkey, value);
13057                Py_DECREF(newkey);
13058                if (res < 0)
13059                    goto err;
13060            } else if (PyLong_Check(key)) {
13061                /* just keep integer keys */
13062                if (PyDict_SetItem(new, key, value) < 0)
13063                    goto err;
13064            } else {
13065                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
13066                                "be strings or integers");
13067                goto err;
13068            }
13069        }
13070    }
13071    return new;
13072  err:
13073    Py_DECREF(new);
13074    return NULL;
13075}
13076
13077PyDoc_STRVAR(translate__doc__,
13078             "S.translate(table) -> str\n\
13079\n\
13080Return a copy of the string S, where all characters have been mapped\n\
13081through the given translation table, which must be a mapping of\n\
13082Unicode ordinals to Unicode ordinals, strings, or None.\n\
13083Unmapped characters are left untouched. Characters mapped to None\n\
13084are deleted.");
13085
13086static PyObject*
13087unicode_translate(PyObject *self, PyObject *table)
13088{
13089    return _PyUnicode_TranslateCharmap(self, table, "ignore");
13090}
13091
13092PyDoc_STRVAR(upper__doc__,
13093             "S.upper() -> str\n\
13094\n\
13095Return a copy of S converted to uppercase.");
13096
13097static PyObject*
13098unicode_upper(PyObject *self)
13099{
13100    if (PyUnicode_READY(self) == -1)
13101        return NULL;
13102    if (PyUnicode_IS_ASCII(self))
13103        return ascii_upper_or_lower(self, 0);
13104    return case_operation(self, do_upper);
13105}
13106
13107PyDoc_STRVAR(zfill__doc__,
13108             "S.zfill(width) -> str\n\
13109\n\
13110Pad a numeric string S with zeros on the left, to fill a field\n\
13111of the specified width. The string S is never truncated.");
13112
13113static PyObject *
13114unicode_zfill(PyObject *self, PyObject *args)
13115{
13116    Py_ssize_t fill;
13117    PyObject *u;
13118    Py_ssize_t width;
13119    int kind;
13120    void *data;
13121    Py_UCS4 chr;
13122
13123    if (!PyArg_ParseTuple(args, "n:zfill", &width))
13124        return NULL;
13125
13126    if (PyUnicode_READY(self) == -1)
13127        return NULL;
13128
13129    if (PyUnicode_GET_LENGTH(self) >= width)
13130        return unicode_result_unchanged(self);
13131
13132    fill = width - PyUnicode_GET_LENGTH(self);
13133
13134    u = pad(self, fill, 0, '0');
13135
13136    if (u == NULL)
13137        return NULL;
13138
13139    kind = PyUnicode_KIND(u);
13140    data = PyUnicode_DATA(u);
13141    chr = PyUnicode_READ(kind, data, fill);
13142
13143    if (chr == '+' || chr == '-') {
13144        /* move sign to beginning of string */
13145        PyUnicode_WRITE(kind, data, 0, chr);
13146        PyUnicode_WRITE(kind, data, fill, '0');
13147    }
13148
13149    assert(_PyUnicode_CheckConsistency(u, 1));
13150    return u;
13151}
13152
#if 0
/* Compiled out.  Would forward self to
   PyUnicode_TransformDecimalAndSpaceToASCII() and return its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    PyObject *converted;

    converted = PyUnicode_TransformDecimalAndSpaceToASCII(self);
    return converted;
}
#endif
13160
13161PyDoc_STRVAR(startswith__doc__,
13162             "S.startswith(prefix[, start[, end]]) -> bool\n\
13163\n\
13164Return True if S starts with the specified prefix, False otherwise.\n\
13165With optional start, test S beginning at that position.\n\
13166With optional end, stop comparing S at that position.\n\
13167prefix can also be a tuple of strings to try.");
13168
13169static PyObject *
13170unicode_startswith(PyObject *self,
13171                   PyObject *args)
13172{
13173    PyObject *subobj;
13174    PyObject *substring;
13175    Py_ssize_t start = 0;
13176    Py_ssize_t end = PY_SSIZE_T_MAX;
13177    int result;
13178
13179    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
13180        return NULL;
13181    if (PyTuple_Check(subobj)) {
13182        Py_ssize_t i;
13183        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13184            substring = PyUnicode_FromObject(PyTuple_GET_ITEM(subobj, i));
13185            if (substring == NULL)
13186                return NULL;
13187            result = tailmatch(self, substring, start, end, -1);
13188            Py_DECREF(substring);
13189            if (result == -1)
13190                return NULL;
13191            if (result) {
13192                Py_RETURN_TRUE;
13193            }
13194        }
13195        /* nothing matched */
13196        Py_RETURN_FALSE;
13197    }
13198    substring = PyUnicode_FromObject(subobj);
13199    if (substring == NULL) {
13200        if (PyErr_ExceptionMatches(PyExc_TypeError))
13201            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
13202                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13203        return NULL;
13204    }
13205    result = tailmatch(self, substring, start, end, -1);
13206    Py_DECREF(substring);
13207    if (result == -1)
13208        return NULL;
13209    return PyBool_FromLong(result);
13210}
13211
13212
13213PyDoc_STRVAR(endswith__doc__,
13214             "S.endswith(suffix[, start[, end]]) -> bool\n\
13215\n\
13216Return True if S ends with the specified suffix, False otherwise.\n\
13217With optional start, test S beginning at that position.\n\
13218With optional end, stop comparing S at that position.\n\
13219suffix can also be a tuple of strings to try.");
13220
13221static PyObject *
13222unicode_endswith(PyObject *self,
13223                 PyObject *args)
13224{
13225    PyObject *subobj;
13226    PyObject *substring;
13227    Py_ssize_t start = 0;
13228    Py_ssize_t end = PY_SSIZE_T_MAX;
13229    int result;
13230
13231    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
13232        return NULL;
13233    if (PyTuple_Check(subobj)) {
13234        Py_ssize_t i;
13235        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
13236            substring = PyUnicode_FromObject(
13237                PyTuple_GET_ITEM(subobj, i));
13238            if (substring == NULL)
13239                return NULL;
13240            result = tailmatch(self, substring, start, end, +1);
13241            Py_DECREF(substring);
13242            if (result == -1)
13243                return NULL;
13244            if (result) {
13245                Py_RETURN_TRUE;
13246            }
13247        }
13248        Py_RETURN_FALSE;
13249    }
13250    substring = PyUnicode_FromObject(subobj);
13251    if (substring == NULL) {
13252        if (PyErr_ExceptionMatches(PyExc_TypeError))
13253            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
13254                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
13255        return NULL;
13256    }
13257    result = tailmatch(self, substring, start, end, +1);
13258    Py_DECREF(substring);
13259    if (result == -1)
13260        return NULL;
13261    return PyBool_FromLong(result);
13262}
13263
13264Py_LOCAL_INLINE(void)
13265_PyUnicodeWriter_Update(_PyUnicodeWriter *writer)
13266{
13267    if (!writer->readonly)
13268        writer->size = PyUnicode_GET_LENGTH(writer->buffer);
13269    else {
13270        /* Copy-on-write mode: set buffer size to 0 so
13271         * _PyUnicodeWriter_Prepare() will copy (and enlarge) the buffer on
13272         * next write. */
13273        writer->size = 0;
13274    }
13275    writer->maxchar = PyUnicode_MAX_CHAR_VALUE(writer->buffer);
13276    writer->data = PyUnicode_DATA(writer->buffer);
13277    writer->kind = PyUnicode_KIND(writer->buffer);
13278}
13279
13280void
13281_PyUnicodeWriter_Init(_PyUnicodeWriter *writer)
13282{
13283    memset(writer, 0, sizeof(*writer));
13284#ifdef Py_DEBUG
13285    writer->kind = 5;    /* invalid kind */
13286#endif
13287    writer->min_char = 127;
13288}
13289
13290int
13291_PyUnicodeWriter_PrepareInternal(_PyUnicodeWriter *writer,
13292                                 Py_ssize_t length, Py_UCS4 maxchar)
13293{
13294#ifdef MS_WINDOWS
13295   /* On Windows, overallocate by 50% is the best factor */
13296#  define OVERALLOCATE_FACTOR 2
13297#else
13298   /* On Linux, overallocate by 25% is the best factor */
13299#  define OVERALLOCATE_FACTOR 4
13300#endif
13301    Py_ssize_t newlen;
13302    PyObject *newbuffer;
13303
13304    assert(length > 0);
13305
13306    if (length > PY_SSIZE_T_MAX - writer->pos) {
13307        PyErr_NoMemory();
13308        return -1;
13309    }
13310    newlen = writer->pos + length;
13311
13312    maxchar = Py_MAX(maxchar, writer->min_char);
13313
13314    if (writer->buffer == NULL) {
13315        assert(!writer->readonly);
13316        if (writer->overallocate
13317            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13318            /* overallocate to limit the number of realloc() */
13319            newlen += newlen / OVERALLOCATE_FACTOR;
13320        }
13321        if (newlen < writer->min_length)
13322            newlen = writer->min_length;
13323
13324        writer->buffer = PyUnicode_New(newlen, maxchar);
13325        if (writer->buffer == NULL)
13326            return -1;
13327    }
13328    else if (newlen > writer->size) {
13329        if (writer->overallocate
13330            && newlen <= (PY_SSIZE_T_MAX - newlen / OVERALLOCATE_FACTOR)) {
13331            /* overallocate to limit the number of realloc() */
13332            newlen += newlen / OVERALLOCATE_FACTOR;
13333        }
13334        if (newlen < writer->min_length)
13335            newlen = writer->min_length;
13336
13337        if (maxchar > writer->maxchar || writer->readonly) {
13338            /* resize + widen */
13339            newbuffer = PyUnicode_New(newlen, maxchar);
13340            if (newbuffer == NULL)
13341                return -1;
13342            _PyUnicode_FastCopyCharacters(newbuffer, 0,
13343                                          writer->buffer, 0, writer->pos);
13344            Py_DECREF(writer->buffer);
13345            writer->readonly = 0;
13346        }
13347        else {
13348            newbuffer = resize_compact(writer->buffer, newlen);
13349            if (newbuffer == NULL)
13350                return -1;
13351        }
13352        writer->buffer = newbuffer;
13353    }
13354    else if (maxchar > writer->maxchar) {
13355        assert(!writer->readonly);
13356        newbuffer = PyUnicode_New(writer->size, maxchar);
13357        if (newbuffer == NULL)
13358            return -1;
13359        _PyUnicode_FastCopyCharacters(newbuffer, 0,
13360                                      writer->buffer, 0, writer->pos);
13361        Py_DECREF(writer->buffer);
13362        writer->buffer = newbuffer;
13363    }
13364    _PyUnicodeWriter_Update(writer);
13365    return 0;
13366
13367#undef OVERALLOCATE_FACTOR
13368}
13369
13370Py_LOCAL_INLINE(int)
13371_PyUnicodeWriter_WriteCharInline(_PyUnicodeWriter *writer, Py_UCS4 ch)
13372{
13373    if (_PyUnicodeWriter_Prepare(writer, 1, ch) < 0)
13374        return -1;
13375    PyUnicode_WRITE(writer->kind, writer->data, writer->pos, ch);
13376    writer->pos++;
13377    return 0;
13378}
13379
13380int
13381_PyUnicodeWriter_WriteChar(_PyUnicodeWriter *writer, Py_UCS4 ch)
13382{
13383    return _PyUnicodeWriter_WriteCharInline(writer, ch);
13384}
13385
13386int
13387_PyUnicodeWriter_WriteStr(_PyUnicodeWriter *writer, PyObject *str)
13388{
13389    Py_UCS4 maxchar;
13390    Py_ssize_t len;
13391
13392    if (PyUnicode_READY(str) == -1)
13393        return -1;
13394    len = PyUnicode_GET_LENGTH(str);
13395    if (len == 0)
13396        return 0;
13397    maxchar = PyUnicode_MAX_CHAR_VALUE(str);
13398    if (maxchar > writer->maxchar || len > writer->size - writer->pos) {
13399        if (writer->buffer == NULL && !writer->overallocate) {
13400            writer->readonly = 1;
13401            Py_INCREF(str);
13402            writer->buffer = str;
13403            _PyUnicodeWriter_Update(writer);
13404            writer->pos += len;
13405            return 0;
13406        }
13407        if (_PyUnicodeWriter_PrepareInternal(writer, len, maxchar) == -1)
13408            return -1;
13409    }
13410    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13411                                  str, 0, len);
13412    writer->pos += len;
13413    return 0;
13414}
13415
13416int
13417_PyUnicodeWriter_WriteSubstring(_PyUnicodeWriter *writer, PyObject *str,
13418                                Py_ssize_t start, Py_ssize_t end)
13419{
13420    Py_UCS4 maxchar;
13421    Py_ssize_t len;
13422
13423    if (PyUnicode_READY(str) == -1)
13424        return -1;
13425
13426    assert(0 <= start);
13427    assert(end <= PyUnicode_GET_LENGTH(str));
13428    assert(start <= end);
13429
13430    if (end == 0)
13431        return 0;
13432
13433    if (start == 0 && end == PyUnicode_GET_LENGTH(str))
13434        return _PyUnicodeWriter_WriteStr(writer, str);
13435
13436    if (PyUnicode_MAX_CHAR_VALUE(str) > writer->maxchar)
13437        maxchar = _PyUnicode_FindMaxChar(str, start, end);
13438    else
13439        maxchar = writer->maxchar;
13440    len = end - start;
13441
13442    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) < 0)
13443        return -1;
13444
13445    _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
13446                                  str, start, len);
13447    writer->pos += len;
13448    return 0;
13449}
13450
13451int
13452_PyUnicodeWriter_WriteASCIIString(_PyUnicodeWriter *writer,
13453                                  const char *ascii, Py_ssize_t len)
13454{
13455    if (len == -1)
13456        len = strlen(ascii);
13457
13458    assert(ucs1lib_find_max_char((Py_UCS1*)ascii, (Py_UCS1*)ascii + len) < 128);
13459
13460    if (writer->buffer == NULL && !writer->overallocate) {
13461        PyObject *str;
13462
13463        str = _PyUnicode_FromASCII(ascii, len);
13464        if (str == NULL)
13465            return -1;
13466
13467        writer->readonly = 1;
13468        writer->buffer = str;
13469        _PyUnicodeWriter_Update(writer);
13470        writer->pos += len;
13471        return 0;
13472    }
13473
13474    if (_PyUnicodeWriter_Prepare(writer, len, 127) == -1)
13475        return -1;
13476
13477    switch (writer->kind)
13478    {
13479    case PyUnicode_1BYTE_KIND:
13480    {
13481        const Py_UCS1 *str = (const Py_UCS1 *)ascii;
13482        Py_UCS1 *data = writer->data;
13483
13484        Py_MEMCPY(data + writer->pos, str, len);
13485        break;
13486    }
13487    case PyUnicode_2BYTE_KIND:
13488    {
13489        _PyUnicode_CONVERT_BYTES(
13490            Py_UCS1, Py_UCS2,
13491            ascii, ascii + len,
13492            (Py_UCS2 *)writer->data + writer->pos);
13493        break;
13494    }
13495    case PyUnicode_4BYTE_KIND:
13496    {
13497        _PyUnicode_CONVERT_BYTES(
13498            Py_UCS1, Py_UCS4,
13499            ascii, ascii + len,
13500            (Py_UCS4 *)writer->data + writer->pos);
13501        break;
13502    }
13503    default:
13504        assert(0);
13505    }
13506
13507    writer->pos += len;
13508    return 0;
13509}
13510
13511int
13512_PyUnicodeWriter_WriteLatin1String(_PyUnicodeWriter *writer,
13513                                   const char *str, Py_ssize_t len)
13514{
13515    Py_UCS4 maxchar;
13516
13517    maxchar = ucs1lib_find_max_char((Py_UCS1*)str, (Py_UCS1*)str + len);
13518    if (_PyUnicodeWriter_Prepare(writer, len, maxchar) == -1)
13519        return -1;
13520    unicode_write_cstr(writer->buffer, writer->pos, str, len);
13521    writer->pos += len;
13522    return 0;
13523}
13524
13525PyObject *
13526_PyUnicodeWriter_Finish(_PyUnicodeWriter *writer)
13527{
13528    PyObject *str;
13529    if (writer->pos == 0) {
13530        Py_CLEAR(writer->buffer);
13531        _Py_RETURN_UNICODE_EMPTY();
13532    }
13533    if (writer->readonly) {
13534        str = writer->buffer;
13535        writer->buffer = NULL;
13536        assert(PyUnicode_GET_LENGTH(str) == writer->pos);
13537        return str;
13538    }
13539    if (PyUnicode_GET_LENGTH(writer->buffer) != writer->pos) {
13540        PyObject *newbuffer;
13541        newbuffer = resize_compact(writer->buffer, writer->pos);
13542        if (newbuffer == NULL) {
13543            Py_CLEAR(writer->buffer);
13544            return NULL;
13545        }
13546        writer->buffer = newbuffer;
13547    }
13548    str = writer->buffer;
13549    writer->buffer = NULL;
13550    assert(_PyUnicode_CheckConsistency(str, 1));
13551    return unicode_result_ready(str);
13552}
13553
13554void
13555_PyUnicodeWriter_Dealloc(_PyUnicodeWriter *writer)
13556{
13557    Py_CLEAR(writer->buffer);
13558}
13559
13560#include "stringlib/unicode_format.h"
13561
13562PyDoc_STRVAR(format__doc__,
13563             "S.format(*args, **kwargs) -> str\n\
13564\n\
13565Return a formatted version of S, using substitutions from args and kwargs.\n\
13566The substitutions are identified by braces ('{' and '}').");
13567
13568PyDoc_STRVAR(format_map__doc__,
13569             "S.format_map(mapping) -> str\n\
13570\n\
13571Return a formatted version of S, using substitutions from mapping.\n\
13572The substitutions are identified by braces ('{' and '}').");
13573
13574static PyObject *
13575unicode__format__(PyObject* self, PyObject* args)
13576{
13577    PyObject *format_spec;
13578    _PyUnicodeWriter writer;
13579    int ret;
13580
13581    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
13582        return NULL;
13583
13584    if (PyUnicode_READY(self) == -1)
13585        return NULL;
13586    _PyUnicodeWriter_Init(&writer);
13587    ret = _PyUnicode_FormatAdvancedWriter(&writer,
13588                                          self, format_spec, 0,
13589                                          PyUnicode_GET_LENGTH(format_spec));
13590    if (ret == -1) {
13591        _PyUnicodeWriter_Dealloc(&writer);
13592        return NULL;
13593    }
13594    return _PyUnicodeWriter_Finish(&writer);
13595}
13596
13597PyDoc_STRVAR(p_format__doc__,
13598             "S.__format__(format_spec) -> str\n\
13599\n\
13600Return a formatted version of S as described by format_spec.");
13601
13602static PyObject *
13603unicode__sizeof__(PyObject *v)
13604{
13605    Py_ssize_t size;
13606
13607    /* If it's a compact object, account for base structure +
13608       character data. */
13609    if (PyUnicode_IS_COMPACT_ASCII(v))
13610        size = sizeof(PyASCIIObject) + PyUnicode_GET_LENGTH(v) + 1;
13611    else if (PyUnicode_IS_COMPACT(v))
13612        size = sizeof(PyCompactUnicodeObject) +
13613            (PyUnicode_GET_LENGTH(v) + 1) * PyUnicode_KIND(v);
13614    else {
13615        /* If it is a two-block object, account for base object, and
13616           for character block if present. */
13617        size = sizeof(PyUnicodeObject);
13618        if (_PyUnicode_DATA_ANY(v))
13619            size += (PyUnicode_GET_LENGTH(v) + 1) *
13620                PyUnicode_KIND(v);
13621    }
13622    /* If the wstr pointer is present, account for it unless it is shared
13623       with the data pointer. Check if the data is not shared. */
13624    if (_PyUnicode_HAS_WSTR_MEMORY(v))
13625        size += (PyUnicode_WSTR_LENGTH(v) + 1) * sizeof(wchar_t);
13626    if (_PyUnicode_HAS_UTF8_MEMORY(v))
13627        size += PyUnicode_UTF8_LENGTH(v) + 1;
13628
13629    return PyLong_FromSsize_t(size);
13630}
13631
13632PyDoc_STRVAR(sizeof__doc__,
13633             "S.__sizeof__() -> size of S in memory, in bytes");
13634
13635static PyObject *
13636unicode_getnewargs(PyObject *v)
13637{
13638    PyObject *copy = _PyUnicode_Copy(v);
13639    if (!copy)
13640        return NULL;
13641    return Py_BuildValue("(N)", copy);
13642}
13643
13644static PyMethodDef unicode_methods[] = {
13645    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
13646    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
13647    {"split", (PyCFunction) unicode_split, METH_VARARGS | METH_KEYWORDS, split__doc__},
13648    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS | METH_KEYWORDS, rsplit__doc__},
13649    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
13650    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
13651    {"casefold", (PyCFunction) unicode_casefold, METH_NOARGS, casefold__doc__},
13652    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
13653    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
13654    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
13655    {"expandtabs", (PyCFunction) unicode_expandtabs,
13656     METH_VARARGS | METH_KEYWORDS, expandtabs__doc__},
13657    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
13658    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
13659    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
13660    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
13661    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
13662    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
13663    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
13664    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
13665    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
13666    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
13667    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
13668    {"splitlines", (PyCFunction) unicode_splitlines,
13669     METH_VARARGS | METH_KEYWORDS, splitlines__doc__},
13670    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
13671    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
13672    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
13673    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
13674    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
13675    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
13676    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
13677    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
13678    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
13679    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
13680    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
13681    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
13682    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
13683    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
13684    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
13685    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
13686    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
13687    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
13688    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
13689    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
13690    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
13691    UNICODE_MAKETRANS_METHODDEF
13692    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
13693#if 0
13694    /* These methods are just used for debugging the implementation. */
13695    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
13696#endif
13697
13698    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
13699    {NULL, NULL}
13700};
13701
13702static PyObject *
13703unicode_mod(PyObject *v, PyObject *w)
13704{
13705    if (!PyUnicode_Check(v))
13706        Py_RETURN_NOTIMPLEMENTED;
13707    return PyUnicode_Format(v, w);
13708}
13709
13710static PyNumberMethods unicode_as_number = {
13711    0,              /*nb_add*/
13712    0,              /*nb_subtract*/
13713    0,              /*nb_multiply*/
13714    unicode_mod,            /*nb_remainder*/
13715};
13716
13717static PySequenceMethods unicode_as_sequence = {
13718    (lenfunc) unicode_length,       /* sq_length */
13719    PyUnicode_Concat,           /* sq_concat */
13720    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
13721    (ssizeargfunc) unicode_getitem,     /* sq_item */
13722    0,                  /* sq_slice */
13723    0,                  /* sq_ass_item */
13724    0,                  /* sq_ass_slice */
13725    PyUnicode_Contains,         /* sq_contains */
13726};
13727
13728static PyObject*
13729unicode_subscript(PyObject* self, PyObject* item)
13730{
13731    if (PyUnicode_READY(self) == -1)
13732        return NULL;
13733
13734    if (PyIndex_Check(item)) {
13735        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
13736        if (i == -1 && PyErr_Occurred())
13737            return NULL;
13738        if (i < 0)
13739            i += PyUnicode_GET_LENGTH(self);
13740        return unicode_getitem(self, i);
13741    } else if (PySlice_Check(item)) {
13742        Py_ssize_t start, stop, step, slicelength, cur, i;
13743        PyObject *result;
13744        void *src_data, *dest_data;
13745        int src_kind, dest_kind;
13746        Py_UCS4 ch, max_char, kind_limit;
13747
13748        if (PySlice_GetIndicesEx(item, PyUnicode_GET_LENGTH(self),
13749                                 &start, &stop, &step, &slicelength) < 0) {
13750            return NULL;
13751        }
13752
13753        if (slicelength <= 0) {
13754            _Py_RETURN_UNICODE_EMPTY();
13755        } else if (start == 0 && step == 1 &&
13756                   slicelength == PyUnicode_GET_LENGTH(self)) {
13757            return unicode_result_unchanged(self);
13758        } else if (step == 1) {
13759            return PyUnicode_Substring(self,
13760                                       start, start + slicelength);
13761        }
13762        /* General case */
13763        src_kind = PyUnicode_KIND(self);
13764        src_data = PyUnicode_DATA(self);
13765        if (!PyUnicode_IS_ASCII(self)) {
13766            kind_limit = kind_maxchar_limit(src_kind);
13767            max_char = 0;
13768            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13769                ch = PyUnicode_READ(src_kind, src_data, cur);
13770                if (ch > max_char) {
13771                    max_char = ch;
13772                    if (max_char >= kind_limit)
13773                        break;
13774                }
13775            }
13776        }
13777        else
13778            max_char = 127;
13779        result = PyUnicode_New(slicelength, max_char);
13780        if (result == NULL)
13781            return NULL;
13782        dest_kind = PyUnicode_KIND(result);
13783        dest_data = PyUnicode_DATA(result);
13784
13785        for (cur = start, i = 0; i < slicelength; cur += step, i++) {
13786            Py_UCS4 ch = PyUnicode_READ(src_kind, src_data, cur);
13787            PyUnicode_WRITE(dest_kind, dest_data, i, ch);
13788        }
13789        assert(_PyUnicode_CheckConsistency(result, 1));
13790        return result;
13791    } else {
13792        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
13793        return NULL;
13794    }
13795}
13796
13797static PyMappingMethods unicode_as_mapping = {
13798    (lenfunc)unicode_length,        /* mp_length */
13799    (binaryfunc)unicode_subscript,  /* mp_subscript */
13800    (objobjargproc)0,           /* mp_ass_subscript */
13801};
13802
13803
13804/* Helpers for PyUnicode_Format() */
13805
13806struct unicode_formatter_t {
13807    PyObject *args;
13808    int args_owned;
13809    Py_ssize_t arglen, argidx;
13810    PyObject *dict;
13811
13812    enum PyUnicode_Kind fmtkind;
13813    Py_ssize_t fmtcnt, fmtpos;
13814    void *fmtdata;
13815    PyObject *fmtstr;
13816
13817    _PyUnicodeWriter writer;
13818};
13819
13820struct unicode_format_arg_t {
13821    Py_UCS4 ch;
13822    int flags;
13823    Py_ssize_t width;
13824    int prec;
13825    int sign;
13826};
13827
13828static PyObject *
13829unicode_format_getnextarg(struct unicode_formatter_t *ctx)
13830{
13831    Py_ssize_t argidx = ctx->argidx;
13832
13833    if (argidx < ctx->arglen) {
13834        ctx->argidx++;
13835        if (ctx->arglen < 0)
13836            return ctx->args;
13837        else
13838            return PyTuple_GetItem(ctx->args, argidx);
13839    }
13840    PyErr_SetString(PyExc_TypeError,
13841                    "not enough arguments for format string");
13842    return NULL;
13843}
13844
13845/* Returns a new reference to a PyUnicode object, or NULL on failure. */
13846
13847/* Format a float into the writer if the writer is not NULL, or into *p_output
13848   otherwise.
13849
13850   Return 0 on success, raise an exception and return -1 on error. */
13851static int
13852formatfloat(PyObject *v, struct unicode_format_arg_t *arg,
13853            PyObject **p_output,
13854            _PyUnicodeWriter *writer)
13855{
13856    char *p;
13857    double x;
13858    Py_ssize_t len;
13859    int prec;
13860    int dtoa_flags;
13861
13862    x = PyFloat_AsDouble(v);
13863    if (x == -1.0 && PyErr_Occurred())
13864        return -1;
13865
13866    prec = arg->prec;
13867    if (prec < 0)
13868        prec = 6;
13869
13870    if (arg->flags & F_ALT)
13871        dtoa_flags = Py_DTSF_ALT;
13872    else
13873        dtoa_flags = 0;
13874    p = PyOS_double_to_string(x, arg->ch, prec, dtoa_flags, NULL);
13875    if (p == NULL)
13876        return -1;
13877    len = strlen(p);
13878    if (writer) {
13879        if (_PyUnicodeWriter_WriteASCIIString(writer, p, len) < 0) {
13880            PyMem_Free(p);
13881            return -1;
13882        }
13883    }
13884    else
13885        *p_output = _PyUnicode_FromASCII(p, len);
13886    PyMem_Free(p);
13887    return 0;
13888}
13889
13890/* formatlong() emulates the format codes d, u, o, x and X, and
13891 * the F_ALT flag, for Python's long (unbounded) ints.  It's not used for
13892 * Python's regular ints.
13893 * Return value:  a new PyUnicodeObject*, or NULL if error.
13894 *     The output string is of the form
13895 *         "-"? ("0x" | "0X")? digit+
13896 *     "0x"/"0X" are present only for x and X conversions, with F_ALT
13897 *         set in flags.  The case of hex digits will be correct,
13898 *     There will be at least prec digits, zero-filled on the left if
13899 *         necessary to get that many.
13900 * val          object to be converted
13901 * flags        bitmask of format flags; only F_ALT is looked at
13902 * prec         minimum number of digits; 0-fill on left if needed
13903 * type         a character in [duoxX]; u acts the same as d
13904 *
13905 * CAUTION:  o, x and X conversions on regular ints can never
13906 * produce a '-' sign, but can for Python's unbounded ints.
13907 */
13908static PyObject*
13909formatlong(PyObject *val, struct unicode_format_arg_t *arg)
13910{
13911    PyObject *result = NULL;
13912    char *buf;
13913    Py_ssize_t i;
13914    int sign;           /* 1 if '-', else 0 */
13915    int len;            /* number of characters */
13916    Py_ssize_t llen;
13917    int numdigits;      /* len == numnondigits + numdigits */
13918    int numnondigits = 0;
13919    int prec = arg->prec;
13920    int type = arg->ch;
13921
13922    /* Avoid exceeding SSIZE_T_MAX */
13923    if (prec > INT_MAX-3) {
13924        PyErr_SetString(PyExc_OverflowError,
13925                        "precision too large");
13926        return NULL;
13927    }
13928
13929    assert(PyLong_Check(val));
13930
13931    switch (type) {
13932    default:
13933        assert(!"'type' not in [diuoxX]");
13934    case 'd':
13935    case 'i':
13936    case 'u':
13937        /* int and int subclasses should print numerically when a numeric */
13938        /* format code is used (see issue18780) */
13939        result = PyNumber_ToBase(val, 10);
13940        break;
13941    case 'o':
13942        numnondigits = 2;
13943        result = PyNumber_ToBase(val, 8);
13944        break;
13945    case 'x':
13946    case 'X':
13947        numnondigits = 2;
13948        result = PyNumber_ToBase(val, 16);
13949        break;
13950    }
13951    if (!result)
13952        return NULL;
13953
13954    assert(unicode_modifiable(result));
13955    assert(PyUnicode_IS_READY(result));
13956    assert(PyUnicode_IS_ASCII(result));
13957
13958    /* To modify the string in-place, there can only be one reference. */
13959    if (Py_REFCNT(result) != 1) {
13960        Py_DECREF(result);
13961        PyErr_BadInternalCall();
13962        return NULL;
13963    }
13964    buf = PyUnicode_DATA(result);
13965    llen = PyUnicode_GET_LENGTH(result);
13966    if (llen > INT_MAX) {
13967        Py_DECREF(result);
13968        PyErr_SetString(PyExc_ValueError,
13969                        "string too large in _PyBytes_FormatLong");
13970        return NULL;
13971    }
13972    len = (int)llen;
13973    sign = buf[0] == '-';
13974    numnondigits += sign;
13975    numdigits = len - numnondigits;
13976    assert(numdigits > 0);
13977
13978    /* Get rid of base marker unless F_ALT */
13979    if (((arg->flags & F_ALT) == 0 &&
13980        (type == 'o' || type == 'x' || type == 'X'))) {
13981        assert(buf[sign] == '0');
13982        assert(buf[sign+1] == 'x' || buf[sign+1] == 'X' ||
13983               buf[sign+1] == 'o');
13984        numnondigits -= 2;
13985        buf += 2;
13986        len -= 2;
13987        if (sign)
13988            buf[0] = '-';
13989        assert(len == numnondigits + numdigits);
13990        assert(numdigits > 0);
13991    }
13992
13993    /* Fill with leading zeroes to meet minimum width. */
13994    if (prec > numdigits) {
13995        PyObject *r1 = PyBytes_FromStringAndSize(NULL,
13996                                numnondigits + prec);
13997        char *b1;
13998        if (!r1) {
13999            Py_DECREF(result);
14000            return NULL;
14001        }
14002        b1 = PyBytes_AS_STRING(r1);
14003        for (i = 0; i < numnondigits; ++i)
14004            *b1++ = *buf++;
14005        for (i = 0; i < prec - numdigits; i++)
14006            *b1++ = '0';
14007        for (i = 0; i < numdigits; i++)
14008            *b1++ = *buf++;
14009        *b1 = '\0';
14010        Py_DECREF(result);
14011        result = r1;
14012        buf = PyBytes_AS_STRING(result);
14013        len = numnondigits + prec;
14014    }
14015
14016    /* Fix up case for hex conversions. */
14017    if (type == 'X') {
14018        /* Need to convert all lower case letters to upper case.
14019           and need to convert 0x to 0X (and -0x to -0X). */
14020        for (i = 0; i < len; i++)
14021            if (buf[i] >= 'a' && buf[i] <= 'x')
14022                buf[i] -= 'a'-'A';
14023    }
14024    if (!PyUnicode_Check(result)
14025        || buf != PyUnicode_DATA(result)) {
14026        PyObject *unicode;
14027        unicode = _PyUnicode_FromASCII(buf, len);
14028        Py_DECREF(result);
14029        result = unicode;
14030    }
14031    else if (len != PyUnicode_GET_LENGTH(result)) {
14032        if (PyUnicode_Resize(&result, len) < 0)
14033            Py_CLEAR(result);
14034    }
14035    return result;
14036}
14037
14038/* Format an integer or a float as an integer.
14039 * Return 1 if the number has been formatted into the writer,
14040 *        0 if the number has been formatted into *p_output
14041 *       -1 and raise an exception on error */
14042static int
14043mainformatlong(PyObject *v,
14044               struct unicode_format_arg_t *arg,
14045               PyObject **p_output,
14046               _PyUnicodeWriter *writer)
14047{
14048    PyObject *iobj, *res;
14049    char type = (char)arg->ch;
14050
14051    if (!PyNumber_Check(v))
14052        goto wrongtype;
14053
14054    /* make sure number is a type of integer */
14055    /* if not, issue deprecation warning for now */
14056    if (!PyLong_Check(v)) {
14057        if (type == 'o' || type == 'x' || type == 'X') {
14058            iobj = PyNumber_Index(v);
14059            if (iobj == NULL) {
14060                PyErr_Clear();
14061                if (PyErr_WarnEx(PyExc_DeprecationWarning,
14062                                 "automatic int conversions have been deprecated",
14063                                 1)) {
14064                    return -1;
14065                }
14066                iobj = PyNumber_Long(v);
14067                if (iobj == NULL ) {
14068                    if (PyErr_ExceptionMatches(PyExc_TypeError))
14069                        goto wrongtype;
14070                    return -1;
14071                }
14072            }
14073        }
14074        else {
14075            iobj = PyNumber_Long(v);
14076            if (iobj == NULL ) {
14077                if (PyErr_ExceptionMatches(PyExc_TypeError))
14078                    goto wrongtype;
14079                return -1;
14080            }
14081        }
14082        assert(PyLong_Check(iobj));
14083    }
14084    else {
14085        iobj = v;
14086        Py_INCREF(iobj);
14087    }
14088
14089    if (PyLong_CheckExact(v)
14090        && arg->width == -1 && arg->prec == -1
14091        && !(arg->flags & (F_SIGN | F_BLANK))
14092        && type != 'X')
14093    {
14094        /* Fast path */
14095        int alternate = arg->flags & F_ALT;
14096        int base;
14097
14098        switch(type)
14099        {
14100            default:
14101                assert(0 && "'type' not in [diuoxX]");
14102            case 'd':
14103            case 'i':
14104            case 'u':
14105                base = 10;
14106                break;
14107            case 'o':
14108                base = 8;
14109                break;
14110            case 'x':
14111            case 'X':
14112                base = 16;
14113                break;
14114        }
14115
14116        if (_PyLong_FormatWriter(writer, v, base, alternate) == -1) {
14117            Py_DECREF(iobj);
14118            return -1;
14119        }
14120        Py_DECREF(iobj);
14121        return 1;
14122    }
14123
14124    res = formatlong(iobj, arg);
14125    Py_DECREF(iobj);
14126    if (res == NULL)
14127        return -1;
14128    *p_output = res;
14129    return 0;
14130
14131wrongtype:
14132    PyErr_Format(PyExc_TypeError,
14133            "%%%c format: a number is required, "
14134            "not %.200s",
14135            type, Py_TYPE(v)->tp_name);
14136    return -1;
14137}
14138
14139static Py_UCS4
14140formatchar(PyObject *v)
14141{
14142    /* presume that the buffer is at least 3 characters long */
14143    if (PyUnicode_Check(v)) {
14144        if (PyUnicode_GET_LENGTH(v) == 1) {
14145            return PyUnicode_READ_CHAR(v, 0);
14146        }
14147        goto onError;
14148    }
14149    else {
14150        PyObject *iobj;
14151        long x;
14152        /* make sure number is a type of integer */
14153        /* if not, issue deprecation warning for now */
14154        if (!PyLong_Check(v)) {
14155            iobj = PyNumber_Index(v);
14156            if (iobj == NULL) {
14157                PyErr_Clear();
14158                if (PyErr_WarnEx(PyExc_DeprecationWarning,
14159                                 "automatic int conversions have been deprecated",
14160                                 1)) {
14161                    return -1;
14162                }
14163                iobj = PyNumber_Long(v);
14164                if (iobj == NULL ) {
14165                    if (PyErr_ExceptionMatches(PyExc_TypeError))
14166                        goto onError;
14167                    return -1;
14168                }
14169            }
14170            v = iobj;
14171            Py_DECREF(iobj);
14172        }
14173        /* Integer input truncated to a character */
14174        x = PyLong_AsLong(v);
14175        if (x == -1 && PyErr_Occurred())
14176            goto onError;
14177
14178        if (x < 0 || x > MAX_UNICODE) {
14179            PyErr_SetString(PyExc_OverflowError,
14180                            "%c arg not in range(0x110000)");
14181            return (Py_UCS4) -1;
14182        }
14183
14184        return (Py_UCS4) x;
14185    }
14186
14187  onError:
14188    PyErr_SetString(PyExc_TypeError,
14189                    "%c requires int or char");
14190    return (Py_UCS4) -1;
14191}
14192
14193/* Parse options of an argument: flags, width, precision.
14194   Handle also "%(name)" syntax.
14195
14196   Return 0 if the argument has been formatted into arg->str.
14197   Return 1 if the argument has been written into ctx->writer,
14198   Raise an exception and return -1 on error. */
14199static int
14200unicode_format_arg_parse(struct unicode_formatter_t *ctx,
14201                         struct unicode_format_arg_t *arg)
14202{
14203#define FORMAT_READ(ctx) \
14204        PyUnicode_READ((ctx)->fmtkind, (ctx)->fmtdata, (ctx)->fmtpos)
14205
14206    PyObject *v;
14207
14208    if (arg->ch == '(') {
14209        /* Get argument value from a dictionary. Example: "%(name)s". */
14210        Py_ssize_t keystart;
14211        Py_ssize_t keylen;
14212        PyObject *key;
14213        int pcount = 1;
14214
14215        if (ctx->dict == NULL) {
14216            PyErr_SetString(PyExc_TypeError,
14217                            "format requires a mapping");
14218            return -1;
14219        }
14220        ++ctx->fmtpos;
14221        --ctx->fmtcnt;
14222        keystart = ctx->fmtpos;
14223        /* Skip over balanced parentheses */
14224        while (pcount > 0 && --ctx->fmtcnt >= 0) {
14225            arg->ch = FORMAT_READ(ctx);
14226            if (arg->ch == ')')
14227                --pcount;
14228            else if (arg->ch == '(')
14229                ++pcount;
14230            ctx->fmtpos++;
14231        }
14232        keylen = ctx->fmtpos - keystart - 1;
14233        if (ctx->fmtcnt < 0 || pcount > 0) {
14234            PyErr_SetString(PyExc_ValueError,
14235                            "incomplete format key");
14236            return -1;
14237        }
14238        key = PyUnicode_Substring(ctx->fmtstr,
14239                                  keystart, keystart + keylen);
14240        if (key == NULL)
14241            return -1;
14242        if (ctx->args_owned) {
14243            Py_DECREF(ctx->args);
14244            ctx->args_owned = 0;
14245        }
14246        ctx->args = PyObject_GetItem(ctx->dict, key);
14247        Py_DECREF(key);
14248        if (ctx->args == NULL)
14249            return -1;
14250        ctx->args_owned = 1;
14251        ctx->arglen = -1;
14252        ctx->argidx = -2;
14253    }
14254
14255    /* Parse flags. Example: "%+i" => flags=F_SIGN. */
14256    while (--ctx->fmtcnt >= 0) {
14257        arg->ch = FORMAT_READ(ctx);
14258        ctx->fmtpos++;
14259        switch (arg->ch) {
14260        case '-': arg->flags |= F_LJUST; continue;
14261        case '+': arg->flags |= F_SIGN; continue;
14262        case ' ': arg->flags |= F_BLANK; continue;
14263        case '#': arg->flags |= F_ALT; continue;
14264        case '0': arg->flags |= F_ZERO; continue;
14265        }
14266        break;
14267    }
14268
14269    /* Parse width. Example: "%10s" => width=10 */
14270    if (arg->ch == '*') {
14271        v = unicode_format_getnextarg(ctx);
14272        if (v == NULL)
14273            return -1;
14274        if (!PyLong_Check(v)) {
14275            PyErr_SetString(PyExc_TypeError,
14276                            "* wants int");
14277            return -1;
14278        }
14279        arg->width = PyLong_AsSsize_t(v);
14280        if (arg->width == -1 && PyErr_Occurred())
14281            return -1;
14282        if (arg->width < 0) {
14283            arg->flags |= F_LJUST;
14284            arg->width = -arg->width;
14285        }
14286        if (--ctx->fmtcnt >= 0) {
14287            arg->ch = FORMAT_READ(ctx);
14288            ctx->fmtpos++;
14289        }
14290    }
14291    else if (arg->ch >= '0' && arg->ch <= '9') {
14292        arg->width = arg->ch - '0';
14293        while (--ctx->fmtcnt >= 0) {
14294            arg->ch = FORMAT_READ(ctx);
14295            ctx->fmtpos++;
14296            if (arg->ch < '0' || arg->ch > '9')
14297                break;
14298            /* Since arg->ch is unsigned, the RHS would end up as unsigned,
14299               mixing signed and unsigned comparison. Since arg->ch is between
14300               '0' and '9', casting to int is safe. */
14301            if (arg->width > (PY_SSIZE_T_MAX - ((int)arg->ch - '0')) / 10) {
14302                PyErr_SetString(PyExc_ValueError,
14303                                "width too big");
14304                return -1;
14305            }
14306            arg->width = arg->width*10 + (arg->ch - '0');
14307        }
14308    }
14309
14310    /* Parse precision. Example: "%.3f" => prec=3 */
14311    if (arg->ch == '.') {
14312        arg->prec = 0;
14313        if (--ctx->fmtcnt >= 0) {
14314            arg->ch = FORMAT_READ(ctx);
14315            ctx->fmtpos++;
14316        }
14317        if (arg->ch == '*') {
14318            v = unicode_format_getnextarg(ctx);
14319            if (v == NULL)
14320                return -1;
14321            if (!PyLong_Check(v)) {
14322                PyErr_SetString(PyExc_TypeError,
14323                                "* wants int");
14324                return -1;
14325            }
14326            arg->prec = _PyLong_AsInt(v);
14327            if (arg->prec == -1 && PyErr_Occurred())
14328                return -1;
14329            if (arg->prec < 0)
14330                arg->prec = 0;
14331            if (--ctx->fmtcnt >= 0) {
14332                arg->ch = FORMAT_READ(ctx);
14333                ctx->fmtpos++;
14334            }
14335        }
14336        else if (arg->ch >= '0' && arg->ch <= '9') {
14337            arg->prec = arg->ch - '0';
14338            while (--ctx->fmtcnt >= 0) {
14339                arg->ch = FORMAT_READ(ctx);
14340                ctx->fmtpos++;
14341                if (arg->ch < '0' || arg->ch > '9')
14342                    break;
14343                if (arg->prec > (INT_MAX - ((int)arg->ch - '0')) / 10) {
14344                    PyErr_SetString(PyExc_ValueError,
14345                                    "precision too big");
14346                    return -1;
14347                }
14348                arg->prec = arg->prec*10 + (arg->ch - '0');
14349            }
14350        }
14351    }
14352
14353    /* Ignore "h", "l" and "L" format prefix (ex: "%hi" or "%ls") */
14354    if (ctx->fmtcnt >= 0) {
14355        if (arg->ch == 'h' || arg->ch == 'l' || arg->ch == 'L') {
14356            if (--ctx->fmtcnt >= 0) {
14357                arg->ch = FORMAT_READ(ctx);
14358                ctx->fmtpos++;
14359            }
14360        }
14361    }
14362    if (ctx->fmtcnt < 0) {
14363        PyErr_SetString(PyExc_ValueError,
14364                        "incomplete format");
14365        return -1;
14366    }
14367    return 0;
14368
14369#undef FORMAT_READ
14370}
14371
14372/* Format one argument. Supported conversion specifiers:
14373
14374   - "s", "r", "a": any type
14375   - "i", "d", "u": int or float
14376   - "o", "x", "X": int
14377   - "e", "E", "f", "F", "g", "G": float
14378   - "c": int or str (1 character)
14379
14380   When possible, the output is written directly into the Unicode writer
14381   (ctx->writer). A string is created when padding is required.
14382
14383   Return 0 if the argument has been formatted into *p_str,
14384          1 if the argument has been written into ctx->writer,
14385         -1 on error. */
14386static int
14387unicode_format_arg_format(struct unicode_formatter_t *ctx,
14388                          struct unicode_format_arg_t *arg,
14389                          PyObject **p_str)
14390{
14391    PyObject *v;
14392    _PyUnicodeWriter *writer = &ctx->writer;
14393
14394    if (ctx->fmtcnt == 0)
14395        ctx->writer.overallocate = 0;
14396
14397    if (arg->ch == '%') {
14398        if (_PyUnicodeWriter_WriteCharInline(writer, '%') < 0)
14399            return -1;
14400        return 1;
14401    }
14402
14403    v = unicode_format_getnextarg(ctx);
14404    if (v == NULL)
14405        return -1;
14406
14407
14408    switch (arg->ch) {
14409    case 's':
14410    case 'r':
14411    case 'a':
14412        if (PyLong_CheckExact(v) && arg->width == -1 && arg->prec == -1) {
14413            /* Fast path */
14414            if (_PyLong_FormatWriter(writer, v, 10, arg->flags & F_ALT) == -1)
14415                return -1;
14416            return 1;
14417        }
14418
14419        if (PyUnicode_CheckExact(v) && arg->ch == 's') {
14420            *p_str = v;
14421            Py_INCREF(*p_str);
14422        }
14423        else {
14424            if (arg->ch == 's')
14425                *p_str = PyObject_Str(v);
14426            else if (arg->ch == 'r')
14427                *p_str = PyObject_Repr(v);
14428            else
14429                *p_str = PyObject_ASCII(v);
14430        }
14431        break;
14432
14433    case 'i':
14434    case 'd':
14435    case 'u':
14436    case 'o':
14437    case 'x':
14438    case 'X':
14439    {
14440        int ret = mainformatlong(v, arg, p_str, writer);
14441        if (ret != 0)
14442            return ret;
14443        arg->sign = 1;
14444        break;
14445    }
14446
14447    case 'e':
14448    case 'E':
14449    case 'f':
14450    case 'F':
14451    case 'g':
14452    case 'G':
14453        if (arg->width == -1 && arg->prec == -1
14454            && !(arg->flags & (F_SIGN | F_BLANK)))
14455        {
14456            /* Fast path */
14457            if (formatfloat(v, arg, NULL, writer) == -1)
14458                return -1;
14459            return 1;
14460        }
14461
14462        arg->sign = 1;
14463        if (formatfloat(v, arg, p_str, NULL) == -1)
14464            return -1;
14465        break;
14466
14467    case 'c':
14468    {
14469        Py_UCS4 ch = formatchar(v);
14470        if (ch == (Py_UCS4) -1)
14471            return -1;
14472        if (arg->width == -1 && arg->prec == -1) {
14473            /* Fast path */
14474            if (_PyUnicodeWriter_WriteCharInline(writer, ch) < 0)
14475                return -1;
14476            return 1;
14477        }
14478        *p_str = PyUnicode_FromOrdinal(ch);
14479        break;
14480    }
14481
14482    default:
14483        PyErr_Format(PyExc_ValueError,
14484                     "unsupported format character '%c' (0x%x) "
14485                     "at index %zd",
14486                     (31<=arg->ch && arg->ch<=126) ? (char)arg->ch : '?',
14487                     (int)arg->ch,
14488                     ctx->fmtpos - 1);
14489        return -1;
14490    }
14491    if (*p_str == NULL)
14492        return -1;
14493    assert (PyUnicode_Check(*p_str));
14494    return 0;
14495}
14496
14497static int
14498unicode_format_arg_output(struct unicode_formatter_t *ctx,
14499                          struct unicode_format_arg_t *arg,
14500                          PyObject *str)
14501{
14502    Py_ssize_t len;
14503    enum PyUnicode_Kind kind;
14504    void *pbuf;
14505    Py_ssize_t pindex;
14506    Py_UCS4 signchar;
14507    Py_ssize_t buflen;
14508    Py_UCS4 maxchar;
14509    Py_ssize_t sublen;
14510    _PyUnicodeWriter *writer = &ctx->writer;
14511    Py_UCS4 fill;
14512
14513    fill = ' ';
14514    if (arg->sign && arg->flags & F_ZERO)
14515        fill = '0';
14516
14517    if (PyUnicode_READY(str) == -1)
14518        return -1;
14519
14520    len = PyUnicode_GET_LENGTH(str);
14521    if ((arg->width == -1 || arg->width <= len)
14522        && (arg->prec == -1 || arg->prec >= len)
14523        && !(arg->flags & (F_SIGN | F_BLANK)))
14524    {
14525        /* Fast path */
14526        if (_PyUnicodeWriter_WriteStr(writer, str) == -1)
14527            return -1;
14528        return 0;
14529    }
14530
14531    /* Truncate the string for "s", "r" and "a" formats
14532       if the precision is set */
14533    if (arg->ch == 's' || arg->ch == 'r' || arg->ch == 'a') {
14534        if (arg->prec >= 0 && len > arg->prec)
14535            len = arg->prec;
14536    }
14537
14538    /* Adjust sign and width */
14539    kind = PyUnicode_KIND(str);
14540    pbuf = PyUnicode_DATA(str);
14541    pindex = 0;
14542    signchar = '\0';
14543    if (arg->sign) {
14544        Py_UCS4 ch = PyUnicode_READ(kind, pbuf, pindex);
14545        if (ch == '-' || ch == '+') {
14546            signchar = ch;
14547            len--;
14548            pindex++;
14549        }
14550        else if (arg->flags & F_SIGN)
14551            signchar = '+';
14552        else if (arg->flags & F_BLANK)
14553            signchar = ' ';
14554        else
14555            arg->sign = 0;
14556    }
14557    if (arg->width < len)
14558        arg->width = len;
14559
14560    /* Prepare the writer */
14561    maxchar = writer->maxchar;
14562    if (!(arg->flags & F_LJUST)) {
14563        if (arg->sign) {
14564            if ((arg->width-1) > len)
14565                maxchar = Py_MAX(maxchar, fill);
14566        }
14567        else {
14568            if (arg->width > len)
14569                maxchar = Py_MAX(maxchar, fill);
14570        }
14571    }
14572    if (PyUnicode_MAX_CHAR_VALUE(str) > maxchar) {
14573        Py_UCS4 strmaxchar = _PyUnicode_FindMaxChar(str, 0, pindex+len);
14574        maxchar = Py_MAX(maxchar, strmaxchar);
14575    }
14576
14577    buflen = arg->width;
14578    if (arg->sign && len == arg->width)
14579        buflen++;
14580    if (_PyUnicodeWriter_Prepare(writer, buflen, maxchar) == -1)
14581        return -1;
14582
14583    /* Write the sign if needed */
14584    if (arg->sign) {
14585        if (fill != ' ') {
14586            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14587            writer->pos += 1;
14588        }
14589        if (arg->width > len)
14590            arg->width--;
14591    }
14592
14593    /* Write the numeric prefix for "x", "X" and "o" formats
14594       if the alternate form is used.
14595       For example, write "0x" for the "%#x" format. */
14596    if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14597        assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14598        assert(PyUnicode_READ(kind, pbuf, pindex + 1) == arg->ch);
14599        if (fill != ' ') {
14600            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14601            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14602            writer->pos += 2;
14603            pindex += 2;
14604        }
14605        arg->width -= 2;
14606        if (arg->width < 0)
14607            arg->width = 0;
14608        len -= 2;
14609    }
14610
14611    /* Pad left with the fill character if needed */
14612    if (arg->width > len && !(arg->flags & F_LJUST)) {
14613        sublen = arg->width - len;
14614        FILL(writer->kind, writer->data, fill, writer->pos, sublen);
14615        writer->pos += sublen;
14616        arg->width = len;
14617    }
14618
14619    /* If padding with spaces: write sign if needed and/or numeric prefix if
14620       the alternate form is used */
14621    if (fill == ' ') {
14622        if (arg->sign) {
14623            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, signchar);
14624            writer->pos += 1;
14625        }
14626        if ((arg->flags & F_ALT) && (arg->ch == 'x' || arg->ch == 'X' || arg->ch == 'o')) {
14627            assert(PyUnicode_READ(kind, pbuf, pindex) == '0');
14628            assert(PyUnicode_READ(kind, pbuf, pindex+1) == arg->ch);
14629            PyUnicode_WRITE(writer->kind, writer->data, writer->pos, '0');
14630            PyUnicode_WRITE(writer->kind, writer->data, writer->pos+1, arg->ch);
14631            writer->pos += 2;
14632            pindex += 2;
14633        }
14634    }
14635
14636    /* Write characters */
14637    if (len) {
14638        _PyUnicode_FastCopyCharacters(writer->buffer, writer->pos,
14639                                      str, pindex, len);
14640        writer->pos += len;
14641    }
14642
14643    /* Pad right with the fill character if needed */
14644    if (arg->width > len) {
14645        sublen = arg->width - len;
14646        FILL(writer->kind, writer->data, ' ', writer->pos, sublen);
14647        writer->pos += sublen;
14648    }
14649    return 0;
14650}
14651
14652/* Helper of PyUnicode_Format(): format one arg.
14653   Return 0 on success, raise an exception and return -1 on error. */
14654static int
14655unicode_format_arg(struct unicode_formatter_t *ctx)
14656{
14657    struct unicode_format_arg_t arg;
14658    PyObject *str;
14659    int ret;
14660
14661    arg.ch = PyUnicode_READ(ctx->fmtkind, ctx->fmtdata, ctx->fmtpos);
14662    arg.flags = 0;
14663    arg.width = -1;
14664    arg.prec = -1;
14665    arg.sign = 0;
14666    str = NULL;
14667
14668    ret = unicode_format_arg_parse(ctx, &arg);
14669    if (ret == -1)
14670        return -1;
14671
14672    ret = unicode_format_arg_format(ctx, &arg, &str);
14673    if (ret == -1)
14674        return -1;
14675
14676    if (ret != 1) {
14677        ret = unicode_format_arg_output(ctx, &arg, str);
14678        Py_DECREF(str);
14679        if (ret == -1)
14680            return -1;
14681    }
14682
14683    if (ctx->dict && (ctx->argidx < ctx->arglen) && arg.ch != '%') {
14684        PyErr_SetString(PyExc_TypeError,
14685                        "not all arguments converted during string formatting");
14686        return -1;
14687    }
14688    return 0;
14689}
14690
14691PyObject *
14692PyUnicode_Format(PyObject *format, PyObject *args)
14693{
14694    struct unicode_formatter_t ctx;
14695
14696    if (format == NULL || args == NULL) {
14697        PyErr_BadInternalCall();
14698        return NULL;
14699    }
14700
14701    ctx.fmtstr = PyUnicode_FromObject(format);
14702    if (ctx.fmtstr == NULL)
14703        return NULL;
14704    if (PyUnicode_READY(ctx.fmtstr) == -1) {
14705        Py_DECREF(ctx.fmtstr);
14706        return NULL;
14707    }
14708    ctx.fmtdata = PyUnicode_DATA(ctx.fmtstr);
14709    ctx.fmtkind = PyUnicode_KIND(ctx.fmtstr);
14710    ctx.fmtcnt = PyUnicode_GET_LENGTH(ctx.fmtstr);
14711    ctx.fmtpos = 0;
14712
14713    _PyUnicodeWriter_Init(&ctx.writer);
14714    ctx.writer.min_length = ctx.fmtcnt + 100;
14715    ctx.writer.overallocate = 1;
14716
14717    if (PyTuple_Check(args)) {
14718        ctx.arglen = PyTuple_Size(args);
14719        ctx.argidx = 0;
14720    }
14721    else {
14722        ctx.arglen = -1;
14723        ctx.argidx = -2;
14724    }
14725    ctx.args_owned = 0;
14726    if (PyMapping_Check(args) && !PyTuple_Check(args) && !PyUnicode_Check(args))
14727        ctx.dict = args;
14728    else
14729        ctx.dict = NULL;
14730    ctx.args = args;
14731
14732    while (--ctx.fmtcnt >= 0) {
14733        if (PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14734            Py_ssize_t nonfmtpos;
14735
14736            nonfmtpos = ctx.fmtpos++;
14737            while (ctx.fmtcnt >= 0 &&
14738                   PyUnicode_READ(ctx.fmtkind, ctx.fmtdata, ctx.fmtpos) != '%') {
14739                ctx.fmtpos++;
14740                ctx.fmtcnt--;
14741            }
14742            if (ctx.fmtcnt < 0) {
14743                ctx.fmtpos--;
14744                ctx.writer.overallocate = 0;
14745            }
14746
14747            if (_PyUnicodeWriter_WriteSubstring(&ctx.writer, ctx.fmtstr,
14748                                                nonfmtpos, ctx.fmtpos) < 0)
14749                goto onError;
14750        }
14751        else {
14752            ctx.fmtpos++;
14753            if (unicode_format_arg(&ctx) == -1)
14754                goto onError;
14755        }
14756    }
14757
14758    if (ctx.argidx < ctx.arglen && !ctx.dict) {
14759        PyErr_SetString(PyExc_TypeError,
14760                        "not all arguments converted during string formatting");
14761        goto onError;
14762    }
14763
14764    if (ctx.args_owned) {
14765        Py_DECREF(ctx.args);
14766    }
14767    Py_DECREF(ctx.fmtstr);
14768    return _PyUnicodeWriter_Finish(&ctx.writer);
14769
14770  onError:
14771    Py_DECREF(ctx.fmtstr);
14772    _PyUnicodeWriter_Dealloc(&ctx.writer);
14773    if (ctx.args_owned) {
14774        Py_DECREF(ctx.args);
14775    }
14776    return NULL;
14777}
14778
14779static PyObject *
14780unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
14781
14782static PyObject *
14783unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14784{
14785    PyObject *x = NULL;
14786    static char *kwlist[] = {"object", "encoding", "errors", 0};
14787    char *encoding = NULL;
14788    char *errors = NULL;
14789
14790    if (type != &PyUnicode_Type)
14791        return unicode_subtype_new(type, args, kwds);
14792    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
14793                                     kwlist, &x, &encoding, &errors))
14794        return NULL;
14795    if (x == NULL)
14796        _Py_RETURN_UNICODE_EMPTY();
14797    if (encoding == NULL && errors == NULL)
14798        return PyObject_Str(x);
14799    else
14800        return PyUnicode_FromEncodedObject(x, encoding, errors);
14801}
14802
14803static PyObject *
14804unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
14805{
14806    PyObject *unicode, *self;
14807    Py_ssize_t length, char_size;
14808    int share_wstr, share_utf8;
14809    unsigned int kind;
14810    void *data;
14811
14812    assert(PyType_IsSubtype(type, &PyUnicode_Type));
14813
14814    unicode = unicode_new(&PyUnicode_Type, args, kwds);
14815    if (unicode == NULL)
14816        return NULL;
14817    assert(_PyUnicode_CHECK(unicode));
14818    if (PyUnicode_READY(unicode) == -1) {
14819        Py_DECREF(unicode);
14820        return NULL;
14821    }
14822
14823    self = type->tp_alloc(type, 0);
14824    if (self == NULL) {
14825        Py_DECREF(unicode);
14826        return NULL;
14827    }
14828    kind = PyUnicode_KIND(unicode);
14829    length = PyUnicode_GET_LENGTH(unicode);
14830
14831    _PyUnicode_LENGTH(self) = length;
14832#ifdef Py_DEBUG
14833    _PyUnicode_HASH(self) = -1;
14834#else
14835    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14836#endif
14837    _PyUnicode_STATE(self).interned = 0;
14838    _PyUnicode_STATE(self).kind = kind;
14839    _PyUnicode_STATE(self).compact = 0;
14840    _PyUnicode_STATE(self).ascii = _PyUnicode_STATE(unicode).ascii;
14841    _PyUnicode_STATE(self).ready = 1;
14842    _PyUnicode_WSTR(self) = NULL;
14843    _PyUnicode_UTF8_LENGTH(self) = 0;
14844    _PyUnicode_UTF8(self) = NULL;
14845    _PyUnicode_WSTR_LENGTH(self) = 0;
14846    _PyUnicode_DATA_ANY(self) = NULL;
14847
14848    share_utf8 = 0;
14849    share_wstr = 0;
14850    if (kind == PyUnicode_1BYTE_KIND) {
14851        char_size = 1;
14852        if (PyUnicode_MAX_CHAR_VALUE(unicode) < 128)
14853            share_utf8 = 1;
14854    }
14855    else if (kind == PyUnicode_2BYTE_KIND) {
14856        char_size = 2;
14857        if (sizeof(wchar_t) == 2)
14858            share_wstr = 1;
14859    }
14860    else {
14861        assert(kind == PyUnicode_4BYTE_KIND);
14862        char_size = 4;
14863        if (sizeof(wchar_t) == 4)
14864            share_wstr = 1;
14865    }
14866
14867    /* Ensure we won't overflow the length. */
14868    if (length > (PY_SSIZE_T_MAX / char_size - 1)) {
14869        PyErr_NoMemory();
14870        goto onError;
14871    }
14872    data = PyObject_MALLOC((length + 1) * char_size);
14873    if (data == NULL) {
14874        PyErr_NoMemory();
14875        goto onError;
14876    }
14877
14878    _PyUnicode_DATA_ANY(self) = data;
14879    if (share_utf8) {
14880        _PyUnicode_UTF8_LENGTH(self) = length;
14881        _PyUnicode_UTF8(self) = data;
14882    }
14883    if (share_wstr) {
14884        _PyUnicode_WSTR_LENGTH(self) = length;
14885        _PyUnicode_WSTR(self) = (wchar_t *)data;
14886    }
14887
14888    Py_MEMCPY(data, PyUnicode_DATA(unicode),
14889              kind * (length + 1));
14890    assert(_PyUnicode_CheckConsistency(self, 1));
14891#ifdef Py_DEBUG
14892    _PyUnicode_HASH(self) = _PyUnicode_HASH(unicode);
14893#endif
14894    Py_DECREF(unicode);
14895    return self;
14896
14897onError:
14898    Py_DECREF(unicode);
14899    Py_DECREF(self);
14900    return NULL;
14901}
14902
14903PyDoc_STRVAR(unicode_doc,
14904"str(object='') -> str\n\
14905str(bytes_or_buffer[, encoding[, errors]]) -> str\n\
14906\n\
14907Create a new string object from the given object. If encoding or\n\
14908errors is specified, then the object must expose a data buffer\n\
14909that will be decoded using the given encoding and error handler.\n\
14910Otherwise, returns the result of object.__str__() (if defined)\n\
14911or repr(object).\n\
14912encoding defaults to sys.getdefaultencoding().\n\
14913errors defaults to 'strict'.");
14914
14915static PyObject *unicode_iter(PyObject *seq);
14916
14917PyTypeObject PyUnicode_Type = {
14918    PyVarObject_HEAD_INIT(&PyType_Type, 0)
14919    "str",              /* tp_name */
14920    sizeof(PyUnicodeObject),        /* tp_size */
14921    0,                  /* tp_itemsize */
14922    /* Slots */
14923    (destructor)unicode_dealloc,    /* tp_dealloc */
14924    0,                  /* tp_print */
14925    0,                  /* tp_getattr */
14926    0,                  /* tp_setattr */
14927    0,                  /* tp_reserved */
14928    unicode_repr,           /* tp_repr */
14929    &unicode_as_number,         /* tp_as_number */
14930    &unicode_as_sequence,       /* tp_as_sequence */
14931    &unicode_as_mapping,        /* tp_as_mapping */
14932    (hashfunc) unicode_hash,        /* tp_hash*/
14933    0,                  /* tp_call*/
14934    (reprfunc) unicode_str,     /* tp_str */
14935    PyObject_GenericGetAttr,        /* tp_getattro */
14936    0,                  /* tp_setattro */
14937    0,                  /* tp_as_buffer */
14938    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
14939    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
14940    unicode_doc,            /* tp_doc */
14941    0,                  /* tp_traverse */
14942    0,                  /* tp_clear */
14943    PyUnicode_RichCompare,      /* tp_richcompare */
14944    0,                  /* tp_weaklistoffset */
14945    unicode_iter,           /* tp_iter */
14946    0,                  /* tp_iternext */
14947    unicode_methods,            /* tp_methods */
14948    0,                  /* tp_members */
14949    0,                  /* tp_getset */
14950    &PyBaseObject_Type,         /* tp_base */
14951    0,                  /* tp_dict */
14952    0,                  /* tp_descr_get */
14953    0,                  /* tp_descr_set */
14954    0,                  /* tp_dictoffset */
14955    0,                  /* tp_init */
14956    0,                  /* tp_alloc */
14957    unicode_new,            /* tp_new */
14958    PyObject_Del,           /* tp_free */
14959};
14960
14961/* Initialize the Unicode implementation */
14962
14963int _PyUnicode_Init(void)
14964{
14965    /* XXX - move this array to unicodectype.c ? */
14966    Py_UCS2 linebreak[] = {
14967        0x000A, /* LINE FEED */
14968        0x000D, /* CARRIAGE RETURN */
14969        0x001C, /* FILE SEPARATOR */
14970        0x001D, /* GROUP SEPARATOR */
14971        0x001E, /* RECORD SEPARATOR */
14972        0x0085, /* NEXT LINE */
14973        0x2028, /* LINE SEPARATOR */
14974        0x2029, /* PARAGRAPH SEPARATOR */
14975    };
14976
14977    /* Init the implementation */
14978    _Py_INCREF_UNICODE_EMPTY();
14979    if (!unicode_empty)
14980        Py_FatalError("Can't create empty string");
14981    Py_DECREF(unicode_empty);
14982
14983    if (PyType_Ready(&PyUnicode_Type) < 0)
14984        Py_FatalError("Can't initialize 'unicode'");
14985
14986    /* initialize the linebreak bloom filter */
14987    bloom_linebreak = make_bloom_mask(
14988        PyUnicode_2BYTE_KIND, linebreak,
14989        Py_ARRAY_LENGTH(linebreak));
14990
14991    if (PyType_Ready(&EncodingMapType) < 0)
14992         Py_FatalError("Can't initialize encoding map type");
14993
14994    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
14995        Py_FatalError("Can't initialize field name iterator type");
14996
14997    if (PyType_Ready(&PyFormatterIter_Type) < 0)
14998        Py_FatalError("Can't initialize formatter iter type");
14999
15000#ifdef HAVE_MBCS
15001    winver.dwOSVersionInfoSize = sizeof(winver);
15002    if (!GetVersionEx((OSVERSIONINFO*)&winver)) {
15003        PyErr_SetFromWindowsErr(0);
15004        return -1;
15005    }
15006#endif
15007    return 0;
15008}
15009
/* Finalize the Unicode implementation */

/* Nothing is released here: the function always returns 0. */
int
PyUnicode_ClearFreeList(void)
{
    return 0;
}
15017
15018void
15019_PyUnicode_Fini(void)
15020{
15021    int i;
15022
15023    Py_CLEAR(unicode_empty);
15024
15025    for (i = 0; i < 256; i++)
15026        Py_CLEAR(unicode_latin1[i]);
15027    _PyUnicode_ClearStaticStrings();
15028    (void)PyUnicode_ClearFreeList();
15029}
15030
15031void
15032PyUnicode_InternInPlace(PyObject **p)
15033{
15034    PyObject *s = *p;
15035    PyObject *t;
15036#ifdef Py_DEBUG
15037    assert(s != NULL);
15038    assert(_PyUnicode_CHECK(s));
15039#else
15040    if (s == NULL || !PyUnicode_Check(s))
15041        return;
15042#endif
15043    /* If it's a subclass, we don't really know what putting
15044       it in the interned dict might do. */
15045    if (!PyUnicode_CheckExact(s))
15046        return;
15047    if (PyUnicode_CHECK_INTERNED(s))
15048        return;
15049    if (interned == NULL) {
15050        interned = PyDict_New();
15051        if (interned == NULL) {
15052            PyErr_Clear(); /* Don't leave an exception */
15053            return;
15054        }
15055    }
15056    /* It might be that the GetItem call fails even
15057       though the key is present in the dictionary,
15058       namely when this happens during a stack overflow. */
15059    Py_ALLOW_RECURSION
15060    t = PyDict_GetItem(interned, s);
15061    Py_END_ALLOW_RECURSION
15062
15063    if (t) {
15064        Py_INCREF(t);
15065        Py_DECREF(*p);
15066        *p = t;
15067        return;
15068    }
15069
15070    PyThreadState_GET()->recursion_critical = 1;
15071    if (PyDict_SetItem(interned, s, s) < 0) {
15072        PyErr_Clear();
15073        PyThreadState_GET()->recursion_critical = 0;
15074        return;
15075    }
15076    PyThreadState_GET()->recursion_critical = 0;
15077    /* The two references in interned are not counted by refcnt.
15078       The deallocator will take care of this */
15079    Py_REFCNT(s) -= 2;
15080    _PyUnicode_STATE(s).interned = SSTATE_INTERNED_MORTAL;
15081}
15082
15083void
15084PyUnicode_InternImmortal(PyObject **p)
15085{
15086    PyUnicode_InternInPlace(p);
15087    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
15088        _PyUnicode_STATE(*p).interned = SSTATE_INTERNED_IMMORTAL;
15089        Py_INCREF(*p);
15090    }
15091}
15092
15093PyObject *
15094PyUnicode_InternFromString(const char *cp)
15095{
15096    PyObject *s = PyUnicode_FromString(cp);
15097    if (s == NULL)
15098        return NULL;
15099    PyUnicode_InternInPlace(&s);
15100    return s;
15101}
15102
15103void
15104_Py_ReleaseInternedUnicodeStrings(void)
15105{
15106    PyObject *keys;
15107    PyObject *s;
15108    Py_ssize_t i, n;
15109    Py_ssize_t immortal_size = 0, mortal_size = 0;
15110
15111    if (interned == NULL || !PyDict_Check(interned))
15112        return;
15113    keys = PyDict_Keys(interned);
15114    if (keys == NULL || !PyList_Check(keys)) {
15115        PyErr_Clear();
15116        return;
15117    }
15118
15119    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
15120       detector, interned unicode strings are not forcibly deallocated;
15121       rather, we give them their stolen references back, and then clear
15122       and DECREF the interned dict. */
15123
15124    n = PyList_GET_SIZE(keys);
15125    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
15126            n);
15127    for (i = 0; i < n; i++) {
15128        s = PyList_GET_ITEM(keys, i);
15129        if (PyUnicode_READY(s) == -1) {
15130            assert(0 && "could not ready string");
15131            fprintf(stderr, "could not ready string\n");
15132        }
15133        switch (PyUnicode_CHECK_INTERNED(s)) {
15134        case SSTATE_NOT_INTERNED:
15135            /* XXX Shouldn't happen */
15136            break;
15137        case SSTATE_INTERNED_IMMORTAL:
15138            Py_REFCNT(s) += 1;
15139            immortal_size += PyUnicode_GET_LENGTH(s);
15140            break;
15141        case SSTATE_INTERNED_MORTAL:
15142            Py_REFCNT(s) += 2;
15143            mortal_size += PyUnicode_GET_LENGTH(s);
15144            break;
15145        default:
15146            Py_FatalError("Inconsistent interned string state.");
15147        }
15148        _PyUnicode_STATE(s).interned = SSTATE_NOT_INTERNED;
15149    }
15150    fprintf(stderr, "total size of all interned strings: "
15151            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
15152            "mortal/immortal\n", mortal_size, immortal_size);
15153    Py_DECREF(keys);
15154    PyDict_Clear(interned);
15155    Py_CLEAR(interned);
15156}
15157
15158
15159/********************* Unicode Iterator **************************/
15160
15161typedef struct {
15162    PyObject_HEAD
15163    Py_ssize_t it_index;
15164    PyObject *it_seq;    /* Set to NULL when iterator is exhausted */
15165} unicodeiterobject;
15166
15167static void
15168unicodeiter_dealloc(unicodeiterobject *it)
15169{
15170    _PyObject_GC_UNTRACK(it);
15171    Py_XDECREF(it->it_seq);
15172    PyObject_GC_Del(it);
15173}
15174
15175static int
15176unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
15177{
15178    Py_VISIT(it->it_seq);
15179    return 0;
15180}
15181
15182static PyObject *
15183unicodeiter_next(unicodeiterobject *it)
15184{
15185    PyObject *seq, *item;
15186
15187    assert(it != NULL);
15188    seq = it->it_seq;
15189    if (seq == NULL)
15190        return NULL;
15191    assert(_PyUnicode_CHECK(seq));
15192
15193    if (it->it_index < PyUnicode_GET_LENGTH(seq)) {
15194        int kind = PyUnicode_KIND(seq);
15195        void *data = PyUnicode_DATA(seq);
15196        Py_UCS4 chr = PyUnicode_READ(kind, data, it->it_index);
15197        item = PyUnicode_FromOrdinal(chr);
15198        if (item != NULL)
15199            ++it->it_index;
15200        return item;
15201    }
15202
15203    Py_DECREF(seq);
15204    it->it_seq = NULL;
15205    return NULL;
15206}
15207
15208static PyObject *
15209unicodeiter_len(unicodeiterobject *it)
15210{
15211    Py_ssize_t len = 0;
15212    if (it->it_seq)
15213        len = PyUnicode_GET_LENGTH(it->it_seq) - it->it_index;
15214    return PyLong_FromSsize_t(len);
15215}
15216
15217PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
15218
15219static PyObject *
15220unicodeiter_reduce(unicodeiterobject *it)
15221{
15222    if (it->it_seq != NULL) {
15223        return Py_BuildValue("N(O)n", _PyObject_GetBuiltin("iter"),
15224                             it->it_seq, it->it_index);
15225    } else {
15226        PyObject *u = PyUnicode_FromUnicode(NULL, 0);
15227        if (u == NULL)
15228            return NULL;
15229        return Py_BuildValue("N(N)", _PyObject_GetBuiltin("iter"), u);
15230    }
15231}
15232
15233PyDoc_STRVAR(reduce_doc, "Return state information for pickling.");
15234
15235static PyObject *
15236unicodeiter_setstate(unicodeiterobject *it, PyObject *state)
15237{
15238    Py_ssize_t index = PyLong_AsSsize_t(state);
15239    if (index == -1 && PyErr_Occurred())
15240        return NULL;
15241    if (it->it_seq != NULL) {
15242        if (index < 0)
15243            index = 0;
15244        else if (index > PyUnicode_GET_LENGTH(it->it_seq))
15245            index = PyUnicode_GET_LENGTH(it->it_seq); /* iterator truncated */
15246        it->it_index = index;
15247    }
15248    Py_RETURN_NONE;
15249}
15250
15251PyDoc_STRVAR(setstate_doc, "Set state information for unpickling.");
15252
15253static PyMethodDef unicodeiter_methods[] = {
15254    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
15255     length_hint_doc},
15256    {"__reduce__",      (PyCFunction)unicodeiter_reduce, METH_NOARGS,
15257     reduce_doc},
15258    {"__setstate__",    (PyCFunction)unicodeiter_setstate, METH_O,
15259     setstate_doc},
15260    {NULL,      NULL}       /* sentinel */
15261};
15262
15263PyTypeObject PyUnicodeIter_Type = {
15264    PyVarObject_HEAD_INIT(&PyType_Type, 0)
15265    "str_iterator",         /* tp_name */
15266    sizeof(unicodeiterobject),      /* tp_basicsize */
15267    0,                  /* tp_itemsize */
15268    /* methods */
15269    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
15270    0,                  /* tp_print */
15271    0,                  /* tp_getattr */
15272    0,                  /* tp_setattr */
15273    0,                  /* tp_reserved */
15274    0,                  /* tp_repr */
15275    0,                  /* tp_as_number */
15276    0,                  /* tp_as_sequence */
15277    0,                  /* tp_as_mapping */
15278    0,                  /* tp_hash */
15279    0,                  /* tp_call */
15280    0,                  /* tp_str */
15281    PyObject_GenericGetAttr,        /* tp_getattro */
15282    0,                  /* tp_setattro */
15283    0,                  /* tp_as_buffer */
15284    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
15285    0,                  /* tp_doc */
15286    (traverseproc)unicodeiter_traverse, /* tp_traverse */
15287    0,                  /* tp_clear */
15288    0,                  /* tp_richcompare */
15289    0,                  /* tp_weaklistoffset */
15290    PyObject_SelfIter,          /* tp_iter */
15291    (iternextfunc)unicodeiter_next,     /* tp_iternext */
15292    unicodeiter_methods,            /* tp_methods */
15293    0,
15294};
15295
15296static PyObject *
15297unicode_iter(PyObject *seq)
15298{
15299    unicodeiterobject *it;
15300
15301    if (!PyUnicode_Check(seq)) {
15302        PyErr_BadInternalCall();
15303        return NULL;
15304    }
15305    if (PyUnicode_READY(seq) == -1)
15306        return NULL;
15307    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
15308    if (it == NULL)
15309        return NULL;
15310    it->it_index = 0;
15311    Py_INCREF(seq);
15312    it->it_seq = seq;
15313    _PyObject_GC_TRACK(it);
15314    return (PyObject *)it;
15315}
15316
15317
15318size_t
15319Py_UNICODE_strlen(const Py_UNICODE *u)
15320{
15321    int res = 0;
15322    while(*u++)
15323        res++;
15324    return res;
15325}
15326
15327Py_UNICODE*
15328Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
15329{
15330    Py_UNICODE *u = s1;
15331    while ((*u++ = *s2++));
15332    return s1;
15333}
15334
15335Py_UNICODE*
15336Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15337{
15338    Py_UNICODE *u = s1;
15339    while ((*u++ = *s2++))
15340        if (n-- == 0)
15341            break;
15342    return s1;
15343}
15344
15345Py_UNICODE*
15346Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
15347{
15348    Py_UNICODE *u1 = s1;
15349    u1 += Py_UNICODE_strlen(u1);
15350    Py_UNICODE_strcpy(u1, s2);
15351    return s1;
15352}
15353
15354int
15355Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
15356{
15357    while (*s1 && *s2 && *s1 == *s2)
15358        s1++, s2++;
15359    if (*s1 && *s2)
15360        return (*s1 < *s2) ? -1 : +1;
15361    if (*s1)
15362        return 1;
15363    if (*s2)
15364        return -1;
15365    return 0;
15366}
15367
15368int
15369Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
15370{
15371    Py_UNICODE u1, u2;
15372    for (; n != 0; n--) {
15373        u1 = *s1;
15374        u2 = *s2;
15375        if (u1 != u2)
15376            return (u1 < u2) ? -1 : +1;
15377        if (u1 == '\0')
15378            return 0;
15379        s1++;
15380        s2++;
15381    }
15382    return 0;
15383}
15384
15385Py_UNICODE*
15386Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
15387{
15388    const Py_UNICODE *p;
15389    for (p = s; *p; p++)
15390        if (*p == c)
15391            return (Py_UNICODE*)p;
15392    return NULL;
15393}
15394
15395Py_UNICODE*
15396Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
15397{
15398    const Py_UNICODE *p;
15399    p = s + Py_UNICODE_strlen(s);
15400    while (p != s) {
15401        p--;
15402        if (*p == c)
15403            return (Py_UNICODE*)p;
15404    }
15405    return NULL;
15406}
15407
15408Py_UNICODE*
15409PyUnicode_AsUnicodeCopy(PyObject *unicode)
15410{
15411    Py_UNICODE *u, *copy;
15412    Py_ssize_t len, size;
15413
15414    if (!PyUnicode_Check(unicode)) {
15415        PyErr_BadArgument();
15416        return NULL;
15417    }
15418    u = PyUnicode_AsUnicodeAndSize(unicode, &len);
15419    if (u == NULL)
15420        return NULL;
15421    /* Ensure we won't overflow the size. */
15422    if (len > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
15423        PyErr_NoMemory();
15424        return NULL;
15425    }
15426    size = len + 1; /* copy the null character */
15427    size *= sizeof(Py_UNICODE);
15428    copy = PyMem_Malloc(size);
15429    if (copy == NULL) {
15430        PyErr_NoMemory();
15431        return NULL;
15432    }
15433    memcpy(copy, u, size);
15434    return copy;
15435}
15436
15437/* A _string module, to export formatter_parser and formatter_field_name_split
15438   to the string.Formatter class implemented in Python. */
15439
15440static PyMethodDef _string_methods[] = {
15441    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
15442     METH_O, PyDoc_STR("split the argument as a field name")},
15443    {"formatter_parser", (PyCFunction) formatter_parser,
15444     METH_O, PyDoc_STR("parse the argument as a format string")},
15445    {NULL, NULL}
15446};
15447
15448static struct PyModuleDef _string_module = {
15449    PyModuleDef_HEAD_INIT,
15450    "_string",
15451    PyDoc_STR("string helper module"),
15452    0,
15453    _string_methods,
15454    NULL,
15455    NULL,
15456    NULL,
15457    NULL
15458};
15459
15460PyMODINIT_FUNC
15461PyInit__string(void)
15462{
15463    return PyModule_Create(&_string_module);
15464}
15465
15466
15467#ifdef __cplusplus
15468}
15469#endif
15470