1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
52/* Limit for the Unicode object free list */
53
54#define PyUnicode_MAXFREELIST       1024
55
56/* Limit for the Unicode object free list stay alive optimization.
57
58   The implementation will keep allocated Unicode memory intact for
59   all objects on the free list having a size less than this
60   limit. This reduces malloc() overhead for small Unicode objects.
61
62   At worst this will result in PyUnicode_MAXFREELIST *
63   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
64   malloc()-overhead) bytes of unused garbage.
65
66   Setting the limit to 0 effectively turns the feature off.
67
68   Note: This is an experimental feature ! If you get core dumps when
69   using Unicode objects, turn this feature off.
70
71*/
72
73#define KEEPALIVE_SIZE_LIMIT       9
74
75/* Endianness switches; defaults to little endian */
76
77#ifdef WORDS_BIGENDIAN
78# define BYTEORDER_IS_BIG_ENDIAN
79#else
80# define BYTEORDER_IS_LITTLE_ENDIAN
81#endif
82
83/* --- Globals ------------------------------------------------------------
84
85NOTE: In the interpreter's initialization phase, some globals are currently
86      initialized dynamically as needed. In the process Unicode objects may
87      be created before the Unicode type is ready.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* Free list for Unicode objects */
97static PyUnicodeObject *free_list = NULL;
98static int numfree = 0;
99
100/* The empty Unicode object is shared to improve performance. */
101static PyUnicodeObject *unicode_empty = NULL;
102
103#define _Py_RETURN_UNICODE_EMPTY()                      \
104    do {                                                \
105        if (unicode_empty != NULL)                      \
106            Py_INCREF(unicode_empty);                   \
107        else {                                          \
108            unicode_empty = _PyUnicode_New(0);          \
109            if (unicode_empty != NULL)                  \
110                Py_INCREF(unicode_empty);               \
111        }                                               \
112        return (PyObject *)unicode_empty;               \
113    } while (0)
114
115/* Single character Unicode strings in the Latin-1 range are being
116   shared as well. */
117static PyUnicodeObject *unicode_latin1[256] = {NULL};
118
119/* Default encoding to use and assume when NULL is passed as encoding
120   parameter; it is initialized by _PyUnicode_Init().
121
122   Always use the PyUnicode_SetDefaultEncoding() and
123   PyUnicode_GetDefaultEncoding() APIs to access this global.
124
125*/
126static char unicode_default_encoding[100 + 1] = "ascii";
127
128/* Fast detection of the most frequent whitespace characters */
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   character counts as whitespace, 0 otherwise.  Sixteen entries per row;
   the row's first code point is given on the left. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x09 TAB, 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR */
               0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FS, 0x1D GS, 0x1E RS, 0x1F US */
               0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20: 0x20 SPACE */
               1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
158
159/* Same for linebreaks */
/* Lookup table for the 128 ASCII code points: an entry is 1 when the
   character is treated as a line break, 0 otherwise.  The table is only
   ever read, so it is declared const.  Sixteen entries per row; the
   row's first code point is given on the left. */
static const unsigned char ascii_linebreak[] = {
    /* 0x00 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x08: 0x0A LF, 0x0B VT, 0x0C FF, 0x0D CR */
               0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10 */ 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x18: 0x1C FS, 0x1D GS, 0x1E RS */
               0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70 */ 0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188Py_UNICODE
189PyUnicode_GetMax(void)
190{
191#ifdef Py_UNICODE_WIDE
192    return 0x10FFFF;
193#else
194    /* This is actually an illegal character, so it should
195       not be passed to unichr. */
196    return 0xFFFF;
197#endif
198}
199
200/* --- Bloom Filters ----------------------------------------------------- */
201
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the low
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
218#define BLOOM_MASK unsigned long
219
220static BLOOM_MASK bloom_linebreak = ~(BLOOM_MASK)0;
221
222#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
224
225#define BLOOM_LINEBREAK(ch)                                             \
226    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
227     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245    Py_ssize_t i;
246
247    for (i = 0; i < setlen; i++)
248        if (set[i] == chr)
249            return 1;
250
251    return 0;
252}
253
/* True when chr is a member of set: the cheap bloom mask test runs first
   and the exact (linear) unicode_member() check only when the mask says
   "maybe".  The expansion is parenthesized as a whole so the macro can be
   negated or combined with other operators without changing its meaning.
   Note: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257/* --- Unicode Object ----------------------------------------------------- */
258
259static
260int unicode_resize(register PyUnicodeObject *unicode,
261                   Py_ssize_t length)
262{
263    void *oldstr;
264
265    /* Shortcut if there's nothing much to do. */
266    if (unicode->length == length)
267        goto reset;
268
269    /* Resizing shared object (unicode_empty or single character
270       objects) in-place is not allowed. Use PyUnicode_Resize()
271       instead ! */
272
273    if (unicode == unicode_empty ||
274        (unicode->length == 1 &&
275         unicode->str[0] < 256U &&
276         unicode_latin1[unicode->str[0]] == unicode)) {
277        PyErr_SetString(PyExc_SystemError,
278                        "can't resize shared unicode objects");
279        return -1;
280    }
281
282    /* We allocate one more byte to make sure the string is Ux0000 terminated.
283       The overallocation is also used by fastsearch, which assumes that it's
284       safe to look at str[length] (without making any assumptions about what
285       it contains). */
286
287    oldstr = unicode->str;
288    unicode->str = PyObject_REALLOC(unicode->str,
289                                    sizeof(Py_UNICODE) * (length + 1));
290    if (!unicode->str) {
291        unicode->str = (Py_UNICODE *)oldstr;
292        PyErr_NoMemory();
293        return -1;
294    }
295    unicode->str[length] = 0;
296    unicode->length = length;
297
298  reset:
299    /* Reset the object caches */
300    if (unicode->defenc) {
301        Py_CLEAR(unicode->defenc);
302    }
303    unicode->hash = -1;
304
305    return 0;
306}
307
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code relies on that.
310
311   XXX This allocator could further be enhanced by assuring that the
312   free list never reduces its size below 1.
313
314*/
315
316static
317PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
318{
319    register PyUnicodeObject *unicode;
320
321    /* Optimization for empty strings */
322    if (length == 0 && unicode_empty != NULL) {
323        Py_INCREF(unicode_empty);
324        return unicode_empty;
325    }
326
327    /* Ensure we won't overflow the size. */
328    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
329        return (PyUnicodeObject *)PyErr_NoMemory();
330    }
331
332    /* Unicode freelist & memory allocation */
333    if (free_list) {
334        unicode = free_list;
335        free_list = *(PyUnicodeObject **)unicode;
336        numfree--;
337        if (unicode->str) {
338            /* Keep-Alive optimization: we only upsize the buffer,
339               never downsize it. */
340            if ((unicode->length < length) &&
341                unicode_resize(unicode, length) < 0) {
342                PyObject_DEL(unicode->str);
343                unicode->str = NULL;
344            }
345        }
346        else {
347            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
348            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
349        }
350        PyObject_INIT(unicode, &PyUnicode_Type);
351    }
352    else {
353        size_t new_size;
354        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
355        if (unicode == NULL)
356            return NULL;
357        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
358        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
359    }
360
361    if (!unicode->str) {
362        PyErr_NoMemory();
363        goto onError;
364    }
365    /* Initialize the first element to guard against cases where
366     * the caller fails before initializing str -- unicode_resize()
367     * reads str[0], and the Keep-Alive optimization can keep memory
368     * allocated for str alive across a call to unicode_dealloc(unicode).
369     * We don't want unicode_resize to read uninitialized memory in
370     * that case.
371     */
372    unicode->str[0] = 0;
373    unicode->str[length] = 0;
374    unicode->length = length;
375    unicode->hash = -1;
376    unicode->defenc = NULL;
377    return unicode;
378
379  onError:
380    /* XXX UNREF/NEWREF interface should be more symmetrical */
381    _Py_DEC_REFTOTAL;
382    _Py_ForgetReference((PyObject *)unicode);
383    PyObject_Del(unicode);
384    return NULL;
385}
386
387static
388void unicode_dealloc(register PyUnicodeObject *unicode)
389{
390    if (PyUnicode_CheckExact(unicode) &&
391        numfree < PyUnicode_MAXFREELIST) {
392        /* Keep-Alive optimization */
393        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
394            PyObject_DEL(unicode->str);
395            unicode->str = NULL;
396            unicode->length = 0;
397        }
398        if (unicode->defenc) {
399            Py_CLEAR(unicode->defenc);
400        }
401        /* Add to free list */
402        *(PyUnicodeObject **)unicode = free_list;
403        free_list = unicode;
404        numfree++;
405    }
406    else {
407        PyObject_DEL(unicode->str);
408        Py_XDECREF(unicode->defenc);
409        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
410    }
411}
412
413static
414int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
415{
416    register PyUnicodeObject *v;
417
418    /* Argument checks */
419    if (unicode == NULL) {
420        PyErr_BadInternalCall();
421        return -1;
422    }
423    v = *unicode;
424    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
425        PyErr_BadInternalCall();
426        return -1;
427    }
428
429    /* Resizing unicode_empty and single character objects is not
430       possible since these are being shared. We simply return a fresh
431       copy with the same Unicode content. */
432    if (v->length != length &&
433        (v == unicode_empty || v->length == 1)) {
434        PyUnicodeObject *w = _PyUnicode_New(length);
435        if (w == NULL)
436            return -1;
437        Py_UNICODE_COPY(w->str, v->str,
438                        length < v->length ? length : v->length);
439        Py_DECREF(*unicode);
440        *unicode = w;
441        return 0;
442    }
443
444    /* Note that we don't have to modify *unicode for unshared Unicode
445       objects, since we can modify them in-place. */
446    return unicode_resize(v, length);
447}
448
449int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
450{
451    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
452}
453
454PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
455                                Py_ssize_t size)
456{
457    PyUnicodeObject *unicode;
458
459    /* If the Unicode data is known at construction time, we can apply
460       some optimizations which share commonly used objects. */
461    if (u != NULL) {
462
463        /* Optimization for empty strings */
464        if (size == 0)
465            _Py_RETURN_UNICODE_EMPTY();
466
467        /* Single character Unicode objects in the Latin-1 range are
468           shared when using this constructor */
469        if (size == 1 && *u < 256) {
470            unicode = unicode_latin1[*u];
471            if (!unicode) {
472                unicode = _PyUnicode_New(1);
473                if (!unicode)
474                    return NULL;
475                unicode->str[0] = *u;
476                unicode_latin1[*u] = unicode;
477            }
478            Py_INCREF(unicode);
479            return (PyObject *)unicode;
480        }
481    }
482
483    unicode = _PyUnicode_New(size);
484    if (!unicode)
485        return NULL;
486
487    /* Copy the Unicode data into the new object */
488    if (u != NULL)
489        Py_UNICODE_COPY(unicode->str, u, size);
490
491    return (PyObject *)unicode;
492}
493
494PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
495{
496    PyUnicodeObject *unicode;
497
498    if (size < 0) {
499        PyErr_SetString(PyExc_SystemError,
500                        "Negative size passed to PyUnicode_FromStringAndSize");
501        return NULL;
502    }
503
504    /* If the Unicode data is known at construction time, we can apply
505       some optimizations which share commonly used objects.
506       Also, this means the input must be UTF-8, so fall back to the
507       UTF-8 decoder at the end. */
508    if (u != NULL) {
509
510        /* Optimization for empty strings */
511        if (size == 0)
512            _Py_RETURN_UNICODE_EMPTY();
513
514        /* Single characters are shared when using this constructor.
515           Restrict to ASCII, since the input must be UTF-8. */
516        if (size == 1 && Py_CHARMASK(*u) < 128) {
517            unicode = unicode_latin1[Py_CHARMASK(*u)];
518            if (!unicode) {
519                unicode = _PyUnicode_New(1);
520                if (!unicode)
521                    return NULL;
522                unicode->str[0] = Py_CHARMASK(*u);
523                unicode_latin1[Py_CHARMASK(*u)] = unicode;
524            }
525            Py_INCREF(unicode);
526            return (PyObject *)unicode;
527        }
528
529        return PyUnicode_DecodeUTF8(u, size, NULL);
530    }
531
532    unicode = _PyUnicode_New(size);
533    if (!unicode)
534        return NULL;
535
536    return (PyObject *)unicode;
537}
538
539PyObject *PyUnicode_FromString(const char *u)
540{
541    size_t size = strlen(u);
542    if (size > PY_SSIZE_T_MAX) {
543        PyErr_SetString(PyExc_OverflowError, "input too long");
544        return NULL;
545    }
546
547    return PyUnicode_FromStringAndSize(u, size);
548}
549
550/* _Py_UNICODE_NEXT is a private macro used to retrieve the character pointed
551 * by 'ptr', possibly combining surrogate pairs on narrow builds.
552 * 'ptr' and 'end' must be Py_UNICODE*, with 'ptr' pointing at the character
553 * that should be returned and 'end' pointing to the end of the buffer.
554 * ('end' is used on narrow builds to detect a lone surrogate at the
555 * end of the buffer that should be returned unchanged.)
 * The ptr and end arguments should be side-effect free and ptr must be an lvalue.
557 * The type of the returned char is always Py_UCS4.
558 *
559 * Note: the macro advances ptr to next char, so it might have side-effects
560 *       (especially if used with other macros).
561 */
562
/* helper macros used by _Py_UNICODE_NEXT; the argument is evaluated
   twice and is parenthesized so any expression can be passed */
#define _Py_UNICODE_IS_HIGH_SURROGATE(ch) (0xD800 <= (ch) && (ch) <= 0xDBFF)
#define _Py_UNICODE_IS_LOW_SURROGATE(ch) (0xDC00 <= (ch) && (ch) <= 0xDFFF)
/* Join two surrogate characters and return a single Py_UCS4 value. */
#define _Py_UNICODE_JOIN_SURROGATES(high, low)  \
    (((((Py_UCS4)(high) & 0x03FF) << 10) |      \
      ((Py_UCS4)(low) & 0x03FF)) + 0x10000)

#ifdef Py_UNICODE_WIDE
/* Wide build: every Py_UNICODE is a complete code point. */
#define _Py_UNICODE_NEXT(ptr, end) ((Py_UCS4)*(ptr)++)
#else
/* Narrow build: a high surrogate is combined with the following low
   surrogate only when that second unit lies inside the buffer, i.e.
   ptr + 1 < end.  The bounds test comes first so that neither *ptr nor
   ptr[1] is read past 'end'; a high surrogate in the last position is
   returned unchanged as a lone surrogate. */
#define _Py_UNICODE_NEXT(ptr, end)                                      \
     ((((ptr) + 1 < (end)) &&                                           \
       _Py_UNICODE_IS_HIGH_SURROGATE(*(ptr)) &&                         \
       _Py_UNICODE_IS_LOW_SURROGATE((ptr)[1])) ?                        \
       ((ptr) += 2,_Py_UNICODE_JOIN_SURROGATES((ptr)[-2], (ptr)[-1])) : \
       (Py_UCS4)*(ptr)++)
#endif
580
581#ifdef HAVE_WCHAR_H
582
583#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
584# define CONVERT_WCHAR_TO_SURROGATES
585#endif
586
587#ifdef CONVERT_WCHAR_TO_SURROGATES
588
589/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
590   to convert from UTF32 to UTF16. */
591
592PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
593                                 Py_ssize_t size)
594{
595    PyUnicodeObject *unicode;
596    register Py_ssize_t i;
597    Py_ssize_t alloc;
598    const wchar_t *orig_w;
599
600    if (w == NULL) {
601        PyErr_BadInternalCall();
602        return NULL;
603    }
604
605    alloc = size;
606    orig_w = w;
607    for (i = size; i > 0; i--) {
608        if (*w > 0xFFFF)
609            alloc++;
610        w++;
611    }
612    w = orig_w;
613    unicode = _PyUnicode_New(alloc);
614    if (!unicode)
615        return NULL;
616
617    /* Copy the wchar_t data into the new object */
618    {
619        register Py_UNICODE *u;
620        u = PyUnicode_AS_UNICODE(unicode);
621        for (i = size; i > 0; i--) {
622            if (*w > 0xFFFF) {
623                wchar_t ordinal = *w++;
624                ordinal -= 0x10000;
625                *u++ = 0xD800 | (ordinal >> 10);
626                *u++ = 0xDC00 | (ordinal & 0x3FF);
627            }
628            else
629                *u++ = *w++;
630        }
631    }
632    return (PyObject *)unicode;
633}
634
635#else
636
637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638                                 Py_ssize_t size)
639{
640    PyUnicodeObject *unicode;
641
642    if (w == NULL) {
643        PyErr_BadInternalCall();
644        return NULL;
645    }
646
647    unicode = _PyUnicode_New(size);
648    if (!unicode)
649        return NULL;
650
651    /* Copy the wchar_t data into the new object */
652#ifdef HAVE_USABLE_WCHAR_T
653    memcpy(unicode->str, w, size * sizeof(wchar_t));
654#else
655    {
656        register Py_UNICODE *u;
657        register Py_ssize_t i;
658        u = PyUnicode_AS_UNICODE(unicode);
659        for (i = size; i > 0; i--)
660            *u++ = *w++;
661    }
662#endif
663
664    return (PyObject *)unicode;
665}
666
667#endif /* CONVERT_WCHAR_TO_SURROGATES */
668
669#undef CONVERT_WCHAR_TO_SURROGATES
670
671static void
672makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
673{
674    *fmt++ = '%';
675    if (width) {
676        if (zeropad)
677            *fmt++ = '0';
678        fmt += sprintf(fmt, "%d", width);
679    }
680    if (precision)
681        fmt += sprintf(fmt, ".%d", precision);
682    if (longflag)
683        *fmt++ = 'l';
684    else if (size_tflag) {
685        char *f = PY_FORMAT_SIZE_T;
686        while (*f)
687            *fmt++ = *f++;
688    }
689    *fmt++ = c;
690    *fmt = '\0';
691}
692
/* Append the NUL-terminated char string `string` to the output position
   `s`, widening each byte as an unsigned char.  Relies on two variables
   of the calling function: `copy` (const char * scratch cursor, left
   pointing at the terminator) and `s` (output pointer, advanced past
   the appended characters). */
#define appendstring(string) \
    do { \
        copy = string; \
        while (*copy) { \
            *s++ = (unsigned char)*copy; \
            copy++; \
        } \
    } while (0)
699
700PyObject *
701PyUnicode_FromFormatV(const char *format, va_list vargs)
702{
703    va_list count;
704    Py_ssize_t callcount = 0;
705    PyObject **callresults = NULL;
706    PyObject **callresult = NULL;
707    Py_ssize_t n = 0;
708    int width = 0;
709    int precision = 0;
710    int zeropad;
711    const char* f;
712    Py_UNICODE *s;
713    PyObject *string;
714    /* used by sprintf */
715    char buffer[21];
716    /* use abuffer instead of buffer, if we need more space
717     * (which can happen if there's a format specifier with width). */
718    char *abuffer = NULL;
719    char *realbuffer;
720    Py_ssize_t abuffersize = 0;
721    char fmt[60]; /* should be enough for %0width.precisionld */
722    const char *copy;
723
724#ifdef VA_LIST_IS_ARRAY
725    Py_MEMCPY(count, vargs, sizeof(va_list));
726#else
727#ifdef  __va_copy
728    __va_copy(count, vargs);
729#else
730    count = vargs;
731#endif
732#endif
733     /* step 1: count the number of %S/%R/%s format specifications
734      * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
735      * objects once during step 3 and put the result in an array) */
736    for (f = format; *f; f++) {
737         if (*f == '%') {
738             f++;
739             while (*f && *f != '%' && !isalpha((unsigned)*f))
740                 f++;
741             if (!*f)
742                 break;
743             if (*f == 's' || *f=='S' || *f=='R')
744                 ++callcount;
745         }
746    }
747    /* step 2: allocate memory for the results of
748     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
749    if (callcount) {
750        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
751        if (!callresults) {
752            PyErr_NoMemory();
753            return NULL;
754        }
755        callresult = callresults;
756    }
757    /* step 3: figure out how large a buffer we need */
758    for (f = format; *f; f++) {
759        if (*f == '%') {
760            const char* p = f++;
761            width = 0;
762            while (isdigit((unsigned)*f))
763                width = (width*10) + *f++ - '0';
764            precision = 0;
765            if (*f == '.') {
766                f++;
767                while (isdigit((unsigned)*f))
768                    precision = (precision*10) + *f++ - '0';
769            }
770
771            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
772             * they don't affect the amount of space we reserve.
773             */
774            if ((*f == 'l' || *f == 'z') &&
775                (f[1] == 'd' || f[1] == 'u'))
776                ++f;
777
778            switch (*f) {
779            case 'c':
780            {
781                int ordinal = va_arg(count, int);
782#ifdef Py_UNICODE_WIDE
783                if (ordinal < 0 || ordinal > 0x10ffff) {
784                    PyErr_SetString(PyExc_OverflowError,
785                                    "%c arg not in range(0x110000) "
786                                    "(wide Python build)");
787                    goto fail;
788                }
789#else
790                if (ordinal < 0 || ordinal > 0xffff) {
791                    PyErr_SetString(PyExc_OverflowError,
792                                    "%c arg not in range(0x10000) "
793                                    "(narrow Python build)");
794                    goto fail;
795                }
796#endif
797                /* fall through... */
798            }
799            case '%':
800                n++;
801                break;
802            case 'd': case 'u': case 'i': case 'x':
803                (void) va_arg(count, int);
804                if (width < precision)
805                    width = precision;
806                /* 20 bytes is enough to hold a 64-bit
807                   integer.  Decimal takes the most space.
808                   This isn't enough for octal.
809                   If a width is specified we need more
810                   (which we allocate later). */
811                if (width < 20)
812                    width = 20;
813                n += width;
814                if (abuffersize < width)
815                    abuffersize = width;
816                break;
817            case 's':
818            {
819                /* UTF-8 */
820                const char *s = va_arg(count, const char*);
821                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
822                if (!str)
823                    goto fail;
824                n += PyUnicode_GET_SIZE(str);
825                /* Remember the str and switch to the next slot */
826                *callresult++ = str;
827                break;
828            }
829            case 'U':
830            {
831                PyObject *obj = va_arg(count, PyObject *);
832                assert(obj && PyUnicode_Check(obj));
833                n += PyUnicode_GET_SIZE(obj);
834                break;
835            }
836            case 'V':
837            {
838                PyObject *obj = va_arg(count, PyObject *);
839                const char *str = va_arg(count, const char *);
840                assert(obj || str);
841                assert(!obj || PyUnicode_Check(obj));
842                if (obj)
843                    n += PyUnicode_GET_SIZE(obj);
844                else
845                    n += strlen(str);
846                break;
847            }
848            case 'S':
849            {
850                PyObject *obj = va_arg(count, PyObject *);
851                PyObject *str;
852                assert(obj);
853                str = PyObject_Str(obj);
854                if (!str)
855                    goto fail;
856                n += PyString_GET_SIZE(str);
857                /* Remember the str and switch to the next slot */
858                *callresult++ = str;
859                break;
860            }
861            case 'R':
862            {
863                PyObject *obj = va_arg(count, PyObject *);
864                PyObject *repr;
865                assert(obj);
866                repr = PyObject_Repr(obj);
867                if (!repr)
868                    goto fail;
869                n += PyUnicode_GET_SIZE(repr);
870                /* Remember the repr and switch to the next slot */
871                *callresult++ = repr;
872                break;
873            }
874            case 'p':
875                (void) va_arg(count, int);
876                /* maximum 64-bit pointer representation:
877                 * 0xffffffffffffffff
878                 * so 19 characters is enough.
879                 * XXX I count 18 -- what's the extra for?
880                 */
881                n += 19;
882                break;
883            default:
884                /* if we stumble upon an unknown
885                   formatting code, copy the rest of
886                   the format string to the output
887                   string. (we cannot just skip the
888                   code, since there's no way to know
889                   what's in the argument list) */
890                n += strlen(p);
891                goto expand;
892            }
893        } else
894            n++;
895    }
896  expand:
897    if (abuffersize > 20) {
898        /* add 1 for sprintf's trailing null byte */
899        abuffer = PyObject_Malloc(abuffersize + 1);
900        if (!abuffer) {
901            PyErr_NoMemory();
902            goto fail;
903        }
904        realbuffer = abuffer;
905    }
906    else
907        realbuffer = buffer;
908    /* step 4: fill the buffer */
909    /* Since we've analyzed how much space we need for the worst case,
910       we don't have to resize the string.
911       There can be no errors beyond this point. */
912    string = PyUnicode_FromUnicode(NULL, n);
913    if (!string)
914        goto fail;
915
916    s = PyUnicode_AS_UNICODE(string);
917    callresult = callresults;
918
919    for (f = format; *f; f++) {
920        if (*f == '%') {
921            const char* p = f++;
922            int longflag = 0;
923            int size_tflag = 0;
924            zeropad = (*f == '0');
925            /* parse the width.precision part */
926            width = 0;
927            while (isdigit((unsigned)*f))
928                width = (width*10) + *f++ - '0';
929            precision = 0;
930            if (*f == '.') {
931                f++;
932                while (isdigit((unsigned)*f))
933                    precision = (precision*10) + *f++ - '0';
934            }
935            /* handle the long flag, but only for %ld and %lu.
936               others can be added when necessary. */
937            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
938                longflag = 1;
939                ++f;
940            }
941            /* handle the size_t flag. */
942            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
943                size_tflag = 1;
944                ++f;
945            }
946
947            switch (*f) {
948            case 'c':
949                *s++ = va_arg(vargs, int);
950                break;
951            case 'd':
952                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
953                if (longflag)
954                    sprintf(realbuffer, fmt, va_arg(vargs, long));
955                else if (size_tflag)
956                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
957                else
958                    sprintf(realbuffer, fmt, va_arg(vargs, int));
959                appendstring(realbuffer);
960                break;
961            case 'u':
962                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
963                if (longflag)
964                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
965                else if (size_tflag)
966                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
967                else
968                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
969                appendstring(realbuffer);
970                break;
971            case 'i':
972                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
973                sprintf(realbuffer, fmt, va_arg(vargs, int));
974                appendstring(realbuffer);
975                break;
976            case 'x':
977                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
978                sprintf(realbuffer, fmt, va_arg(vargs, int));
979                appendstring(realbuffer);
980                break;
981            case 's':
982            {
983                /* unused, since we already have the result */
984                (void) va_arg(vargs, char *);
985                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
986                                PyUnicode_GET_SIZE(*callresult));
987                s += PyUnicode_GET_SIZE(*callresult);
988                /* We're done with the unicode()/repr() => forget it */
989                Py_DECREF(*callresult);
990                /* switch to next unicode()/repr() result */
991                ++callresult;
992                break;
993            }
994            case 'U':
995            {
996                PyObject *obj = va_arg(vargs, PyObject *);
997                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
998                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
999                s += size;
1000                break;
1001            }
1002            case 'V':
1003            {
1004                PyObject *obj = va_arg(vargs, PyObject *);
1005                const char *str = va_arg(vargs, const char *);
1006                if (obj) {
1007                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1008                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1009                    s += size;
1010                } else {
1011                    appendstring(str);
1012                }
1013                break;
1014            }
1015            case 'S':
1016            case 'R':
1017            {
1018                const char *str = PyString_AS_STRING(*callresult);
1019                /* unused, since we already have the result */
1020                (void) va_arg(vargs, PyObject *);
1021                appendstring(str);
1022                /* We're done with the unicode()/repr() => forget it */
1023                Py_DECREF(*callresult);
1024                /* switch to next unicode()/repr() result */
1025                ++callresult;
1026                break;
1027            }
1028            case 'p':
1029                sprintf(buffer, "%p", va_arg(vargs, void*));
1030                /* %p is ill-defined:  ensure leading 0x. */
1031                if (buffer[1] == 'X')
1032                    buffer[1] = 'x';
1033                else if (buffer[1] != 'x') {
1034                    memmove(buffer+2, buffer, strlen(buffer)+1);
1035                    buffer[0] = '0';
1036                    buffer[1] = 'x';
1037                }
1038                appendstring(buffer);
1039                break;
1040            case '%':
1041                *s++ = '%';
1042                break;
1043            default:
1044                appendstring(p);
1045                goto end;
1046            }
1047        } else
1048            *s++ = *f;
1049    }
1050
1051  end:
1052    if (callresults)
1053        PyObject_Free(callresults);
1054    if (abuffer)
1055        PyObject_Free(abuffer);
1056    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1057    return string;
1058  fail:
1059    if (callresults) {
1060        PyObject **callresult2 = callresults;
1061        while (callresult2 < callresult) {
1062            Py_DECREF(*callresult2);
1063            ++callresult2;
1064        }
1065        PyObject_Free(callresults);
1066    }
1067    if (abuffer)
1068        PyObject_Free(abuffer);
1069    return NULL;
1070}
1071
1072#undef appendstring
1073
1074PyObject *
1075PyUnicode_FromFormat(const char *format, ...)
1076{
1077    PyObject* ret;
1078    va_list vargs;
1079
1080#ifdef HAVE_STDARG_PROTOTYPES
1081    va_start(vargs, format);
1082#else
1083    va_start(vargs);
1084#endif
1085    ret = PyUnicode_FromFormatV(format, vargs);
1086    va_end(vargs);
1087    return ret;
1088}
1089
1090Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1091                                wchar_t *w,
1092                                Py_ssize_t size)
1093{
1094    if (unicode == NULL) {
1095        PyErr_BadInternalCall();
1096        return -1;
1097    }
1098
1099    /* If possible, try to copy the 0-termination as well */
1100    if (size > PyUnicode_GET_SIZE(unicode))
1101        size = PyUnicode_GET_SIZE(unicode) + 1;
1102
1103#ifdef HAVE_USABLE_WCHAR_T
1104    memcpy(w, unicode->str, size * sizeof(wchar_t));
1105#else
1106    {
1107        register Py_UNICODE *u;
1108        register Py_ssize_t i;
1109        u = PyUnicode_AS_UNICODE(unicode);
1110        for (i = size; i > 0; i--)
1111            *w++ = *u++;
1112    }
1113#endif
1114
1115    if (size > PyUnicode_GET_SIZE(unicode))
1116        return PyUnicode_GET_SIZE(unicode);
1117    else
1118        return size;
1119}
1120
1121#endif
1122
1123PyObject *PyUnicode_FromOrdinal(int ordinal)
1124{
1125    Py_UNICODE s[1];
1126
1127#ifdef Py_UNICODE_WIDE
1128    if (ordinal < 0 || ordinal > 0x10ffff) {
1129        PyErr_SetString(PyExc_ValueError,
1130                        "unichr() arg not in range(0x110000) "
1131                        "(wide Python build)");
1132        return NULL;
1133    }
1134#else
1135    if (ordinal < 0 || ordinal > 0xffff) {
1136        PyErr_SetString(PyExc_ValueError,
1137                        "unichr() arg not in range(0x10000) "
1138                        "(narrow Python build)");
1139        return NULL;
1140    }
1141#endif
1142
1143    s[0] = (Py_UNICODE)ordinal;
1144    return PyUnicode_FromUnicode(s, 1);
1145}
1146
1147PyObject *PyUnicode_FromObject(register PyObject *obj)
1148{
1149    /* XXX Perhaps we should make this API an alias of
1150       PyObject_Unicode() instead ?! */
1151    if (PyUnicode_CheckExact(obj)) {
1152        Py_INCREF(obj);
1153        return obj;
1154    }
1155    if (PyUnicode_Check(obj)) {
1156        /* For a Unicode subtype that's not a Unicode object,
1157           return a true Unicode object with the same data. */
1158        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1159                                     PyUnicode_GET_SIZE(obj));
1160    }
1161    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1162}
1163
1164PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1165                                      const char *encoding,
1166                                      const char *errors)
1167{
1168    const char *s = NULL;
1169    Py_ssize_t len;
1170    PyObject *v;
1171
1172    if (obj == NULL) {
1173        PyErr_BadInternalCall();
1174        return NULL;
1175    }
1176
1177#if 0
1178    /* For b/w compatibility we also accept Unicode objects provided
1179       that no encodings is given and then redirect to
1180       PyObject_Unicode() which then applies the additional logic for
1181       Unicode subclasses.
1182
1183       NOTE: This API should really only be used for object which
1184       represent *encoded* Unicode !
1185
1186    */
1187    if (PyUnicode_Check(obj)) {
1188        if (encoding) {
1189            PyErr_SetString(PyExc_TypeError,
1190                            "decoding Unicode is not supported");
1191            return NULL;
1192        }
1193        return PyObject_Unicode(obj);
1194    }
1195#else
1196    if (PyUnicode_Check(obj)) {
1197        PyErr_SetString(PyExc_TypeError,
1198                        "decoding Unicode is not supported");
1199        return NULL;
1200    }
1201#endif
1202
1203    /* Coerce object */
1204    if (PyString_Check(obj)) {
1205        s = PyString_AS_STRING(obj);
1206        len = PyString_GET_SIZE(obj);
1207    }
1208    else if (PyByteArray_Check(obj)) {
1209        /* Python 2.x specific */
1210        PyErr_Format(PyExc_TypeError,
1211                     "decoding bytearray is not supported");
1212        return NULL;
1213    }
1214    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1215        /* Overwrite the error message with something more useful in
1216           case of a TypeError. */
1217        if (PyErr_ExceptionMatches(PyExc_TypeError))
1218            PyErr_Format(PyExc_TypeError,
1219                         "coercing to Unicode: need string or buffer, "
1220                         "%.80s found",
1221                         Py_TYPE(obj)->tp_name);
1222        goto onError;
1223    }
1224
1225    /* Convert to Unicode */
1226    if (len == 0)
1227        _Py_RETURN_UNICODE_EMPTY();
1228
1229    v = PyUnicode_Decode(s, len, encoding, errors);
1230    return v;
1231
1232  onError:
1233    return NULL;
1234}
1235
1236PyObject *PyUnicode_Decode(const char *s,
1237                           Py_ssize_t size,
1238                           const char *encoding,
1239                           const char *errors)
1240{
1241    PyObject *buffer = NULL, *unicode;
1242
1243    if (encoding == NULL)
1244        encoding = PyUnicode_GetDefaultEncoding();
1245
1246    /* Shortcuts for common default encodings */
1247    if (strcmp(encoding, "utf-8") == 0)
1248        return PyUnicode_DecodeUTF8(s, size, errors);
1249    else if (strcmp(encoding, "latin-1") == 0)
1250        return PyUnicode_DecodeLatin1(s, size, errors);
1251#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1252    else if (strcmp(encoding, "mbcs") == 0)
1253        return PyUnicode_DecodeMBCS(s, size, errors);
1254#endif
1255    else if (strcmp(encoding, "ascii") == 0)
1256        return PyUnicode_DecodeASCII(s, size, errors);
1257
1258    /* Decode via the codec registry */
1259    buffer = PyBuffer_FromMemory((void *)s, size);
1260    if (buffer == NULL)
1261        goto onError;
1262    unicode = PyCodec_Decode(buffer, encoding, errors);
1263    if (unicode == NULL)
1264        goto onError;
1265    if (!PyUnicode_Check(unicode)) {
1266        PyErr_Format(PyExc_TypeError,
1267                     "decoder did not return an unicode object (type=%.400s)",
1268                     Py_TYPE(unicode)->tp_name);
1269        Py_DECREF(unicode);
1270        goto onError;
1271    }
1272    Py_DECREF(buffer);
1273    return unicode;
1274
1275  onError:
1276    Py_XDECREF(buffer);
1277    return NULL;
1278}
1279
1280PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1281                                    const char *encoding,
1282                                    const char *errors)
1283{
1284    PyObject *v;
1285
1286    if (!PyUnicode_Check(unicode)) {
1287        PyErr_BadArgument();
1288        goto onError;
1289    }
1290
1291    if (encoding == NULL)
1292        encoding = PyUnicode_GetDefaultEncoding();
1293
1294    /* Decode via the codec registry */
1295    v = PyCodec_Decode(unicode, encoding, errors);
1296    if (v == NULL)
1297        goto onError;
1298    return v;
1299
1300  onError:
1301    return NULL;
1302}
1303
1304PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1305                           Py_ssize_t size,
1306                           const char *encoding,
1307                           const char *errors)
1308{
1309    PyObject *v, *unicode;
1310
1311    unicode = PyUnicode_FromUnicode(s, size);
1312    if (unicode == NULL)
1313        return NULL;
1314    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1315    Py_DECREF(unicode);
1316    return v;
1317}
1318
1319PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1320                                    const char *encoding,
1321                                    const char *errors)
1322{
1323    PyObject *v;
1324
1325    if (!PyUnicode_Check(unicode)) {
1326        PyErr_BadArgument();
1327        goto onError;
1328    }
1329
1330    if (encoding == NULL)
1331        encoding = PyUnicode_GetDefaultEncoding();
1332
1333    /* Encode via the codec registry */
1334    v = PyCodec_Encode(unicode, encoding, errors);
1335    if (v == NULL)
1336        goto onError;
1337    return v;
1338
1339  onError:
1340    return NULL;
1341}
1342
1343PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1344                                    const char *encoding,
1345                                    const char *errors)
1346{
1347    PyObject *v;
1348
1349    if (!PyUnicode_Check(unicode)) {
1350        PyErr_BadArgument();
1351        goto onError;
1352    }
1353
1354    if (encoding == NULL)
1355        encoding = PyUnicode_GetDefaultEncoding();
1356
1357    /* Shortcuts for common default encodings */
1358    if (errors == NULL) {
1359        if (strcmp(encoding, "utf-8") == 0)
1360            return PyUnicode_AsUTF8String(unicode);
1361        else if (strcmp(encoding, "latin-1") == 0)
1362            return PyUnicode_AsLatin1String(unicode);
1363#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1364        else if (strcmp(encoding, "mbcs") == 0)
1365            return PyUnicode_AsMBCSString(unicode);
1366#endif
1367        else if (strcmp(encoding, "ascii") == 0)
1368            return PyUnicode_AsASCIIString(unicode);
1369    }
1370
1371    /* Encode via the codec registry */
1372    v = PyCodec_Encode(unicode, encoding, errors);
1373    if (v == NULL)
1374        goto onError;
1375    if (!PyString_Check(v)) {
1376        PyErr_Format(PyExc_TypeError,
1377                     "encoder did not return a string object (type=%.400s)",
1378                     Py_TYPE(v)->tp_name);
1379        Py_DECREF(v);
1380        goto onError;
1381    }
1382    return v;
1383
1384  onError:
1385    return NULL;
1386}
1387
1388PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1389                                            const char *errors)
1390{
1391    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1392
1393    if (v)
1394        return v;
1395    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1396    if (v && errors == NULL)
1397        ((PyUnicodeObject *)unicode)->defenc = v;
1398    return v;
1399}
1400
1401Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1402{
1403    if (!PyUnicode_Check(unicode)) {
1404        PyErr_BadArgument();
1405        goto onError;
1406    }
1407    return PyUnicode_AS_UNICODE(unicode);
1408
1409  onError:
1410    return NULL;
1411}
1412
1413Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1414{
1415    if (!PyUnicode_Check(unicode)) {
1416        PyErr_BadArgument();
1417        goto onError;
1418    }
1419    return PyUnicode_GET_SIZE(unicode);
1420
1421  onError:
1422    return -1;
1423}
1424
/* Return the name of the current default encoding.  The pointer refers
   to the unicode_default_encoding storage itself, not to a copy. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    return unicode_default_encoding;
}
1429
1430int PyUnicode_SetDefaultEncoding(const char *encoding)
1431{
1432    PyObject *v;
1433
1434    /* Make sure the encoding is valid. As side effect, this also
1435       loads the encoding into the codec registry cache. */
1436    v = _PyCodec_Lookup(encoding);
1437    if (v == NULL)
1438        goto onError;
1439    Py_DECREF(v);
1440    strncpy(unicode_default_encoding,
1441            encoding,
1442            sizeof(unicode_default_encoding) - 1);
1443    return 0;
1444
1445  onError:
1446    return -1;
1447}
1448
1449/* error handling callback helper:
1450   build arguments, call the callback and check the arguments,
1451   if no exception occurred, copy the replacement to the output
1452   and adjust various state variables.
1453   return 0 on success, -1 on error
1454*/
1455
/* Call the decoding error handler and splice its replacement into the
   output buffer.

   errors           error scheme name; only used to look up the handler
                    while *errorHandler is still NULL
   errorHandler     in/out: the handler callable, looked up on first use
   encoding, reason used to create or update the exception object
   input, insize    the complete input buffer and its length
   startinpos,
   endinpos         bounds of the offending input; *endinpos is
                    overwritten with the resume position chosen by the
                    handler
   exceptionObject  in/out: exception instance, created on first use and
                    updated in place afterwards
   inptr            out: set to input + resume position
   output           in/out: result object, resized here if too small
   outpos, outptr   in/out: write offset and write pointer into *output,
                    both advanced past the replacement text

   Returns 0 on success and -1 on failure. */
static
int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
                                     const char *encoding, const char *reason,
                                     const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
{
    /* PyArg_ParseTuple format; the text after the ';' (offset 4) doubles
       as the error message for a non-tuple result below. */
    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";

    PyObject *restuple = NULL;
    PyObject *repunicode = NULL;
    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
    Py_ssize_t requiredsize;
    Py_ssize_t newpos;
    Py_UNICODE *repptr;
    Py_ssize_t repsize;
    int res = -1;

    /* Look the handler up only once; the caller keeps it between calls */
    if (*errorHandler == NULL) {
        *errorHandler = PyCodec_LookupError(errors);
        if (*errorHandler == NULL)
            goto onError;
    }

    /* Create the exception on the first error, reuse it afterwards */
    if (*exceptionObject == NULL) {
        *exceptionObject = PyUnicodeDecodeError_Create(
            encoding, input, insize, *startinpos, *endinpos, reason);
        if (*exceptionObject == NULL)
            goto onError;
    }
    else {
        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
            goto onError;
        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
            goto onError;
    }

    /* The handler must return a (replacement unicode, new position) tuple */
    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
    if (restuple == NULL)
        goto onError;
    if (!PyTuple_Check(restuple)) {
        PyErr_SetString(PyExc_TypeError, &argparse[4]);
        goto onError;
    }
    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
        goto onError;
    /* A negative position counts from the end of the input */
    if (newpos<0)
        newpos = insize+newpos;
    if (newpos<0 || newpos>insize) {
        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
        goto onError;
    }

    /* need more space? (at least enough for what we
       have+the replacement+the rest of the string (starting
       at the new input position), so we won't have to check space
       when there are no errors in the rest of the string) */
    repptr = PyUnicode_AS_UNICODE(repunicode);
    repsize = PyUnicode_GET_SIZE(repunicode);
    /* requiredsize = *outpos + repsize + (insize - newpos), with each
       addition checked against PY_SSIZE_T_MAX first */
    requiredsize = *outpos;
    if (requiredsize > PY_SSIZE_T_MAX - repsize)
        goto overflow;
    requiredsize += repsize;
    if (requiredsize > PY_SSIZE_T_MAX - (insize - newpos))
        goto overflow;
    requiredsize += insize - newpos;
    if (requiredsize > outsize) {
        /* Grow to at least twice the old size when that cannot overflow */
        if (outsize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*outsize)
            requiredsize = 2*outsize;
        if (_PyUnicode_Resize(output, requiredsize) < 0)
            goto onError;
        /* Recompute the write pointer from the object after the resize */
        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
    }
    *endinpos = newpos;
    *inptr = input + newpos;
    Py_UNICODE_COPY(*outptr, repptr, repsize);
    *outptr += repsize;
    *outpos += repsize;
    /* we made it! */
    res = 0;

  onError:
    /* Shared exit for success and failure; only restuple is released
       here (repunicode is not released separately) */
    Py_XDECREF(restuple);
    return res;

  overflow:
    PyErr_SetString(PyExc_OverflowError,
                    "decoded result is too long for a Python string");
    goto onError;
}
1548
1549/* --- UTF-7 Codec -------------------------------------------------------- */
1550
1551/* See RFC2152 for details.  We encode conservatively and decode liberally. */
1552
1553/* Three simple macros defining base-64. */
1554
1555/* Is c a base-64 character? */
1556
/* Spelled out as explicit ASCII ranges instead of isalnum(): isalnum()
   depends on the current C locale (so bytes >= 128 could be accepted)
   and is undefined for arguments that do not fit in an unsigned char.
   c is evaluated more than once; pass a side-effect-free expression. */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' || (c) == '/')
1559
1560/* given that c is a base-64 character, what is its base-64 value? */
1561
/* Alphabet layout: 'A'-'Z' -> 0..25, 'a'-'z' -> 26..51, '0'-'9' -> 52..61,
   '+' -> 62, anything else (i.e. '/') -> 63. */
#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' : 63)
1567
1568/* What is the base-64 character of the bottom 6 bits of n? */
1569
/* Index into the 64-character alphabet; only the low 6 bits of n count. */
#define TO_BASE64(n)                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"       \
     "abcdefghijklmnopqrstuvwxyz"       \
     "0123456789+/"[(n) & 0x3f])
1572
1573/* DECODE_DIRECT: this byte encountered in a UTF-7 string should be
1574 * decoded as itself.  We are permissive on decoding; the only ASCII
1575 * byte not decoding to itself is the + which begins a base64
1576 * string. */
1577
/* True for every 7-bit value other than the shift character '+'. */
#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1580
1581/* The UTF-7 encoder treats ASCII characters differently according to
1582 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
1583 * the above).  See RFC2152.  This array identifies these different
1584 * sets:
1585 * 0 : "Set D"
1586 *     alphanumeric and '(),-./:?
1587 * 1 : "Set O"
1588 *     !"#$%&*;<=>@[]^_`{|}
1589 * 2 : "whitespace"
1590 *     ht nl cr sp
1591 * 3 : special (must be base64 encoded)
1592 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
1593 */
1594
/* Per-character UTF-7 encoder category for the 128 ASCII codes:
   0 = Set D, 1 = Set O, 2 = whitespace, 3 = special (must be base64
   encoded).  Read-only lookup table, hence const. */
static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1614
1615/* ENCODE_DIRECT: this character should be encoded as itself.  The
1616 * answer depends on whether we are encoding set O as itself, and also
1617 * on whether we are encoding whitespace as itself.  RFC2152 makes it
1618 * clear that the answers to these questions vary between
1619 * applications, so this code needs to be flexible.  */
1620
/* Nonzero when c is a non-NUL 7-bit character that may be written
   verbatim: always for category 0, for category 2 only if directWS is
   true, for category 1 only if directO is true.  All parameters are
   parenthesised so compound expressions can be passed safely; c is
   evaluated several times and must be free of side effects. */
#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1626
/* Decode `size` bytes of UTF-7 data at `s`.  Forwards to
   PyUnicode_DecodeUTF7Stateful() with a NULL `consumed` argument and
   returns its result unchanged. */
PyObject *PyUnicode_DecodeUTF7(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
}
1633
1634/* The decoder.  The only state we preserve is our read position,
1635 * i.e. how many characters we have consumed.  So if we end in the
1636 * middle of a shift sequence we have to back off the read position
1637 * and the output to the beginning of the sequence, otherwise we lose
1638 * all the shift state (seen bits, number of bits seen, high
1639 * surrogate). */
1640
/* Decode `size` bytes of UTF-7 data at `s` into a new unicode object.

   errors    error scheme name, handed to
             unicode_decode_call_errorhandler() for malformed input
   consumed  if NULL, the input is treated as complete: ending inside a
             shift sequence that is in an inconsistent state is reported
             through the error handler.  If non-NULL, ending inside a
             shift sequence is not an error; the output produced since
             the opening '+' is dropped and *consumed is set to the
             offset of that '+'.  Otherwise *consumed receives the
             number of bytes processed.

   Returns the new object, or NULL if allocation, resizing or the error
   handler fails. */
PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    int inShift = 0;
    Py_UNICODE *shiftOutStart;
    unsigned int base64bits = 0;
    unsigned long base64buffer = 0;
    Py_UNICODE surrogate = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Start with room for `size` code units; the object is resized to
       the number actually written before returning. */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    p = unicode->str;
    shiftOutStart = p;
    e = s + size;

    while (s < e) {
        Py_UNICODE ch = (unsigned char) *s;

        if (inShift) { /* in a base-64 section */
            if (IS_BASE64(ch)) { /* consume a base-64 character */
                /* Accumulate 6 bits per character; emit a 16-bit unit
                   as soon as enough bits are available. */
                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
                base64bits += 6;
                s++;
                if (base64bits >= 16) {
                    /* we have enough bits for a UTF-16 value */
                    Py_UNICODE outCh = (Py_UNICODE)
                                       (base64buffer >> (base64bits-16));
                    base64bits -= 16;
                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
                    assert(outCh <= 0xffff);
                    if (surrogate) {
                        /* expecting a second surrogate */
                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
#ifdef Py_UNICODE_WIDE
                            /* combine the pair into one code point */
                            *p++ = (((surrogate & 0x3FF)<<10)
                                    | (outCh & 0x3FF)) + 0x10000;
#else
                            *p++ = surrogate;
                            *p++ = outCh;
#endif
                            surrogate = 0;
                            continue;
                        }
                        else {
                            /* no low surrogate followed: emit the pending
                               high surrogate on its own */
                            *p++ = surrogate;
                            surrogate = 0;
                        }
                    }
                    if (outCh >= 0xD800 && outCh <= 0xDBFF) {
                        /* first surrogate */
                        surrogate = outCh;
                    }
                    else {
                        *p++ = outCh;
                    }
                }
            }
            else { /* now leaving a base-64 section */
                inShift = 0;
                /* The terminating byte is consumed before the checks
                   below, so it lies inside the error range if one of
                   them fails. */
                s++;
                if (surrogate) {
                    *p++ = surrogate;
                    surrogate = 0;
                }
                if (base64bits > 0) { /* left-over bits */
                    if (base64bits >= 6) {
                        /* We've seen at least one base-64 character */
                        errmsg = "partial character in shift sequence";
                        goto utf7Error;
                    }
                    else {
                        /* Some bits remain; they should be zero */
                        if (base64buffer != 0) {
                            errmsg = "non-zero padding bits in shift sequence";
                            goto utf7Error;
                        }
                    }
                }
                /* NOTE(review): the terminating byte is stored without
                   a DECODE_DIRECT() check, so a byte >= 128 ending a
                   shift sequence is copied to the output as is --
                   confirm this is intended. */
                if (ch != '-') {
                    /* '-' is absorbed; other terminating
                       characters are preserved */
                    *p++ = ch;
                }
            }
        }
        else if ( ch == '+' ) {
            /* remember where the sequence starts, for error reporting
               and for backing off in stateful mode */
            startinpos = s-starts;
            s++; /* consume '+' */
            if (s < e && *s == '-') { /* '+-' encodes '+' */
                s++;
                *p++ = '+';
            }
            else { /* begin base64-encoded section */
                inShift = 1;
                shiftOutStart = p;
                base64bits = 0;
                base64buffer = 0;
            }
        }
        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
            *p++ = ch;
            s++;
        }
        else {
            startinpos = s-starts;
            s++;
            errmsg = "unexpected special character";
            goto utf7Error;
        }
        continue;
        /* Error exit shared by the branches above.  The handler may
           resize `unicode` and updates s and p; decoding then resumes
           with the next loop iteration. */
utf7Error:
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        endinpos = s-starts;
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf7", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }

    /* end of string */

    if (inShift && !consumed) { /* in shift sequence, no more to follow */
        /* if we're in an inconsistent state, that's an error */
        if (surrogate ||
                (base64bits >= 6) ||
                (base64bits > 0 && base64buffer != 0)) {
            outpos = p-PyUnicode_AS_UNICODE(unicode);
            endinpos = size;
            if (unicode_decode_call_errorhandler(
                    errors, &errorHandler,
                    "utf7", "unterminated shift sequence",
                    starts, size, &startinpos, &endinpos, &exc, &s,
                    &unicode, &outpos, &p))
                goto onError;
        }
    }

    /* return state */
    if (consumed) {
        if (inShift) {
            /* Unfinished shift sequence: report everything from its
               '+' onwards as not consumed. */
            p = shiftOutStart; /* back off output */
            *consumed = startinpos;
        }
        else {
            *consumed = s-starts;
        }
    }

    /* Trim the object to the number of units actually written */
    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
1823
1824
1825PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1826                               Py_ssize_t size,
1827                               int base64SetO,
1828                               int base64WhiteSpace,
1829                               const char *errors)
1830{
1831    PyObject *v;
1832    /* It might be possible to tighten this worst case */
1833    Py_ssize_t allocated = 8 * size;
1834    int inShift = 0;
1835    Py_ssize_t i = 0;
1836    unsigned int base64bits = 0;
1837    unsigned long base64buffer = 0;
1838    char * out;
1839    char * start;
1840
1841    if (allocated / 8 != size)
1842        return PyErr_NoMemory();
1843
1844    if (size == 0)
1845        return PyString_FromStringAndSize(NULL, 0);
1846
1847    v = PyString_FromStringAndSize(NULL, allocated);
1848    if (v == NULL)
1849        return NULL;
1850
1851    start = out = PyString_AS_STRING(v);
1852    for (;i < size; ++i) {
1853        Py_UNICODE ch = s[i];
1854
1855        if (inShift) {
1856            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1857                /* shifting out */
1858                if (base64bits) { /* output remaining bits */
1859                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
1860                    base64buffer = 0;
1861                    base64bits = 0;
1862                }
1863                inShift = 0;
1864                /* Characters not in the BASE64 set implicitly unshift the sequence
1865                   so no '-' is required, except if the character is itself a '-' */
1866                if (IS_BASE64(ch) || ch == '-') {
1867                    *out++ = '-';
1868                }
1869                *out++ = (char) ch;
1870            }
1871            else {
1872                goto encode_char;
1873            }
1874        }
1875        else { /* not in a shift sequence */
1876            if (ch == '+') {
1877                *out++ = '+';
1878                        *out++ = '-';
1879            }
1880            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1881                *out++ = (char) ch;
1882            }
1883            else {
1884                *out++ = '+';
1885                inShift = 1;
1886                goto encode_char;
1887            }
1888        }
1889        continue;
1890encode_char:
1891#ifdef Py_UNICODE_WIDE
1892        if (ch >= 0x10000) {
1893            /* code first surrogate */
1894            base64bits += 16;
1895            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1896            while (base64bits >= 6) {
1897                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1898                base64bits -= 6;
1899            }
1900            /* prepare second surrogate */
1901            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1902        }
1903#endif
1904        base64bits += 16;
1905        base64buffer = (base64buffer << 16) | ch;
1906        while (base64bits >= 6) {
1907            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1908            base64bits -= 6;
1909        }
1910    }
1911    if (base64bits)
1912        *out++= TO_BASE64(base64buffer << (6-base64bits) );
1913    if (inShift)
1914        *out++ = '-';
1915
1916    if (_PyString_Resize(&v, out - start))
1917        return NULL;
1918    return v;
1919}
1920
1921#undef IS_BASE64
1922#undef FROM_BASE64
1923#undef TO_BASE64
1924#undef DECODE_DIRECT
1925#undef ENCODE_DIRECT
1926
1927/* --- UTF-8 Codec -------------------------------------------------------- */
1928
/* Read-only lookup table, indexed by the first byte of a UTF-8 sequence. */
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1950
1951PyObject *PyUnicode_DecodeUTF8(const char *s,
1952                               Py_ssize_t size,
1953                               const char *errors)
1954{
1955    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1956}
1957
/* Decode `size` bytes of UTF-8 data at `s` into a new Unicode object.

   errors:   passed through to unicode_decode_call_errorhandler whenever a
             malformed sequence is found.
   consumed: if non-NULL, a multi-byte sequence that runs past the end of
             the input is left undecoded (no error is raised for it) and
             *consumed receives the number of input bytes processed.

   Returns the new object, or NULL when allocation, the error handler or
   the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;
    int n;
    int k;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *unicode;
    Py_UNICODE *p;
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        /* empty input: nothing was consumed */
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* ASCII bytes map to themselves */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* expected sequence length for this lead byte; 0 = illegal prefix */
        n = utf8_code_length[ch];

        if (s + n > e) {
            /* the sequence would extend past the end of the input */
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* extend the error range over the continuation bytes that
                   are actually present */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* bytes below 0x80 were already handled by the ASCII branch
               above, and every table entry for 0x80 and up is 0, 2, 3 or 4 */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               code points: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* besides malformed continuation bytes, also reject F0 followed
               by a byte below 0x90 and F4 followed by a byte above 0x8F */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* widen the error range by one byte for each leading
                   byte that has the continuation-byte bit pattern */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        /* a well-formed sequence was decoded: step over it */
        s += n;
        continue;

      utf8Error:
        /* Report the malformed range to the error handler; a nonzero
           result aborts decoding.
           NOTE(review): the handler is given &s, &unicode, &outpos and &p,
           so it presumably repositions the input and output cursors --
           confirm in unicode_decode_call_errorhandler. */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    /* report how many input bytes were processed */
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    /* release the helper objects and the partially filled result */
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2137
2138/* Allocation strategy:  if the string is short, convert into a stack buffer
2139   and allocate exactly as much space needed at the end.  Else allocate the
2140   maximum possible needed (4 result bytes per Unicode character), and return
2141   the excess memory at the end.
2142*/
2143PyObject *
2144PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2145                     Py_ssize_t size,
2146                     const char *errors)
2147{
2148#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2149
2150    Py_ssize_t i;           /* index into s of next input byte */
2151    PyObject *v;        /* result string object */
2152    char *p;            /* next free byte in output buffer */
2153    Py_ssize_t nallocated;  /* number of result bytes allocated */
2154    Py_ssize_t nneeded;        /* number of result bytes needed */
2155    char stackbuf[MAX_SHORT_UNICHARS * 4];
2156
2157    assert(s != NULL);
2158    assert(size >= 0);
2159
2160    if (size <= MAX_SHORT_UNICHARS) {
2161        /* Write into the stack buffer; nallocated can't overflow.
2162         * At the end, we'll allocate exactly as much heap space as it
2163         * turns out we need.
2164         */
2165        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2166        v = NULL;   /* will allocate after we're done */
2167        p = stackbuf;
2168    }
2169    else {
2170        /* Overallocate on the heap, and give the excess back at the end. */
2171        nallocated = size * 4;
2172        if (nallocated / 4 != size)  /* overflow! */
2173            return PyErr_NoMemory();
2174        v = PyString_FromStringAndSize(NULL, nallocated);
2175        if (v == NULL)
2176            return NULL;
2177        p = PyString_AS_STRING(v);
2178    }
2179
2180    for (i = 0; i < size;) {
2181        Py_UCS4 ch = s[i++];
2182
2183        if (ch < 0x80)
2184            /* Encode ASCII */
2185            *p++ = (char) ch;
2186
2187        else if (ch < 0x0800) {
2188            /* Encode Latin-1 */
2189            *p++ = (char)(0xc0 | (ch >> 6));
2190            *p++ = (char)(0x80 | (ch & 0x3f));
2191        }
2192        else {
2193            /* Encode UCS2 Unicode ordinals */
2194            if (ch < 0x10000) {
2195                /* Special case: check for high surrogate */
2196                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2197                    Py_UCS4 ch2 = s[i];
2198                    /* Check for low surrogate and combine the two to
2199                       form a UCS4 value */
2200                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2201                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2202                        i++;
2203                        goto encodeUCS4;
2204                    }
2205                    /* Fall through: handles isolated high surrogates */
2206                }
2207                *p++ = (char)(0xe0 | (ch >> 12));
2208                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2209                *p++ = (char)(0x80 | (ch & 0x3f));
2210                continue;
2211            }
2212          encodeUCS4:
2213            /* Encode UCS4 Unicode ordinals */
2214            *p++ = (char)(0xf0 | (ch >> 18));
2215            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2216            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2217            *p++ = (char)(0x80 | (ch & 0x3f));
2218        }
2219    }
2220
2221    if (v == NULL) {
2222        /* This was stack allocated. */
2223        nneeded = p - stackbuf;
2224        assert(nneeded <= nallocated);
2225        v = PyString_FromStringAndSize(stackbuf, nneeded);
2226    }
2227    else {
2228        /* Cut back to size actually needed. */
2229        nneeded = p - PyString_AS_STRING(v);
2230        assert(nneeded <= nallocated);
2231        if (_PyString_Resize(&v, nneeded))
2232            return NULL;
2233    }
2234    return v;
2235
2236#undef MAX_SHORT_UNICHARS
2237}
2238
2239PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2240{
2241    if (!PyUnicode_Check(unicode)) {
2242        PyErr_BadArgument();
2243        return NULL;
2244    }
2245    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2246                                PyUnicode_GET_SIZE(unicode),
2247                                NULL);
2248}
2249
2250/* --- UTF-32 Codec ------------------------------------------------------- */
2251
2252PyObject *
2253PyUnicode_DecodeUTF32(const char *s,
2254                      Py_ssize_t size,
2255                      const char *errors,
2256                      int *byteorder)
2257{
2258    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2259}
2260
2261PyObject *
2262PyUnicode_DecodeUTF32Stateful(const char *s,
2263                              Py_ssize_t size,
2264                              const char *errors,
2265                              int *byteorder,
2266                              Py_ssize_t *consumed)
2267{
2268    const char *starts = s;
2269    Py_ssize_t startinpos;
2270    Py_ssize_t endinpos;
2271    Py_ssize_t outpos;
2272    PyUnicodeObject *unicode;
2273    Py_UNICODE *p;
2274#ifndef Py_UNICODE_WIDE
2275    int pairs = 0;
2276    const unsigned char *qq;
2277#else
2278    const int pairs = 0;
2279#endif
2280    const unsigned char *q, *e;
2281    int bo = 0;       /* assume native ordering by default */
2282    const char *errmsg = "";
2283    /* Offsets from q for retrieving bytes in the right order. */
2284#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2285    int iorder[] = {0, 1, 2, 3};
2286#else
2287    int iorder[] = {3, 2, 1, 0};
2288#endif
2289    PyObject *errorHandler = NULL;
2290    PyObject *exc = NULL;
2291
2292    q = (unsigned char *)s;
2293    e = q + size;
2294
2295    if (byteorder)
2296        bo = *byteorder;
2297
2298    /* Check for BOM marks (U+FEFF) in the input and adjust current
2299       byte order setting accordingly. In native mode, the leading BOM
2300       mark is skipped, in all other modes, it is copied to the output
2301       stream as-is (giving a ZWNBSP character). */
2302    if (bo == 0) {
2303        if (size >= 4) {
2304            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2305                (q[iorder[1]] << 8) | q[iorder[0]];
2306#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2307            if (bom == 0x0000FEFF) {
2308                q += 4;
2309                bo = -1;
2310            }
2311            else if (bom == 0xFFFE0000) {
2312                q += 4;
2313                bo = 1;
2314            }
2315#else
2316            if (bom == 0x0000FEFF) {
2317                q += 4;
2318                bo = 1;
2319            }
2320            else if (bom == 0xFFFE0000) {
2321                q += 4;
2322                bo = -1;
2323            }
2324#endif
2325        }
2326    }
2327
2328    if (bo == -1) {
2329        /* force LE */
2330        iorder[0] = 0;
2331        iorder[1] = 1;
2332        iorder[2] = 2;
2333        iorder[3] = 3;
2334    }
2335    else if (bo == 1) {
2336        /* force BE */
2337        iorder[0] = 3;
2338        iorder[1] = 2;
2339        iorder[2] = 1;
2340        iorder[3] = 0;
2341    }
2342
2343    /* On narrow builds we split characters outside the BMP into two
2344       code points => count how much extra space we need. */
2345#ifndef Py_UNICODE_WIDE
2346    for (qq = q; e - qq >= 4; qq += 4)
2347        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2348            pairs++;
2349#endif
2350
2351    /* This might be one to much, because of a BOM */
2352    unicode = _PyUnicode_New((size+3)/4+pairs);
2353    if (!unicode)
2354        return NULL;
2355    if (size == 0)
2356        return (PyObject *)unicode;
2357
2358    /* Unpack UTF-32 encoded data */
2359    p = unicode->str;
2360
2361    while (q < e) {
2362        Py_UCS4 ch;
2363        /* remaining bytes at the end? (size should be divisible by 4) */
2364        if (e-q<4) {
2365            if (consumed)
2366                break;
2367            errmsg = "truncated data";
2368            startinpos = ((const char *)q)-starts;
2369            endinpos = ((const char *)e)-starts;
2370            goto utf32Error;
2371            /* The remaining input chars are ignored if the callback
2372               chooses to skip the input */
2373        }
2374        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2375            (q[iorder[1]] << 8) | q[iorder[0]];
2376
2377        if (ch >= 0x110000)
2378        {
2379            errmsg = "code point not in range(0x110000)";
2380            startinpos = ((const char *)q)-starts;
2381            endinpos = startinpos+4;
2382            goto utf32Error;
2383        }
2384#ifndef Py_UNICODE_WIDE
2385        if (ch >= 0x10000)
2386        {
2387            *p++ = 0xD800 | ((ch-0x10000) >> 10);
2388            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2389        }
2390        else
2391#endif
2392            *p++ = ch;
2393        q += 4;
2394        continue;
2395      utf32Error:
2396        outpos = p-PyUnicode_AS_UNICODE(unicode);
2397        if (unicode_decode_call_errorhandler(
2398                errors, &errorHandler,
2399                "utf32", errmsg,
2400                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2401                &unicode, &outpos, &p))
2402            goto onError;
2403    }
2404
2405    if (byteorder)
2406        *byteorder = bo;
2407
2408    if (consumed)
2409        *consumed = (const char *)q-starts;
2410
2411    /* Adjust length */
2412    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2413        goto onError;
2414
2415    Py_XDECREF(errorHandler);
2416    Py_XDECREF(exc);
2417    return (PyObject *)unicode;
2418
2419  onError:
2420    Py_DECREF(unicode);
2421    Py_XDECREF(errorHandler);
2422    Py_XDECREF(exc);
2423    return NULL;
2424}
2425
2426PyObject *
2427PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2428                      Py_ssize_t size,
2429                      const char *errors,
2430                      int byteorder)
2431{
2432    PyObject *v;
2433    unsigned char *p;
2434    Py_ssize_t nsize, bytesize;
2435#ifndef Py_UNICODE_WIDE
2436    Py_ssize_t i, pairs;
2437#else
2438    const int pairs = 0;
2439#endif
2440    /* Offsets from p for storing byte pairs in the right order. */
2441#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2442    int iorder[] = {0, 1, 2, 3};
2443#else
2444    int iorder[] = {3, 2, 1, 0};
2445#endif
2446
2447#define STORECHAR(CH)                           \
2448    do {                                        \
2449        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2450        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2451        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2452        p[iorder[0]] = (CH) & 0xff;             \
2453        p += 4;                                 \
2454    } while(0)
2455
2456    /* In narrow builds we can output surrogate pairs as one code point,
2457       so we need less space. */
2458#ifndef Py_UNICODE_WIDE
2459    for (i = pairs = 0; i < size-1; i++)
2460        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2461            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2462            pairs++;
2463#endif
2464    nsize = (size - pairs + (byteorder == 0));
2465    bytesize = nsize * 4;
2466    if (bytesize / 4 != nsize)
2467        return PyErr_NoMemory();
2468    v = PyString_FromStringAndSize(NULL, bytesize);
2469    if (v == NULL)
2470        return NULL;
2471
2472    p = (unsigned char *)PyString_AS_STRING(v);
2473    if (byteorder == 0)
2474        STORECHAR(0xFEFF);
2475    if (size == 0)
2476        return v;
2477
2478    if (byteorder == -1) {
2479        /* force LE */
2480        iorder[0] = 0;
2481        iorder[1] = 1;
2482        iorder[2] = 2;
2483        iorder[3] = 3;
2484    }
2485    else if (byteorder == 1) {
2486        /* force BE */
2487        iorder[0] = 3;
2488        iorder[1] = 2;
2489        iorder[2] = 1;
2490        iorder[3] = 0;
2491    }
2492
2493    while (size-- > 0) {
2494        Py_UCS4 ch = *s++;
2495#ifndef Py_UNICODE_WIDE
2496        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2497            Py_UCS4 ch2 = *s;
2498            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2499                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2500                s++;
2501                size--;
2502            }
2503        }
2504#endif
2505        STORECHAR(ch);
2506    }
2507    return v;
2508#undef STORECHAR
2509}
2510
2511PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2512{
2513    if (!PyUnicode_Check(unicode)) {
2514        PyErr_BadArgument();
2515        return NULL;
2516    }
2517    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2518                                 PyUnicode_GET_SIZE(unicode),
2519                                 NULL,
2520                                 0);
2521}
2522
2523/* --- UTF-16 Codec ------------------------------------------------------- */
2524
2525PyObject *
2526PyUnicode_DecodeUTF16(const char *s,
2527                      Py_ssize_t size,
2528                      const char *errors,
2529                      int *byteorder)
2530{
2531    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2532}
2533
2534PyObject *
2535PyUnicode_DecodeUTF16Stateful(const char *s,
2536                              Py_ssize_t size,
2537                              const char *errors,
2538                              int *byteorder,
2539                              Py_ssize_t *consumed)
2540{
2541    const char *starts = s;
2542    Py_ssize_t startinpos;
2543    Py_ssize_t endinpos;
2544    Py_ssize_t outpos;
2545    PyUnicodeObject *unicode;
2546    Py_UNICODE *p;
2547    const unsigned char *q, *e;
2548    int bo = 0;       /* assume native ordering by default */
2549    const char *errmsg = "";
2550    /* Offsets from q for retrieving byte pairs in the right order. */
2551#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2552    int ihi = 1, ilo = 0;
2553#else
2554    int ihi = 0, ilo = 1;
2555#endif
2556    PyObject *errorHandler = NULL;
2557    PyObject *exc = NULL;
2558
2559    /* Note: size will always be longer than the resulting Unicode
2560       character count */
2561    unicode = _PyUnicode_New(size);
2562    if (!unicode)
2563        return NULL;
2564    if (size == 0)
2565        return (PyObject *)unicode;
2566
2567    /* Unpack UTF-16 encoded data */
2568    p = unicode->str;
2569    q = (unsigned char *)s;
2570    e = q + size;
2571
2572    if (byteorder)
2573        bo = *byteorder;
2574
2575    /* Check for BOM marks (U+FEFF) in the input and adjust current
2576       byte order setting accordingly. In native mode, the leading BOM
2577       mark is skipped, in all other modes, it is copied to the output
2578       stream as-is (giving a ZWNBSP character). */
2579    if (bo == 0) {
2580        if (size >= 2) {
2581            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2582#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2583            if (bom == 0xFEFF) {
2584                q += 2;
2585                bo = -1;
2586            }
2587            else if (bom == 0xFFFE) {
2588                q += 2;
2589                bo = 1;
2590            }
2591#else
2592            if (bom == 0xFEFF) {
2593                q += 2;
2594                bo = 1;
2595            }
2596            else if (bom == 0xFFFE) {
2597                q += 2;
2598                bo = -1;
2599            }
2600#endif
2601        }
2602    }
2603
2604    if (bo == -1) {
2605        /* force LE */
2606        ihi = 1;
2607        ilo = 0;
2608    }
2609    else if (bo == 1) {
2610        /* force BE */
2611        ihi = 0;
2612        ilo = 1;
2613    }
2614
2615    while (q < e) {
2616        Py_UNICODE ch;
2617        /* remaining bytes at the end? (size should be even) */
2618        if (e-q<2) {
2619            if (consumed)
2620                break;
2621            errmsg = "truncated data";
2622            startinpos = ((const char *)q)-starts;
2623            endinpos = ((const char *)e)-starts;
2624            goto utf16Error;
2625            /* The remaining input chars are ignored if the callback
2626               chooses to skip the input */
2627        }
2628        ch = (q[ihi] << 8) | q[ilo];
2629
2630        q += 2;
2631
2632        if (ch < 0xD800 || ch > 0xDFFF) {
2633            *p++ = ch;
2634            continue;
2635        }
2636
2637        /* UTF-16 code pair: */
2638        if (e - q < 2) {
2639            q -= 2;
2640            if (consumed)
2641                break;
2642            errmsg = "unexpected end of data";
2643            startinpos = ((const char *)q)-starts;
2644            endinpos = ((const char *)e)-starts;
2645            goto utf16Error;
2646        }
2647        if (0xD800 <= ch && ch <= 0xDBFF) {
2648            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2649            q += 2;
2650            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2651#ifndef Py_UNICODE_WIDE
2652                *p++ = ch;
2653                *p++ = ch2;
2654#else
2655                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2656#endif
2657                continue;
2658            }
2659            else {
2660                errmsg = "illegal UTF-16 surrogate";
2661                startinpos = (((const char *)q)-4)-starts;
2662                endinpos = startinpos+2;
2663                goto utf16Error;
2664            }
2665
2666        }
2667        errmsg = "illegal encoding";
2668        startinpos = (((const char *)q)-2)-starts;
2669        endinpos = startinpos+2;
2670        /* Fall through to report the error */
2671
2672      utf16Error:
2673        outpos = p-PyUnicode_AS_UNICODE(unicode);
2674        if (unicode_decode_call_errorhandler(
2675                errors, &errorHandler,
2676                "utf16", errmsg,
2677                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2678                &unicode, &outpos, &p))
2679            goto onError;
2680    }
2681
2682    if (byteorder)
2683        *byteorder = bo;
2684
2685    if (consumed)
2686        *consumed = (const char *)q-starts;
2687
2688    /* Adjust length */
2689    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2690        goto onError;
2691
2692    Py_XDECREF(errorHandler);
2693    Py_XDECREF(exc);
2694    return (PyObject *)unicode;
2695
2696  onError:
2697    Py_DECREF(unicode);
2698    Py_XDECREF(errorHandler);
2699    Py_XDECREF(exc);
2700    return NULL;
2701}
2702
2703PyObject *
2704PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2705                      Py_ssize_t size,
2706                      const char *errors,
2707                      int byteorder)
2708{
2709    PyObject *v;
2710    unsigned char *p;
2711    Py_ssize_t nsize, bytesize;
2712#ifdef Py_UNICODE_WIDE
2713    Py_ssize_t i, pairs;
2714#else
2715    const int pairs = 0;
2716#endif
2717    /* Offsets from p for storing byte pairs in the right order. */
2718#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2719    int ihi = 1, ilo = 0;
2720#else
2721    int ihi = 0, ilo = 1;
2722#endif
2723
2724#define STORECHAR(CH)                           \
2725    do {                                        \
2726        p[ihi] = ((CH) >> 8) & 0xff;            \
2727        p[ilo] = (CH) & 0xff;                   \
2728        p += 2;                                 \
2729    } while(0)
2730
2731#ifdef Py_UNICODE_WIDE
2732    for (i = pairs = 0; i < size; i++)
2733        if (s[i] >= 0x10000)
2734            pairs++;
2735#endif
2736    /* 2 * (size + pairs + (byteorder == 0)) */
2737    if (size > PY_SSIZE_T_MAX ||
2738        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2739        return PyErr_NoMemory();
2740    nsize = size + pairs + (byteorder == 0);
2741    bytesize = nsize * 2;
2742    if (bytesize / 2 != nsize)
2743        return PyErr_NoMemory();
2744    v = PyString_FromStringAndSize(NULL, bytesize);
2745    if (v == NULL)
2746        return NULL;
2747
2748    p = (unsigned char *)PyString_AS_STRING(v);
2749    if (byteorder == 0)
2750        STORECHAR(0xFEFF);
2751    if (size == 0)
2752        return v;
2753
2754    if (byteorder == -1) {
2755        /* force LE */
2756        ihi = 1;
2757        ilo = 0;
2758    }
2759    else if (byteorder == 1) {
2760        /* force BE */
2761        ihi = 0;
2762        ilo = 1;
2763    }
2764
2765    while (size-- > 0) {
2766        Py_UNICODE ch = *s++;
2767        Py_UNICODE ch2 = 0;
2768#ifdef Py_UNICODE_WIDE
2769        if (ch >= 0x10000) {
2770            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2771            ch  = 0xD800 | ((ch-0x10000) >> 10);
2772        }
2773#endif
2774        STORECHAR(ch);
2775        if (ch2)
2776            STORECHAR(ch2);
2777    }
2778    return v;
2779#undef STORECHAR
2780}
2781
2782PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2783{
2784    if (!PyUnicode_Check(unicode)) {
2785        PyErr_BadArgument();
2786        return NULL;
2787    }
2788    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2789                                 PyUnicode_GET_SIZE(unicode),
2790                                 NULL,
2791                                 0);
2792}
2793
2794/* --- Unicode Escape Codec ----------------------------------------------- */
2795
/* File-local pointer to a _PyUnicode_Name_CAPI structure; starts out NULL.
   NOTE(review): presumably the character-name lookup API from "ucnhash.h",
   filled in on first use by the escape decoder that follows -- confirm
   where it is assigned. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2797
2798PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2799                                        Py_ssize_t size,
2800                                        const char *errors)
2801{
2802    const char *starts = s;
2803    Py_ssize_t startinpos;
2804    Py_ssize_t endinpos;
2805    Py_ssize_t outpos;
2806    PyUnicodeObject *v;
2807    Py_UNICODE *p;
2808    const char *end;
2809    char* message;
2810    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2811    PyObject *errorHandler = NULL;
2812    PyObject *exc = NULL;
2813
2814    /* Escaped strings will always be longer than the resulting
2815       Unicode string, so we start with size here and then reduce the
2816       length after conversion to the true value.
2817       (but if the error callback returns a long replacement string
2818       we'll have to allocate more space) */
2819    v = _PyUnicode_New(size);
2820    if (v == NULL)
2821        goto onError;
2822    if (size == 0)
2823        return (PyObject *)v;
2824
2825    p = PyUnicode_AS_UNICODE(v);
2826    end = s + size;
2827
2828    while (s < end) {
2829        unsigned char c;
2830        Py_UNICODE x;
2831        int digits;
2832
2833        /* Non-escape characters are interpreted as Unicode ordinals */
2834        if (*s != '\\') {
2835            *p++ = (unsigned char) *s++;
2836            continue;
2837        }
2838
2839        startinpos = s-starts;
2840        /* \ - Escapes */
2841        s++;
2842        c = *s++;
2843        if (s > end)
2844            c = '\0'; /* Invalid after \ */
2845        switch (c) {
2846
2847            /* \x escapes */
2848        case '\n': break;
2849        case '\\': *p++ = '\\'; break;
2850        case '\'': *p++ = '\''; break;
2851        case '\"': *p++ = '\"'; break;
2852        case 'b': *p++ = '\b'; break;
2853        case 'f': *p++ = '\014'; break; /* FF */
2854        case 't': *p++ = '\t'; break;
2855        case 'n': *p++ = '\n'; break;
2856        case 'r': *p++ = '\r'; break;
2857        case 'v': *p++ = '\013'; break; /* VT */
2858        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2859
2860            /* \OOO (octal) escapes */
2861        case '0': case '1': case '2': case '3':
2862        case '4': case '5': case '6': case '7':
2863            x = s[-1] - '0';
2864            if (s < end && '0' <= *s && *s <= '7') {
2865                x = (x<<3) + *s++ - '0';
2866                if (s < end && '0' <= *s && *s <= '7')
2867                    x = (x<<3) + *s++ - '0';
2868            }
2869            *p++ = x;
2870            break;
2871
2872            /* hex escapes */
2873            /* \xXX */
2874        case 'x':
2875            digits = 2;
2876            message = "truncated \\xXX escape";
2877            goto hexescape;
2878
2879            /* \uXXXX */
2880        case 'u':
2881            digits = 4;
2882            message = "truncated \\uXXXX escape";
2883            goto hexescape;
2884
2885            /* \UXXXXXXXX */
2886        case 'U':
2887            digits = 8;
2888            message = "truncated \\UXXXXXXXX escape";
2889        hexescape:
2890            chr = 0;
2891            if (end - s < digits) {
2892                /* count only hex digits */
2893                for (; s < end; ++s) {
2894                    c = (unsigned char)*s;
2895                    if (!Py_ISXDIGIT(c))
2896                        goto error;
2897                }
2898                goto error;
2899            }
2900            for (; digits--; ++s) {
2901                c = (unsigned char)*s;
2902                if (!Py_ISXDIGIT(c))
2903                    goto error;
2904                chr = (chr<<4) & ~0xF;
2905                if (c >= '0' && c <= '9')
2906                    chr += c - '0';
2907                else if (c >= 'a' && c <= 'f')
2908                    chr += 10 + c - 'a';
2909                else
2910                    chr += 10 + c - 'A';
2911            }
2912            if (chr == 0xffffffff && PyErr_Occurred())
2913                /* _decoding_error will have already written into the
2914                   target buffer. */
2915                break;
2916        store:
2917            /* when we get here, chr is a 32-bit unicode character */
2918            if (chr <= 0xffff)
2919                /* UCS-2 character */
2920                *p++ = (Py_UNICODE) chr;
2921            else if (chr <= 0x10ffff) {
2922                /* UCS-4 character. Either store directly, or as
2923                   surrogate pair. */
2924#ifdef Py_UNICODE_WIDE
2925                *p++ = chr;
2926#else
2927                chr -= 0x10000L;
2928                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2929                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2930#endif
2931            } else {
2932                message = "illegal Unicode character";
2933                goto error;
2934            }
2935            break;
2936
2937            /* \N{name} */
2938        case 'N':
2939            message = "malformed \\N character escape";
2940            if (ucnhash_CAPI == NULL) {
2941                /* load the unicode data module */
2942                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2943                if (ucnhash_CAPI == NULL)
2944                    goto ucnhashError;
2945            }
2946            if (*s == '{') {
2947                const char *start = s+1;
2948                /* look for the closing brace */
2949                while (*s != '}' && s < end)
2950                    s++;
2951                if (s > start && s < end && *s == '}') {
2952                    /* found a name.  look it up in the unicode database */
2953                    message = "unknown Unicode character name";
2954                    s++;
2955                    if (s - start - 1 <= INT_MAX &&
2956                        ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2957                        goto store;
2958                }
2959            }
2960            goto error;
2961
2962        default:
2963            if (s > end) {
2964                message = "\\ at end of string";
2965                s--;
2966                goto error;
2967            }
2968            else {
2969                *p++ = '\\';
2970                *p++ = (unsigned char)s[-1];
2971            }
2972            break;
2973        }
2974        continue;
2975
2976      error:
2977        endinpos = s-starts;
2978        outpos = p-PyUnicode_AS_UNICODE(v);
2979        if (unicode_decode_call_errorhandler(
2980                errors, &errorHandler,
2981                "unicodeescape", message,
2982                starts, size, &startinpos, &endinpos, &exc, &s,
2983                &v, &outpos, &p))
2984            goto onError;
2985        continue;
2986    }
2987    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2988        goto onError;
2989    Py_XDECREF(errorHandler);
2990    Py_XDECREF(exc);
2991    return (PyObject *)v;
2992
2993  ucnhashError:
2994    PyErr_SetString(
2995        PyExc_UnicodeError,
2996        "\\N escapes not supported (can't load unicodedata module)"
2997        );
2998    Py_XDECREF(v);
2999    Py_XDECREF(errorHandler);
3000    Py_XDECREF(exc);
3001    return NULL;
3002
3003  onError:
3004    Py_XDECREF(v);
3005    Py_XDECREF(errorHandler);
3006    Py_XDECREF(exc);
3007    return NULL;
3008}
3009
/* unicodeescape_string() returns a Unicode-Escape string version of
   the given Unicode data.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
3016
3017Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3018                                             Py_ssize_t size,
3019                                             Py_UNICODE ch)
3020{
3021    /* like wcschr, but doesn't stop at NULL characters */
3022
3023    while (size-- > 0) {
3024        if (*s == ch)
3025            return s;
3026        s++;
3027    }
3028
3029    return NULL;
3030}
3031
3032static
3033PyObject *unicodeescape_string(const Py_UNICODE *s,
3034                               Py_ssize_t size,
3035                               int quotes)
3036{
3037    PyObject *repr;
3038    char *p;
3039
3040    static const char *hexdigit = "0123456789abcdef";
3041#ifdef Py_UNICODE_WIDE
3042    const Py_ssize_t expandsize = 10;
3043#else
3044    const Py_ssize_t expandsize = 6;
3045#endif
3046
3047    /* XXX(nnorwitz): rather than over-allocating, it would be
3048       better to choose a different scheme.  Perhaps scan the
3049       first N-chars of the string and allocate based on that size.
3050    */
3051    /* Initial allocation is based on the longest-possible unichr
3052       escape.
3053
3054       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3055       unichr, so in this case it's the longest unichr escape. In
3056       narrow (UTF-16) builds this is five chars per source unichr
3057       since there are two unichrs in the surrogate pair, so in narrow
3058       (UTF-16) builds it's not the longest unichr escape.
3059
3060       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3061       so in the narrow (UTF-16) build case it's the longest unichr
3062       escape.
3063    */
3064
3065    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3066        return PyErr_NoMemory();
3067
3068    repr = PyString_FromStringAndSize(NULL,
3069                                      2
3070                                      + expandsize*size
3071                                      + 1);
3072    if (repr == NULL)
3073        return NULL;
3074
3075    p = PyString_AS_STRING(repr);
3076
3077    if (quotes) {
3078        *p++ = 'u';
3079        *p++ = (findchar(s, size, '\'') &&
3080                !findchar(s, size, '"')) ? '"' : '\'';
3081    }
3082    while (size-- > 0) {
3083        Py_UNICODE ch = *s++;
3084
3085        /* Escape quotes and backslashes */
3086        if ((quotes &&
3087             ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3088            *p++ = '\\';
3089            *p++ = (char) ch;
3090            continue;
3091        }
3092
3093#ifdef Py_UNICODE_WIDE
3094        /* Map 21-bit characters to '\U00xxxxxx' */
3095        else if (ch >= 0x10000) {
3096            *p++ = '\\';
3097            *p++ = 'U';
3098            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3099            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3100            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3101            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3102            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3103            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3104            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3105            *p++ = hexdigit[ch & 0x0000000F];
3106            continue;
3107        }
3108#else
3109        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3110        else if (ch >= 0xD800 && ch < 0xDC00) {
3111            Py_UNICODE ch2;
3112            Py_UCS4 ucs;
3113
3114            ch2 = *s++;
3115            size--;
3116            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3117                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3118                *p++ = '\\';
3119                *p++ = 'U';
3120                *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3121                *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3122                *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3123                *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3124                *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3125                *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3126                *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3127                *p++ = hexdigit[ucs & 0x0000000F];
3128                continue;
3129            }
3130            /* Fall through: isolated surrogates are copied as-is */
3131            s--;
3132            size++;
3133        }
3134#endif
3135
3136        /* Map 16-bit characters to '\uxxxx' */
3137        if (ch >= 256) {
3138            *p++ = '\\';
3139            *p++ = 'u';
3140            *p++ = hexdigit[(ch >> 12) & 0x000F];
3141            *p++ = hexdigit[(ch >> 8) & 0x000F];
3142            *p++ = hexdigit[(ch >> 4) & 0x000F];
3143            *p++ = hexdigit[ch & 0x000F];
3144        }
3145
3146        /* Map special whitespace to '\t', \n', '\r' */
3147        else if (ch == '\t') {
3148            *p++ = '\\';
3149            *p++ = 't';
3150        }
3151        else if (ch == '\n') {
3152            *p++ = '\\';
3153            *p++ = 'n';
3154        }
3155        else if (ch == '\r') {
3156            *p++ = '\\';
3157            *p++ = 'r';
3158        }
3159
3160        /* Map non-printable US ASCII to '\xhh' */
3161        else if (ch < ' ' || ch >= 0x7F) {
3162            *p++ = '\\';
3163            *p++ = 'x';
3164            *p++ = hexdigit[(ch >> 4) & 0x000F];
3165            *p++ = hexdigit[ch & 0x000F];
3166        }
3167
3168        /* Copy everything else as-is */
3169        else
3170            *p++ = (char) ch;
3171    }
3172    if (quotes)
3173        *p++ = PyString_AS_STRING(repr)[1];
3174
3175    *p = '\0';
3176    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3177        return NULL;
3178    return repr;
3179}
3180
3181PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3182                                        Py_ssize_t size)
3183{
3184    return unicodeescape_string(s, size, 0);
3185}
3186
3187PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3188{
3189    if (!PyUnicode_Check(unicode)) {
3190        PyErr_BadArgument();
3191        return NULL;
3192    }
3193    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3194                                         PyUnicode_GET_SIZE(unicode));
3195}
3196
3197/* --- Raw Unicode Escape Codec ------------------------------------------- */
3198
3199PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3200                                           Py_ssize_t size,
3201                                           const char *errors)
3202{
3203    const char *starts = s;
3204    Py_ssize_t startinpos;
3205    Py_ssize_t endinpos;
3206    Py_ssize_t outpos;
3207    PyUnicodeObject *v;
3208    Py_UNICODE *p;
3209    const char *end;
3210    const char *bs;
3211    PyObject *errorHandler = NULL;
3212    PyObject *exc = NULL;
3213
3214    /* Escaped strings will always be longer than the resulting
3215       Unicode string, so we start with size here and then reduce the
3216       length after conversion to the true value. (But decoding error
3217       handler might have to resize the string) */
3218    v = _PyUnicode_New(size);
3219    if (v == NULL)
3220        goto onError;
3221    if (size == 0)
3222        return (PyObject *)v;
3223    p = PyUnicode_AS_UNICODE(v);
3224    end = s + size;
3225    while (s < end) {
3226        unsigned char c;
3227        Py_UCS4 x;
3228        int i;
3229        int count;
3230
3231        /* Non-escape characters are interpreted as Unicode ordinals */
3232        if (*s != '\\') {
3233            *p++ = (unsigned char)*s++;
3234            continue;
3235        }
3236        startinpos = s-starts;
3237
3238        /* \u-escapes are only interpreted iff the number of leading
3239           backslashes if odd */
3240        bs = s;
3241        for (;s < end;) {
3242            if (*s != '\\')
3243                break;
3244            *p++ = (unsigned char)*s++;
3245        }
3246        if (((s - bs) & 1) == 0 ||
3247            s >= end ||
3248            (*s != 'u' && *s != 'U')) {
3249            continue;
3250        }
3251        p--;
3252        count = *s=='u' ? 4 : 8;
3253        s++;
3254
3255        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3256        outpos = p-PyUnicode_AS_UNICODE(v);
3257        for (x = 0, i = 0; i < count; ++i, ++s) {
3258            c = (unsigned char)*s;
3259            if (!isxdigit(c)) {
3260                endinpos = s-starts;
3261                if (unicode_decode_call_errorhandler(
3262                        errors, &errorHandler,
3263                        "rawunicodeescape", "truncated \\uXXXX",
3264                        starts, size, &startinpos, &endinpos, &exc, &s,
3265                        &v, &outpos, &p))
3266                    goto onError;
3267                goto nextByte;
3268            }
3269            x = (x<<4) & ~0xF;
3270            if (c >= '0' && c <= '9')
3271                x += c - '0';
3272            else if (c >= 'a' && c <= 'f')
3273                x += 10 + c - 'a';
3274            else
3275                x += 10 + c - 'A';
3276        }
3277        if (x <= 0xffff)
3278            /* UCS-2 character */
3279            *p++ = (Py_UNICODE) x;
3280        else if (x <= 0x10ffff) {
3281            /* UCS-4 character. Either store directly, or as
3282               surrogate pair. */
3283#ifdef Py_UNICODE_WIDE
3284            *p++ = (Py_UNICODE) x;
3285#else
3286            x -= 0x10000L;
3287            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3288            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3289#endif
3290        } else {
3291            endinpos = s-starts;
3292            outpos = p-PyUnicode_AS_UNICODE(v);
3293            if (unicode_decode_call_errorhandler(
3294                    errors, &errorHandler,
3295                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3296                    starts, size, &startinpos, &endinpos, &exc, &s,
3297                    &v, &outpos, &p))
3298                goto onError;
3299        }
3300      nextByte:
3301        ;
3302    }
3303    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3304        goto onError;
3305    Py_XDECREF(errorHandler);
3306    Py_XDECREF(exc);
3307    return (PyObject *)v;
3308
3309  onError:
3310    Py_XDECREF(v);
3311    Py_XDECREF(errorHandler);
3312    Py_XDECREF(exc);
3313    return NULL;
3314}
3315
3316PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3317                                           Py_ssize_t size)
3318{
3319    PyObject *repr;
3320    char *p;
3321    char *q;
3322
3323    static const char *hexdigit = "0123456789abcdef";
3324#ifdef Py_UNICODE_WIDE
3325    const Py_ssize_t expandsize = 10;
3326#else
3327    const Py_ssize_t expandsize = 6;
3328#endif
3329
3330    if (size > PY_SSIZE_T_MAX / expandsize)
3331        return PyErr_NoMemory();
3332
3333    repr = PyString_FromStringAndSize(NULL, expandsize * size);
3334    if (repr == NULL)
3335        return NULL;
3336    if (size == 0)
3337        return repr;
3338
3339    p = q = PyString_AS_STRING(repr);
3340    while (size-- > 0) {
3341        Py_UNICODE ch = *s++;
3342#ifdef Py_UNICODE_WIDE
3343        /* Map 32-bit characters to '\Uxxxxxxxx' */
3344        if (ch >= 0x10000) {
3345            *p++ = '\\';
3346            *p++ = 'U';
3347            *p++ = hexdigit[(ch >> 28) & 0xf];
3348            *p++ = hexdigit[(ch >> 24) & 0xf];
3349            *p++ = hexdigit[(ch >> 20) & 0xf];
3350            *p++ = hexdigit[(ch >> 16) & 0xf];
3351            *p++ = hexdigit[(ch >> 12) & 0xf];
3352            *p++ = hexdigit[(ch >> 8) & 0xf];
3353            *p++ = hexdigit[(ch >> 4) & 0xf];
3354            *p++ = hexdigit[ch & 15];
3355        }
3356        else
3357#else
3358            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3359            if (ch >= 0xD800 && ch < 0xDC00) {
3360                Py_UNICODE ch2;
3361                Py_UCS4 ucs;
3362
3363                ch2 = *s++;
3364                size--;
3365                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3366                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3367                    *p++ = '\\';
3368                    *p++ = 'U';
3369                    *p++ = hexdigit[(ucs >> 28) & 0xf];
3370                    *p++ = hexdigit[(ucs >> 24) & 0xf];
3371                    *p++ = hexdigit[(ucs >> 20) & 0xf];
3372                    *p++ = hexdigit[(ucs >> 16) & 0xf];
3373                    *p++ = hexdigit[(ucs >> 12) & 0xf];
3374                    *p++ = hexdigit[(ucs >> 8) & 0xf];
3375                    *p++ = hexdigit[(ucs >> 4) & 0xf];
3376                    *p++ = hexdigit[ucs & 0xf];
3377                    continue;
3378                }
3379                /* Fall through: isolated surrogates are copied as-is */
3380                s--;
3381                size++;
3382            }
3383#endif
3384        /* Map 16-bit characters to '\uxxxx' */
3385        if (ch >= 256) {
3386            *p++ = '\\';
3387            *p++ = 'u';
3388            *p++ = hexdigit[(ch >> 12) & 0xf];
3389            *p++ = hexdigit[(ch >> 8) & 0xf];
3390            *p++ = hexdigit[(ch >> 4) & 0xf];
3391            *p++ = hexdigit[ch & 15];
3392        }
3393        /* Copy everything else as-is */
3394        else
3395            *p++ = (char) ch;
3396    }
3397    *p = '\0';
3398    if (_PyString_Resize(&repr, p - q))
3399        return NULL;
3400    return repr;
3401}
3402
3403PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3404{
3405    if (!PyUnicode_Check(unicode)) {
3406        PyErr_BadArgument();
3407        return NULL;
3408    }
3409    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3410                                            PyUnicode_GET_SIZE(unicode));
3411}
3412
3413/* --- Unicode Internal Codec ------------------------------------------- */
3414
3415PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3416                                           Py_ssize_t size,
3417                                           const char *errors)
3418{
3419    const char *starts = s;
3420    Py_ssize_t startinpos;
3421    Py_ssize_t endinpos;
3422    Py_ssize_t outpos;
3423    PyUnicodeObject *v;
3424    Py_UNICODE *p;
3425    const char *end;
3426    const char *reason;
3427    PyObject *errorHandler = NULL;
3428    PyObject *exc = NULL;
3429
3430#ifdef Py_UNICODE_WIDE
3431    Py_UNICODE unimax = PyUnicode_GetMax();
3432#endif
3433
3434    /* XXX overflow detection missing */
3435    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3436    if (v == NULL)
3437        goto onError;
3438    if (PyUnicode_GetSize((PyObject *)v) == 0)
3439        return (PyObject *)v;
3440    p = PyUnicode_AS_UNICODE(v);
3441    end = s + size;
3442
3443    while (s < end) {
3444        if (end-s < Py_UNICODE_SIZE) {
3445            endinpos = end-starts;
3446            reason = "truncated input";
3447            goto error;
3448        }
3449        memcpy(p, s, sizeof(Py_UNICODE));
3450#ifdef Py_UNICODE_WIDE
3451        /* We have to sanity check the raw data, otherwise doom looms for
3452           some malformed UCS-4 data. */
3453        if (*p > unimax || *p < 0) {
3454            endinpos = s - starts + Py_UNICODE_SIZE;
3455            reason = "illegal code point (> 0x10FFFF)";
3456            goto error;
3457        }
3458#endif
3459        p++;
3460        s += Py_UNICODE_SIZE;
3461        continue;
3462
3463  error:
3464        startinpos = s - starts;
3465        outpos = p - PyUnicode_AS_UNICODE(v);
3466        if (unicode_decode_call_errorhandler(
3467                errors, &errorHandler,
3468                "unicode_internal", reason,
3469                starts, size, &startinpos, &endinpos, &exc, &s,
3470                &v, &outpos, &p)) {
3471            goto onError;
3472        }
3473    }
3474
3475    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3476        goto onError;
3477    Py_XDECREF(errorHandler);
3478    Py_XDECREF(exc);
3479    return (PyObject *)v;
3480
3481  onError:
3482    Py_XDECREF(v);
3483    Py_XDECREF(errorHandler);
3484    Py_XDECREF(exc);
3485    return NULL;
3486}
3487
3488/* --- Latin-1 Codec ------------------------------------------------------ */
3489
3490PyObject *PyUnicode_DecodeLatin1(const char *s,
3491                                 Py_ssize_t size,
3492                                 const char *errors)
3493{
3494    PyUnicodeObject *v;
3495    Py_UNICODE *p;
3496
3497    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3498    if (size == 1) {
3499        Py_UNICODE r = *(unsigned char*)s;
3500        return PyUnicode_FromUnicode(&r, 1);
3501    }
3502
3503    v = _PyUnicode_New(size);
3504    if (v == NULL)
3505        goto onError;
3506    if (size == 0)
3507        return (PyObject *)v;
3508    p = PyUnicode_AS_UNICODE(v);
3509    while (size-- > 0)
3510        *p++ = (unsigned char)*s++;
3511    return (PyObject *)v;
3512
3513  onError:
3514    Py_XDECREF(v);
3515    return NULL;
3516}
3517
3518/* create or adjust a UnicodeEncodeError */
3519static void make_encode_exception(PyObject **exceptionObject,
3520                                  const char *encoding,
3521                                  const Py_UNICODE *unicode, Py_ssize_t size,
3522                                  Py_ssize_t startpos, Py_ssize_t endpos,
3523                                  const char *reason)
3524{
3525    if (*exceptionObject == NULL) {
3526        *exceptionObject = PyUnicodeEncodeError_Create(
3527            encoding, unicode, size, startpos, endpos, reason);
3528    }
3529    else {
3530        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3531            goto onError;
3532        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3533            goto onError;
3534        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3535            goto onError;
3536        return;
3537      onError:
3538        Py_CLEAR(*exceptionObject);
3539    }
3540}
3541
3542/* raises a UnicodeEncodeError */
3543static void raise_encode_exception(PyObject **exceptionObject,
3544                                   const char *encoding,
3545                                   const Py_UNICODE *unicode, Py_ssize_t size,
3546                                   Py_ssize_t startpos, Py_ssize_t endpos,
3547                                   const char *reason)
3548{
3549    make_encode_exception(exceptionObject,
3550                          encoding, unicode, size, startpos, endpos, reason);
3551    if (*exceptionObject != NULL)
3552        PyCodec_StrictErrors(*exceptionObject);
3553}
3554
3555/* error handling callback helper:
3556   build arguments, call the callback and check the arguments,
3557   put the result into newpos and return the replacement string, which
3558   has to be freed by the caller */
3559static PyObject *unicode_encode_call_errorhandler(const char *errors,
3560                                                  PyObject **errorHandler,
3561                                                  const char *encoding, const char *reason,
3562                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3563                                                  Py_ssize_t startpos, Py_ssize_t endpos,
3564                                                  Py_ssize_t *newpos)
3565{
3566    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3567
3568    PyObject *restuple;
3569    PyObject *resunicode;
3570
3571    if (*errorHandler == NULL) {
3572        *errorHandler = PyCodec_LookupError(errors);
3573        if (*errorHandler == NULL)
3574            return NULL;
3575    }
3576
3577    make_encode_exception(exceptionObject,
3578                          encoding, unicode, size, startpos, endpos, reason);
3579    if (*exceptionObject == NULL)
3580        return NULL;
3581
3582    restuple = PyObject_CallFunctionObjArgs(
3583        *errorHandler, *exceptionObject, NULL);
3584    if (restuple == NULL)
3585        return NULL;
3586    if (!PyTuple_Check(restuple)) {
3587        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3588        Py_DECREF(restuple);
3589        return NULL;
3590    }
3591    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3592                          &resunicode, newpos)) {
3593        Py_DECREF(restuple);
3594        return NULL;
3595    }
3596    if (*newpos<0)
3597        *newpos = size+*newpos;
3598    if (*newpos<0 || *newpos>size) {
3599        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3600        Py_DECREF(restuple);
3601        return NULL;
3602    }
3603    Py_INCREF(resunicode);
3604    Py_DECREF(restuple);
3605    return resunicode;
3606}
3607
3608static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3609                                     Py_ssize_t size,
3610                                     const char *errors,
3611                                     int limit)
3612{
3613    /* output object */
3614    PyObject *res;
3615    /* pointers to the beginning and end+1 of input */
3616    const Py_UNICODE *startp = p;
3617    const Py_UNICODE *endp = p + size;
3618    /* pointer to the beginning of the unencodable characters */
3619    /* const Py_UNICODE *badp = NULL; */
3620    /* pointer into the output */
3621    char *str;
3622    /* current output position */
3623    Py_ssize_t respos = 0;
3624    Py_ssize_t ressize;
3625    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3626    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3627    PyObject *errorHandler = NULL;
3628    PyObject *exc = NULL;
3629    /* the following variable is used for caching string comparisons
3630     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3631    int known_errorHandler = -1;
3632
3633    /* allocate enough for a simple encoding without
3634       replacements, if we need more, we'll resize */
3635    res = PyString_FromStringAndSize(NULL, size);
3636    if (res == NULL)
3637        goto onError;
3638    if (size == 0)
3639        return res;
3640    str = PyString_AS_STRING(res);
3641    ressize = size;
3642
3643    while (p<endp) {
3644        Py_UNICODE c = *p;
3645
3646        /* can we encode this? */
3647        if (c<limit) {
3648            /* no overflow check, because we know that the space is enough */
3649            *str++ = (char)c;
3650            ++p;
3651        }
3652        else {
3653            Py_ssize_t unicodepos = p-startp;
3654            Py_ssize_t requiredsize;
3655            PyObject *repunicode;
3656            Py_ssize_t repsize;
3657            Py_ssize_t newpos;
3658            Py_ssize_t respos;
3659            Py_UNICODE *uni2;
3660            /* startpos for collecting unencodable chars */
3661            const Py_UNICODE *collstart = p;
3662            const Py_UNICODE *collend = p;
3663            /* find all unecodable characters */
3664            while ((collend < endp) && ((*collend) >= limit))
3665                ++collend;
3666            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3667            if (known_errorHandler==-1) {
3668                if ((errors==NULL) || (!strcmp(errors, "strict")))
3669                    known_errorHandler = 1;
3670                else if (!strcmp(errors, "replace"))
3671                    known_errorHandler = 2;
3672                else if (!strcmp(errors, "ignore"))
3673                    known_errorHandler = 3;
3674                else if (!strcmp(errors, "xmlcharrefreplace"))
3675                    known_errorHandler = 4;
3676                else
3677                    known_errorHandler = 0;
3678            }
3679            switch (known_errorHandler) {
3680            case 1: /* strict */
3681                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3682                goto onError;
3683            case 2: /* replace */
3684                while (collstart++ < collend)
3685                    *str++ = '?'; /* fall through */
3686            case 3: /* ignore */
3687                p = collend;
3688                break;
3689            case 4: /* xmlcharrefreplace */
3690                respos = str - PyString_AS_STRING(res);
3691                /* determine replacement size (temporarily (mis)uses p) */
3692                requiredsize = respos;
3693                for (p = collstart; p < collend;) {
3694                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3695                    Py_ssize_t incr;
3696                    if (ch < 10)
3697                        incr = 2+1+1;
3698                    else if (ch < 100)
3699                        incr = 2+2+1;
3700                    else if (ch < 1000)
3701                        incr = 2+3+1;
3702                    else if (ch < 10000)
3703                        incr = 2+4+1;
3704                    else if (ch < 100000)
3705                        incr = 2+5+1;
3706                    else if (ch < 1000000)
3707                        incr = 2+6+1;
3708                    else
3709                        incr = 2+7+1;
3710                    if (requiredsize > PY_SSIZE_T_MAX - incr)
3711                        goto overflow;
3712                    requiredsize += incr;
3713                }
3714                if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3715                    goto overflow;
3716                requiredsize += endp - collend;
3717                if (requiredsize > ressize) {
3718                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3719                        requiredsize = 2*ressize;
3720                    if (_PyString_Resize(&res, requiredsize))
3721                        goto onError;
3722                    str = PyString_AS_STRING(res) + respos;
3723                    ressize = requiredsize;
3724                }
3725                /* generate replacement (temporarily (mis)uses p) */
3726                for (p = collstart; p < collend;) {
3727                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
3728                    str += sprintf(str, "&#%d;", (int)ch);
3729                }
3730                p = collend;
3731                break;
3732            default:
3733                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3734                                                              encoding, reason, startp, size, &exc,
3735                                                              collstart-startp, collend-startp, &newpos);
3736                if (repunicode == NULL)
3737                    goto onError;
3738                /* need more space? (at least enough for what we have+the
3739                   replacement+the rest of the string, so we won't have to
3740                   check space for encodable characters) */
3741                respos = str - PyString_AS_STRING(res);
3742                repsize = PyUnicode_GET_SIZE(repunicode);
3743                if (respos > PY_SSIZE_T_MAX - repsize)
3744                    goto overflow;
3745                requiredsize = respos + repsize;
3746                if (requiredsize > PY_SSIZE_T_MAX - (endp - collend))
3747                    goto overflow;
3748                requiredsize += endp - collend;
3749                if (requiredsize > ressize) {
3750                    if (ressize <= PY_SSIZE_T_MAX/2 && requiredsize < 2*ressize)
3751                        requiredsize = 2*ressize;
3752                    if (_PyString_Resize(&res, requiredsize)) {
3753                        Py_DECREF(repunicode);
3754                        goto onError;
3755                    }
3756                    str = PyString_AS_STRING(res) + respos;
3757                    ressize = requiredsize;
3758                }
3759                /* check if there is anything unencodable in the replacement
3760                   and copy it to the output */
3761                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2, ++str) {
3762                    c = *uni2;
3763                    if (c >= limit) {
3764                        raise_encode_exception(&exc, encoding, startp, size,
3765                                               unicodepos, unicodepos+1, reason);
3766                        Py_DECREF(repunicode);
3767                        goto onError;
3768                    }
3769                    *str = (char)c;
3770                }
3771                p = startp + newpos;
3772                Py_DECREF(repunicode);
3773            }
3774        }
3775    }
3776    /* Resize if we allocated to much */
3777    respos = str - PyString_AS_STRING(res);
3778    if (respos < ressize)
3779        /* If this falls res will be NULL */
3780        _PyString_Resize(&res, respos);
3781    Py_XDECREF(errorHandler);
3782    Py_XDECREF(exc);
3783    return res;
3784
3785  overflow:
3786    PyErr_SetString(PyExc_OverflowError,
3787                    "encoded result is too long for a Python string");
3788
3789  onError:
3790    Py_XDECREF(res);
3791    Py_XDECREF(errorHandler);
3792    Py_XDECREF(exc);
3793    return NULL;
3794}
3795
3796PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
3797                                 Py_ssize_t size,
3798                                 const char *errors)
3799{
3800    return unicode_encode_ucs1(p, size, errors, 256);
3801}
3802
3803PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3804{
3805    if (!PyUnicode_Check(unicode)) {
3806        PyErr_BadArgument();
3807        return NULL;
3808    }
3809    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3810                                  PyUnicode_GET_SIZE(unicode),
3811                                  NULL);
3812}
3813
3814/* --- 7-bit ASCII Codec -------------------------------------------------- */
3815
3816PyObject *PyUnicode_DecodeASCII(const char *s,
3817                                Py_ssize_t size,
3818                                const char *errors)
3819{
3820    const char *starts = s;
3821    PyUnicodeObject *v;
3822    Py_UNICODE *p;
3823    Py_ssize_t startinpos;
3824    Py_ssize_t endinpos;
3825    Py_ssize_t outpos;
3826    const char *e;
3827    PyObject *errorHandler = NULL;
3828    PyObject *exc = NULL;
3829
3830    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3831    if (size == 1 && *(unsigned char*)s < 128) {
3832        Py_UNICODE r = *(unsigned char*)s;
3833        return PyUnicode_FromUnicode(&r, 1);
3834    }
3835
3836    v = _PyUnicode_New(size);
3837    if (v == NULL)
3838        goto onError;
3839    if (size == 0)
3840        return (PyObject *)v;
3841    p = PyUnicode_AS_UNICODE(v);
3842    e = s + size;
3843    while (s < e) {
3844        register unsigned char c = (unsigned char)*s;
3845        if (c < 128) {
3846            *p++ = c;
3847            ++s;
3848        }
3849        else {
3850            startinpos = s-starts;
3851            endinpos = startinpos + 1;
3852            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3853            if (unicode_decode_call_errorhandler(
3854                    errors, &errorHandler,
3855                    "ascii", "ordinal not in range(128)",
3856                    starts, size, &startinpos, &endinpos, &exc, &s,
3857                    &v, &outpos, &p))
3858                goto onError;
3859        }
3860    }
3861    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3862        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3863            goto onError;
3864    Py_XDECREF(errorHandler);
3865    Py_XDECREF(exc);
3866    return (PyObject *)v;
3867
3868  onError:
3869    Py_XDECREF(v);
3870    Py_XDECREF(errorHandler);
3871    Py_XDECREF(exc);
3872    return NULL;
3873}
3874
3875PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
3876                                Py_ssize_t size,
3877                                const char *errors)
3878{
3879    return unicode_encode_ucs1(p, size, errors, 128);
3880}
3881
3882PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3883{
3884    if (!PyUnicode_Check(unicode)) {
3885        PyErr_BadArgument();
3886        return NULL;
3887    }
3888    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3889                                 PyUnicode_GET_SIZE(unicode),
3890                                 NULL);
3891}
3892
3893#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3894
3895/* --- MBCS codecs for Windows -------------------------------------------- */
3896
3897#if SIZEOF_INT < SIZEOF_SIZE_T
3898#define NEED_RETRY
3899#endif
3900
3901/* XXX This code is limited to "true" double-byte encodings, as
3902   a) it assumes an incomplete character consists of a single byte, and
3903   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3904   encodings, see IsDBCSLeadByteEx documentation. */
3905
/* Return 1 if the byte at s[offset] is treated as a DBCS lead byte,
   0 otherwise.  The byte must pass IsDBCSLeadByte(); the character
   preceding it (as found by CharPrev) is then inspected to decide. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev returned the same position */
    if (prev == curr)
        return 1;
    /* previous character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* previous character is exactly two bytes long */
    return (curr - prev == 2);
}
3916
3917/*
3918 * Decode MBCS string into unicode object. If 'final' is set, converts
3919 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3920 */
3921static int decode_mbcs(PyUnicodeObject **v,
3922                       const char *s, /* MBCS string */
3923                       int size, /* sizeof MBCS string */
3924                       int final)
3925{
3926    Py_UNICODE *p;
3927    Py_ssize_t n = 0;
3928    int usize = 0;
3929
3930    assert(size >= 0);
3931
3932    /* Skip trailing lead-byte unless 'final' is set */
3933    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3934        --size;
3935
3936    /* First get the size of the result */
3937    if (size > 0) {
3938        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3939        if (usize == 0) {
3940            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3941            return -1;
3942        }
3943    }
3944
3945    if (*v == NULL) {
3946        /* Create unicode object */
3947        *v = _PyUnicode_New(usize);
3948        if (*v == NULL)
3949            return -1;
3950    }
3951    else {
3952        /* Extend unicode object */
3953        n = PyUnicode_GET_SIZE(*v);
3954        if (_PyUnicode_Resize(v, n + usize) < 0)
3955            return -1;
3956    }
3957
3958    /* Do the conversion */
3959    if (size > 0) {
3960        p = PyUnicode_AS_UNICODE(*v) + n;
3961        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3962            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3963            return -1;
3964        }
3965    }
3966
3967    return size;
3968}
3969
3970PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3971                                       Py_ssize_t size,
3972                                       const char *errors,
3973                                       Py_ssize_t *consumed)
3974{
3975    PyUnicodeObject *v = NULL;
3976    int done;
3977
3978    if (consumed)
3979        *consumed = 0;
3980
3981#ifdef NEED_RETRY
3982  retry:
3983    if (size > INT_MAX)
3984        done = decode_mbcs(&v, s, INT_MAX, 0);
3985    else
3986#endif
3987        done = decode_mbcs(&v, s, (int)size, !consumed);
3988
3989    if (done < 0) {
3990        Py_XDECREF(v);
3991        return NULL;
3992    }
3993
3994    if (consumed)
3995        *consumed += done;
3996
3997#ifdef NEED_RETRY
3998    if (size > INT_MAX) {
3999        s += done;
4000        size -= done;
4001        goto retry;
4002    }
4003#endif
4004
4005    return (PyObject *)v;
4006}
4007
4008PyObject *PyUnicode_DecodeMBCS(const char *s,
4009                               Py_ssize_t size,
4010                               const char *errors)
4011{
4012    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4013}
4014
4015/*
4016 * Convert unicode into string object (MBCS).
4017 * Returns 0 if succeed, -1 otherwise.
4018 */
4019static int encode_mbcs(PyObject **repr,
4020                       const Py_UNICODE *p, /* unicode */
4021                       int size) /* size of unicode */
4022{
4023    int mbcssize = 0;
4024    Py_ssize_t n = 0;
4025
4026    assert(size >= 0);
4027
4028    /* First get the size of the result */
4029    if (size > 0) {
4030        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
4031        if (mbcssize == 0) {
4032            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4033            return -1;
4034        }
4035    }
4036
4037    if (*repr == NULL) {
4038        /* Create string object */
4039        *repr = PyString_FromStringAndSize(NULL, mbcssize);
4040        if (*repr == NULL)
4041            return -1;
4042    }
4043    else {
4044        /* Extend string object */
4045        n = PyString_Size(*repr);
4046        if (_PyString_Resize(repr, n + mbcssize) < 0)
4047            return -1;
4048    }
4049
4050    /* Do the conversion */
4051    if (size > 0) {
4052        char *s = PyString_AS_STRING(*repr) + n;
4053        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
4054            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4055            return -1;
4056        }
4057    }
4058
4059    return 0;
4060}
4061
4062PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4063                               Py_ssize_t size,
4064                               const char *errors)
4065{
4066    PyObject *repr = NULL;
4067    int ret;
4068
4069#ifdef NEED_RETRY
4070  retry:
4071    if (size > INT_MAX)
4072        ret = encode_mbcs(&repr, p, INT_MAX);
4073    else
4074#endif
4075        ret = encode_mbcs(&repr, p, (int)size);
4076
4077    if (ret < 0) {
4078        Py_XDECREF(repr);
4079        return NULL;
4080    }
4081
4082#ifdef NEED_RETRY
4083    if (size > INT_MAX) {
4084        p += INT_MAX;
4085        size -= INT_MAX;
4086        goto retry;
4087    }
4088#endif
4089
4090    return repr;
4091}
4092
4093PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4094{
4095    if (!PyUnicode_Check(unicode)) {
4096        PyErr_BadArgument();
4097        return NULL;
4098    }
4099    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4100                                PyUnicode_GET_SIZE(unicode),
4101                                NULL);
4102}
4103
4104#undef NEED_RETRY
4105
4106#endif /* MS_WINDOWS */
4107
4108/* --- Character Mapping Codec -------------------------------------------- */
4109
4110PyObject *PyUnicode_DecodeCharmap(const char *s,
4111                                  Py_ssize_t size,
4112                                  PyObject *mapping,
4113                                  const char *errors)
4114{
4115    const char *starts = s;
4116    Py_ssize_t startinpos;
4117    Py_ssize_t endinpos;
4118    Py_ssize_t outpos;
4119    const char *e;
4120    PyUnicodeObject *v;
4121    Py_UNICODE *p;
4122    Py_ssize_t extrachars = 0;
4123    PyObject *errorHandler = NULL;
4124    PyObject *exc = NULL;
4125    Py_UNICODE *mapstring = NULL;
4126    Py_ssize_t maplen = 0;
4127
4128    /* Default to Latin-1 */
4129    if (mapping == NULL)
4130        return PyUnicode_DecodeLatin1(s, size, errors);
4131
4132    v = _PyUnicode_New(size);
4133    if (v == NULL)
4134        goto onError;
4135    if (size == 0)
4136        return (PyObject *)v;
4137    p = PyUnicode_AS_UNICODE(v);
4138    e = s + size;
4139    if (PyUnicode_CheckExact(mapping)) {
4140        mapstring = PyUnicode_AS_UNICODE(mapping);
4141        maplen = PyUnicode_GET_SIZE(mapping);
4142        while (s < e) {
4143            unsigned char ch = *s;
4144            Py_UNICODE x = 0xfffe; /* illegal value */
4145
4146            if (ch < maplen)
4147                x = mapstring[ch];
4148
4149            if (x == 0xfffe) {
4150                /* undefined mapping */
4151                outpos = p-PyUnicode_AS_UNICODE(v);
4152                startinpos = s-starts;
4153                endinpos = startinpos+1;
4154                if (unicode_decode_call_errorhandler(
4155                        errors, &errorHandler,
4156                        "charmap", "character maps to <undefined>",
4157                        starts, size, &startinpos, &endinpos, &exc, &s,
4158                        &v, &outpos, &p)) {
4159                    goto onError;
4160                }
4161                continue;
4162            }
4163            *p++ = x;
4164            ++s;
4165        }
4166    }
4167    else {
4168        while (s < e) {
4169            unsigned char ch = *s;
4170            PyObject *w, *x;
4171
4172            /* Get mapping (char ordinal -> integer, Unicode char or None) */
4173            w = PyInt_FromLong((long)ch);
4174            if (w == NULL)
4175                goto onError;
4176            x = PyObject_GetItem(mapping, w);
4177            Py_DECREF(w);
4178            if (x == NULL) {
4179                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4180                    /* No mapping found means: mapping is undefined. */
4181                    PyErr_Clear();
4182                    goto Undefined;
4183                } else
4184                    goto onError;
4185            }
4186
4187            /* Apply mapping */
4188            if (x == Py_None)
4189                goto Undefined;
4190            if (PyInt_Check(x)) {
4191                long value = PyInt_AS_LONG(x);
4192                if (value == 0xFFFE)
4193                    goto Undefined;
4194                if (value < 0 || value > 0x10FFFF) {
4195                    PyErr_SetString(PyExc_TypeError,
4196                                    "character mapping must be in range(0x110000)");
4197                    Py_DECREF(x);
4198                    goto onError;
4199                }
4200
4201#ifndef Py_UNICODE_WIDE
4202                if (value > 0xFFFF) {
4203                    /* see the code for 1-n mapping below */
4204                    if (extrachars < 2) {
4205                        /* resize first */
4206                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4207                        Py_ssize_t needed = 10 - extrachars;
4208                        extrachars += needed;
4209                        /* XXX overflow detection missing */
4210                        if (_PyUnicode_Resize(&v,
4211                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4212                            Py_DECREF(x);
4213                            goto onError;
4214                        }
4215                        p = PyUnicode_AS_UNICODE(v) + oldpos;
4216                    }
4217                    value -= 0x10000;
4218                    *p++ = 0xD800 | (value >> 10);
4219                    *p++ = 0xDC00 | (value & 0x3FF);
4220                    extrachars -= 2;
4221                }
4222                else
4223#endif
4224                *p++ = (Py_UNICODE)value;
4225            }
4226            else if (PyUnicode_Check(x)) {
4227                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4228
4229                if (targetsize == 1) {
4230                    /* 1-1 mapping */
4231                    Py_UNICODE value = *PyUnicode_AS_UNICODE(x);
4232                    if (value == 0xFFFE)
4233                        goto Undefined;
4234                    *p++ = value;
4235                }
4236                else if (targetsize > 1) {
4237                    /* 1-n mapping */
4238                    if (targetsize > extrachars) {
4239                        /* resize first */
4240                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4241                        Py_ssize_t needed = (targetsize - extrachars) + \
4242                            (targetsize << 2);
4243                        extrachars += needed;
4244                        /* XXX overflow detection missing */
4245                        if (_PyUnicode_Resize(&v,
4246                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4247                            Py_DECREF(x);
4248                            goto onError;
4249                        }
4250                        p = PyUnicode_AS_UNICODE(v) + oldpos;
4251                    }
4252                    Py_UNICODE_COPY(p,
4253                                    PyUnicode_AS_UNICODE(x),
4254                                    targetsize);
4255                    p += targetsize;
4256                    extrachars -= targetsize;
4257                }
4258                /* 1-0 mapping: skip the character */
4259            }
4260            else {
4261                /* wrong return value */
4262                PyErr_SetString(PyExc_TypeError,
4263                                "character mapping must return integer, None or unicode");
4264                Py_DECREF(x);
4265                goto onError;
4266            }
4267            Py_DECREF(x);
4268            ++s;
4269            continue;
4270Undefined:
4271            /* undefined mapping */
4272            Py_XDECREF(x);
4273            outpos = p-PyUnicode_AS_UNICODE(v);
4274            startinpos = s-starts;
4275            endinpos = startinpos+1;
4276            if (unicode_decode_call_errorhandler(
4277                    errors, &errorHandler,
4278                    "charmap", "character maps to <undefined>",
4279                    starts, size, &startinpos, &endinpos, &exc, &s,
4280                    &v, &outpos, &p)) {
4281                goto onError;
4282            }
4283        }
4284    }
4285    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4286        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4287            goto onError;
4288    Py_XDECREF(errorHandler);
4289    Py_XDECREF(exc);
4290    return (PyObject *)v;
4291
4292  onError:
4293    Py_XDECREF(errorHandler);
4294    Py_XDECREF(exc);
4295    Py_XDECREF(v);
4296    return NULL;
4297}
4298
/* Charmap encoding: the lookup table */

/* Three-level trie mapping a BMP character to an output byte.
   The character is split as (c >> 11), ((c >> 7) & 0xF), (c & 0x7F);
   see PyUnicode_BuildEncodingMap() and encoding_map_lookup(). */
struct encoding_map{
    PyObject_HEAD
    /* level 1: indexed by (c >> 11); holds the number of a level-2 block,
       or 0xFF when no such block exists */
    unsigned char level1[32];
    /* number of 16-byte level-2 blocks and of 128-byte level-3 blocks */
    int count2, count3;
    /* variable-sized tail (allocated larger than declared): 16*count2
       level-2 bytes followed by 128*count3 level-3 bytes.  A level-2
       entry of 0xFF and a level-3 entry of 0 both mean "unmapped". */
    unsigned char level23[1];
};
4307
4308static PyObject*
4309encoding_map_size(PyObject *obj, PyObject* args)
4310{
4311    struct encoding_map *map = (struct encoding_map*)obj;
4312    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4313                          128*map->count3);
4314}
4315
/* Method table for EncodingMapType: a single no-argument "size" method,
   terminated by a zeroed sentinel entry. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4321
/* tp_dealloc slot of EncodingMapType: release the object's memory.
   The map is a single allocation (see PyUnicode_BuildEncodingMap), so
   one PyObject_FREE call is all that is done here. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4327
/* Type object for the trie built by PyUnicode_BuildEncodingMap().
   Only the deallocator and the method table are filled in; every other
   slot is left at 0.  tp_itemsize is 0, so the variable-sized tail of
   struct encoding_map is not described by the type object. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4371
4372PyObject*
4373PyUnicode_BuildEncodingMap(PyObject* string)
4374{
4375    Py_UNICODE *decode;
4376    PyObject *result;
4377    struct encoding_map *mresult;
4378    int i;
4379    int need_dict = 0;
4380    unsigned char level1[32];
4381    unsigned char level2[512];
4382    unsigned char *mlevel1, *mlevel2, *mlevel3;
4383    int count2 = 0, count3 = 0;
4384
4385    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4386        PyErr_BadArgument();
4387        return NULL;
4388    }
4389    decode = PyUnicode_AS_UNICODE(string);
4390    memset(level1, 0xFF, sizeof level1);
4391    memset(level2, 0xFF, sizeof level2);
4392
4393    /* If there isn't a one-to-one mapping of NULL to \0,
4394       or if there are non-BMP characters, we need to use
4395       a mapping dictionary. */
4396    if (decode[0] != 0)
4397        need_dict = 1;
4398    for (i = 1; i < 256; i++) {
4399        int l1, l2;
4400        if (decode[i] == 0
4401#ifdef Py_UNICODE_WIDE
4402            || decode[i] > 0xFFFF
4403#endif
4404            ) {
4405            need_dict = 1;
4406            break;
4407        }
4408        if (decode[i] == 0xFFFE)
4409            /* unmapped character */
4410            continue;
4411        l1 = decode[i] >> 11;
4412        l2 = decode[i] >> 7;
4413        if (level1[l1] == 0xFF)
4414            level1[l1] = count2++;
4415        if (level2[l2] == 0xFF)
4416            level2[l2] = count3++;
4417    }
4418
4419    if (count2 >= 0xFF || count3 >= 0xFF)
4420        need_dict = 1;
4421
4422    if (need_dict) {
4423        PyObject *result = PyDict_New();
4424        PyObject *key, *value;
4425        if (!result)
4426            return NULL;
4427        for (i = 0; i < 256; i++) {
4428            value = NULL;
4429            key = PyInt_FromLong(decode[i]);
4430            value = PyInt_FromLong(i);
4431            if (!key || !value)
4432                goto failed1;
4433            if (PyDict_SetItem(result, key, value) == -1)
4434                goto failed1;
4435            Py_DECREF(key);
4436            Py_DECREF(value);
4437        }
4438        return result;
4439      failed1:
4440        Py_XDECREF(key);
4441        Py_XDECREF(value);
4442        Py_DECREF(result);
4443        return NULL;
4444    }
4445
4446    /* Create a three-level trie */
4447    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4448                             16*count2 + 128*count3 - 1);
4449    if (!result)
4450        return PyErr_NoMemory();
4451    PyObject_Init(result, &EncodingMapType);
4452    mresult = (struct encoding_map*)result;
4453    mresult->count2 = count2;
4454    mresult->count3 = count3;
4455    mlevel1 = mresult->level1;
4456    mlevel2 = mresult->level23;
4457    mlevel3 = mresult->level23 + 16*count2;
4458    memcpy(mlevel1, level1, 32);
4459    memset(mlevel2, 0xFF, 16*count2);
4460    memset(mlevel3, 0, 128*count3);
4461    count3 = 0;
4462    for (i = 1; i < 256; i++) {
4463        int o1, o2, o3, i2, i3;
4464        if (decode[i] == 0xFFFE)
4465            /* unmapped character */
4466            continue;
4467        o1 = decode[i]>>11;
4468        o2 = (decode[i]>>7) & 0xF;
4469        i2 = 16*mlevel1[o1] + o2;
4470        if (mlevel2[i2] == 0xFF)
4471            mlevel2[i2] = count3++;
4472        o3 = decode[i] & 0x7F;
4473        i3 = 128*mlevel2[i2] + o3;
4474        mlevel3[i3] = i;
4475    }
4476    return result;
4477}
4478
4479static int
4480encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4481{
4482    struct encoding_map *map = (struct encoding_map*)mapping;
4483    int l1 = c>>11;
4484    int l2 = (c>>7) & 0xF;
4485    int l3 = c & 0x7F;
4486    int i;
4487
4488#ifdef Py_UNICODE_WIDE
4489    if (c > 0xFFFF) {
4490        return -1;
4491    }
4492#endif
4493    if (c == 0)
4494        return 0;
4495    /* level 1*/
4496    i = map->level1[l1];
4497    if (i == 0xFF) {
4498        return -1;
4499    }
4500    /* level 2*/
4501    i = map->level23[16*i+l2];
4502    if (i == 0xFF) {
4503        return -1;
4504    }
4505    /* level 3 */
4506    i = map->level23[16*map->count2 + 128*i + l3];
4507    if (i == 0) {
4508        return -1;
4509    }
4510    return i;
4511}
4512
4513/* Lookup the character ch in the mapping. If the character
4514   can't be found, Py_None is returned (or NULL, if another
4515   error occurred). */
4516static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4517{
4518    PyObject *w = PyInt_FromLong((long)c);
4519    PyObject *x;
4520
4521    if (w == NULL)
4522        return NULL;
4523    x = PyObject_GetItem(mapping, w);
4524    Py_DECREF(w);
4525    if (x == NULL) {
4526        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4527            /* No mapping found means: mapping is undefined. */
4528            PyErr_Clear();
4529            x = Py_None;
4530            Py_INCREF(x);
4531            return x;
4532        } else
4533            return NULL;
4534    }
4535    else if (x == Py_None)
4536        return x;
4537    else if (PyInt_Check(x)) {
4538        long value = PyInt_AS_LONG(x);
4539        if (value < 0 || value > 255) {
4540            PyErr_SetString(PyExc_TypeError,
4541                            "character mapping must be in range(256)");
4542            Py_DECREF(x);
4543            return NULL;
4544        }
4545        return x;
4546    }
4547    else if (PyString_Check(x))
4548        return x;
4549    else {
4550        /* wrong return value */
4551        PyErr_SetString(PyExc_TypeError,
4552                        "character mapping must return integer, None or str");
4553        Py_DECREF(x);
4554        return NULL;
4555    }
4556}
4557
4558static int
4559charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4560{
4561    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4562    /* exponentially overallocate to minimize reallocations */
4563    if (requiredsize < 2*outsize)
4564        requiredsize = 2*outsize;
4565    if (_PyString_Resize(outobj, requiredsize)) {
4566        return 0;
4567    }
4568    return 1;
4569}
4570
/* Result of charmapencode_output():
   enc_SUCCESS   - the replacement was written to the output string;
   enc_FAILED    - the mapping has no entry for the character, nothing
                   was written;
   enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
4574/* lookup the character, put the result in the output string and adjust
4575   various state variables. Reallocate the output string if not enough
4576   space is available. Return a new reference to the object that
4577   was put in the output buffer, or Py_None, if the mapping was undefined
4578   (in which case no character was written) or NULL, if a
4579   reallocation error occurred. The caller must decref the result */
4580static
4581charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4582                                          PyObject **outobj, Py_ssize_t *outpos)
4583{
4584    PyObject *rep;
4585    char *outstart;
4586    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4587
4588    if (Py_TYPE(mapping) == &EncodingMapType) {
4589        int res = encoding_map_lookup(c, mapping);
4590        Py_ssize_t requiredsize = *outpos+1;
4591        if (res == -1)
4592            return enc_FAILED;
4593        if (outsize<requiredsize)
4594            if (!charmapencode_resize(outobj, outpos, requiredsize))
4595                return enc_EXCEPTION;
4596        outstart = PyString_AS_STRING(*outobj);
4597        outstart[(*outpos)++] = (char)res;
4598        return enc_SUCCESS;
4599    }
4600
4601    rep = charmapencode_lookup(c, mapping);
4602    if (rep==NULL)
4603        return enc_EXCEPTION;
4604    else if (rep==Py_None) {
4605        Py_DECREF(rep);
4606        return enc_FAILED;
4607    } else {
4608        if (PyInt_Check(rep)) {
4609            Py_ssize_t requiredsize = *outpos+1;
4610            if (outsize<requiredsize)
4611                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4612                    Py_DECREF(rep);
4613                    return enc_EXCEPTION;
4614                }
4615            outstart = PyString_AS_STRING(*outobj);
4616            outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4617        }
4618        else {
4619            const char *repchars = PyString_AS_STRING(rep);
4620            Py_ssize_t repsize = PyString_GET_SIZE(rep);
4621            Py_ssize_t requiredsize = *outpos+repsize;
4622            if (outsize<requiredsize)
4623                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4624                    Py_DECREF(rep);
4625                    return enc_EXCEPTION;
4626                }
4627            outstart = PyString_AS_STRING(*outobj);
4628            memcpy(outstart + *outpos, repchars, repsize);
4629            *outpos += repsize;
4630        }
4631    }
4632    Py_DECREF(rep);
4633    return enc_SUCCESS;
4634}
4635
4636/* handle an error in PyUnicode_EncodeCharmap
4637   Return 0 on success, -1 on error */
4638static
4639int charmap_encoding_error(
4640    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4641    PyObject **exceptionObject,
4642    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4643    PyObject **res, Py_ssize_t *respos)
4644{
4645    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4646    Py_ssize_t repsize;
4647    Py_ssize_t newpos;
4648    Py_UNICODE *uni2;
4649    /* startpos for collecting unencodable chars */
4650    Py_ssize_t collstartpos = *inpos;
4651    Py_ssize_t collendpos = *inpos+1;
4652    Py_ssize_t collpos;
4653    char *encoding = "charmap";
4654    char *reason = "character maps to <undefined>";
4655    charmapencode_result x;
4656
4657    /* find all unencodable characters */
4658    while (collendpos < size) {
4659        PyObject *rep;
4660        if (Py_TYPE(mapping) == &EncodingMapType) {
4661            int res = encoding_map_lookup(p[collendpos], mapping);
4662            if (res != -1)
4663                break;
4664            ++collendpos;
4665            continue;
4666        }
4667
4668        rep = charmapencode_lookup(p[collendpos], mapping);
4669        if (rep==NULL)
4670            return -1;
4671        else if (rep!=Py_None) {
4672            Py_DECREF(rep);
4673            break;
4674        }
4675        Py_DECREF(rep);
4676        ++collendpos;
4677    }
4678    /* cache callback name lookup
4679     * (if not done yet, i.e. it's the first error) */
4680    if (*known_errorHandler==-1) {
4681        if ((errors==NULL) || (!strcmp(errors, "strict")))
4682            *known_errorHandler = 1;
4683        else if (!strcmp(errors, "replace"))
4684            *known_errorHandler = 2;
4685        else if (!strcmp(errors, "ignore"))
4686            *known_errorHandler = 3;
4687        else if (!strcmp(errors, "xmlcharrefreplace"))
4688            *known_errorHandler = 4;
4689        else
4690            *known_errorHandler = 0;
4691    }
4692    switch (*known_errorHandler) {
4693    case 1: /* strict */
4694        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4695        return -1;
4696    case 2: /* replace */
4697        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4698            x = charmapencode_output('?', mapping, res, respos);
4699            if (x==enc_EXCEPTION) {
4700                return -1;
4701            }
4702            else if (x==enc_FAILED) {
4703                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4704                return -1;
4705            }
4706        }
4707        /* fall through */
4708    case 3: /* ignore */
4709        *inpos = collendpos;
4710        break;
4711    case 4: /* xmlcharrefreplace */
4712        /* generate replacement */
4713        for (collpos = collstartpos; collpos < collendpos;) {
4714            char buffer[2+29+1+1];
4715            char *cp;
4716            Py_UCS4 ch = p[collpos++];
4717#ifndef Py_UNICODE_WIDE
4718            if ((0xD800 <= ch && ch <= 0xDBFF) &&
4719                (collpos < collendpos) &&
4720                (0xDC00 <= p[collpos] && p[collpos] <= 0xDFFF)) {
4721                ch = ((((ch & 0x03FF) << 10) |
4722                       ((Py_UCS4)p[collpos++] & 0x03FF)) + 0x10000);
4723            }
4724#endif
4725            sprintf(buffer, "&#%d;", (int)ch);
4726            for (cp = buffer; *cp; ++cp) {
4727                x = charmapencode_output(*cp, mapping, res, respos);
4728                if (x==enc_EXCEPTION)
4729                    return -1;
4730                else if (x==enc_FAILED) {
4731                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4732                    return -1;
4733                }
4734            }
4735        }
4736        *inpos = collendpos;
4737        break;
4738    default:
4739        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4740                                                      encoding, reason, p, size, exceptionObject,
4741                                                      collstartpos, collendpos, &newpos);
4742        if (repunicode == NULL)
4743            return -1;
4744        /* generate replacement  */
4745        repsize = PyUnicode_GET_SIZE(repunicode);
4746        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4747            x = charmapencode_output(*uni2, mapping, res, respos);
4748            if (x==enc_EXCEPTION) {
4749                return -1;
4750            }
4751            else if (x==enc_FAILED) {
4752                Py_DECREF(repunicode);
4753                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4754                return -1;
4755            }
4756        }
4757        *inpos = newpos;
4758        Py_DECREF(repunicode);
4759    }
4760    return 0;
4761}
4762
4763PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4764                                  Py_ssize_t size,
4765                                  PyObject *mapping,
4766                                  const char *errors)
4767{
4768    /* output object */
4769    PyObject *res = NULL;
4770    /* current input position */
4771    Py_ssize_t inpos = 0;
4772    /* current output position */
4773    Py_ssize_t respos = 0;
4774    PyObject *errorHandler = NULL;
4775    PyObject *exc = NULL;
4776    /* the following variable is used for caching string comparisons
4777     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4778     * 3=ignore, 4=xmlcharrefreplace */
4779    int known_errorHandler = -1;
4780
4781    /* Default to Latin-1 */
4782    if (mapping == NULL)
4783        return PyUnicode_EncodeLatin1(p, size, errors);
4784
4785    /* allocate enough for a simple encoding without
4786       replacements, if we need more, we'll resize */
4787    res = PyString_FromStringAndSize(NULL, size);
4788    if (res == NULL)
4789        goto onError;
4790    if (size == 0)
4791        return res;
4792
4793    while (inpos<size) {
4794        /* try to encode it */
4795        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4796        if (x==enc_EXCEPTION) /* error */
4797            goto onError;
4798        if (x==enc_FAILED) { /* unencodable character */
4799            if (charmap_encoding_error(p, size, &inpos, mapping,
4800                                       &exc,
4801                                       &known_errorHandler, &errorHandler, errors,
4802                                       &res, &respos)) {
4803                goto onError;
4804            }
4805        }
4806        else
4807            /* done with this character => adjust input position */
4808            ++inpos;
4809    }
4810
4811    /* Resize if we allocated to much */
4812    if (respos<PyString_GET_SIZE(res)) {
4813        if (_PyString_Resize(&res, respos))
4814            goto onError;
4815    }
4816    Py_XDECREF(exc);
4817    Py_XDECREF(errorHandler);
4818    return res;
4819
4820  onError:
4821    Py_XDECREF(res);
4822    Py_XDECREF(exc);
4823    Py_XDECREF(errorHandler);
4824    return NULL;
4825}
4826
4827PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4828                                    PyObject *mapping)
4829{
4830    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4831        PyErr_BadArgument();
4832        return NULL;
4833    }
4834    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4835                                   PyUnicode_GET_SIZE(unicode),
4836                                   mapping,
4837                                   NULL);
4838}
4839
4840/* create or adjust a UnicodeTranslateError */
4841static void make_translate_exception(PyObject **exceptionObject,
4842                                     const Py_UNICODE *unicode, Py_ssize_t size,
4843                                     Py_ssize_t startpos, Py_ssize_t endpos,
4844                                     const char *reason)
4845{
4846    if (*exceptionObject == NULL) {
4847        *exceptionObject = PyUnicodeTranslateError_Create(
4848            unicode, size, startpos, endpos, reason);
4849    }
4850    else {
4851        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4852            goto onError;
4853        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4854            goto onError;
4855        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4856            goto onError;
4857        return;
4858      onError:
4859        Py_CLEAR(*exceptionObject);
4860    }
4861}
4862
4863/* raises a UnicodeTranslateError */
4864static void raise_translate_exception(PyObject **exceptionObject,
4865                                      const Py_UNICODE *unicode, Py_ssize_t size,
4866                                      Py_ssize_t startpos, Py_ssize_t endpos,
4867                                      const char *reason)
4868{
4869    make_translate_exception(exceptionObject,
4870                             unicode, size, startpos, endpos, reason);
4871    if (*exceptionObject != NULL)
4872        PyCodec_StrictErrors(*exceptionObject);
4873}
4874
4875/* error handling callback helper:
4876   build arguments, call the callback and check the arguments,
4877   put the result into newpos and return the replacement string, which
4878   has to be freed by the caller */
4879static PyObject *unicode_translate_call_errorhandler(const char *errors,
4880                                                     PyObject **errorHandler,
4881                                                     const char *reason,
4882                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4883                                                     Py_ssize_t startpos, Py_ssize_t endpos,
4884                                                     Py_ssize_t *newpos)
4885{
4886    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4887
4888    Py_ssize_t i_newpos;
4889    PyObject *restuple;
4890    PyObject *resunicode;
4891
4892    if (*errorHandler == NULL) {
4893        *errorHandler = PyCodec_LookupError(errors);
4894        if (*errorHandler == NULL)
4895            return NULL;
4896    }
4897
4898    make_translate_exception(exceptionObject,
4899                             unicode, size, startpos, endpos, reason);
4900    if (*exceptionObject == NULL)
4901        return NULL;
4902
4903    restuple = PyObject_CallFunctionObjArgs(
4904        *errorHandler, *exceptionObject, NULL);
4905    if (restuple == NULL)
4906        return NULL;
4907    if (!PyTuple_Check(restuple)) {
4908        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4909        Py_DECREF(restuple);
4910        return NULL;
4911    }
4912    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4913                          &resunicode, &i_newpos)) {
4914        Py_DECREF(restuple);
4915        return NULL;
4916    }
4917    if (i_newpos<0)
4918        *newpos = size+i_newpos;
4919    else
4920        *newpos = i_newpos;
4921    if (*newpos<0 || *newpos>size) {
4922        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4923        Py_DECREF(restuple);
4924        return NULL;
4925    }
4926    Py_INCREF(resunicode);
4927    Py_DECREF(restuple);
4928    return resunicode;
4929}
4930
4931/* Lookup the character ch in the mapping and put the result in result,
4932   which must be decrefed by the caller.
4933   Return 0 on success, -1 on error */
4934static
4935int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4936{
4937    PyObject *w = PyInt_FromLong((long)c);
4938    PyObject *x;
4939
4940    if (w == NULL)
4941        return -1;
4942    x = PyObject_GetItem(mapping, w);
4943    Py_DECREF(w);
4944    if (x == NULL) {
4945        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4946            /* No mapping found means: use 1:1 mapping. */
4947            PyErr_Clear();
4948            *result = NULL;
4949            return 0;
4950        } else
4951            return -1;
4952    }
4953    else if (x == Py_None) {
4954        *result = x;
4955        return 0;
4956    }
4957    else if (PyInt_Check(x)) {
4958        long value = PyInt_AS_LONG(x);
4959        long max = PyUnicode_GetMax();
4960        if (value < 0 || value > max) {
4961            PyErr_Format(PyExc_TypeError,
4962                         "character mapping must be in range(0x%lx)", max+1);
4963            Py_DECREF(x);
4964            return -1;
4965        }
4966        *result = x;
4967        return 0;
4968    }
4969    else if (PyUnicode_Check(x)) {
4970        *result = x;
4971        return 0;
4972    }
4973    else {
4974        /* wrong return value */
4975        PyErr_SetString(PyExc_TypeError,
4976                        "character mapping must return integer, None or unicode");
4977        Py_DECREF(x);
4978        return -1;
4979    }
4980}
4981/* ensure that *outobj is at least requiredsize characters long,
4982   if not reallocate and adjust various state variables.
4983   Return 0 on success, -1 on error */
4984static
4985int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4986                               Py_ssize_t requiredsize)
4987{
4988    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4989    if (requiredsize > oldsize) {
4990        /* remember old output position */
4991        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4992        /* exponentially overallocate to minimize reallocations */
4993        if (requiredsize < 2 * oldsize)
4994            requiredsize = 2 * oldsize;
4995        if (PyUnicode_Resize(outobj, requiredsize) < 0)
4996            return -1;
4997        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4998    }
4999    return 0;
5000}
5001/* lookup the character, put the result in the output string and adjust
5002   various state variables. Return a new reference to the object that
5003   was put in the output buffer in *result, or Py_None, if the mapping was
5004   undefined (in which case no character was written).
5005   The called must decref result.
5006   Return 0 on success, -1 on error. */
5007static
5008int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5009                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5010                            PyObject **res)
5011{
5012    if (charmaptranslate_lookup(*curinp, mapping, res))
5013        return -1;
5014    if (*res==NULL) {
5015        /* not found => default to 1:1 mapping */
5016        *(*outp)++ = *curinp;
5017    }
5018    else if (*res==Py_None)
5019        ;
5020    else if (PyInt_Check(*res)) {
5021        /* no overflow check, because we know that the space is enough */
5022        *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
5023    }
5024    else if (PyUnicode_Check(*res)) {
5025        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5026        if (repsize==1) {
5027            /* no overflow check, because we know that the space is enough */
5028            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5029        }
5030        else if (repsize!=0) {
5031            /* more than one character */
5032            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5033                (insize - (curinp-startinp)) +
5034                repsize - 1;
5035            if (charmaptranslate_makespace(outobj, outp, requiredsize))
5036                return -1;
5037            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5038            *outp += repsize;
5039        }
5040    }
5041    else
5042        return -1;
5043    return 0;
5044}
5045
/* Translate the size Py_UNICODE characters at p through mapping and
   return the result as a new Unicode object, or NULL with an exception
   set on error.

   Each character is looked up in mapping (see charmaptranslate_lookup):
   a missing entry copies the character unchanged, an integer or unicode
   entry is written in its place, and a None entry marks the character
   as untranslatable.  Consecutive untranslatable characters are
   collected into one run and handed to the error handler named by
   errors (NULL means "strict").

   mapping must not be NULL (a bad-argument error is raised otherwise). */
PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
                                     Py_ssize_t size,
                                     PyObject *mapping,
                                     const char *errors)
{
    /* output object */
    PyObject *res = NULL;
    /* pointers to the beginning and end+1 of input */
    const Py_UNICODE *startp = p;
    const Py_UNICODE *endp = p + size;
    /* pointer into the output */
    Py_UNICODE *str;
    /* current output position */
    Py_ssize_t respos = 0;
    char *reason = "character maps to <undefined>";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    /* the following variable is used for caching string comparisons
     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
     * 3=ignore, 4=xmlcharrefreplace */
    int known_errorHandler = -1;

    if (mapping == NULL) {
        PyErr_BadArgument();
        return NULL;
    }

    /* allocate enough for a simple 1:1 translation without
       replacements, if we need more, we'll resize */
    res = PyUnicode_FromUnicode(NULL, size);
    if (res == NULL)
        goto onError;
    if (size == 0)
        return res;
    str = PyUnicode_AS_UNICODE(res);

    while (p<endp) {
        /* try to encode it */
        PyObject *x = NULL;
        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
            Py_XDECREF(x);
            goto onError;
        }
        /* x is only compared against Py_None below, never dereferenced,
           so the reference can be dropped right away */
        Py_XDECREF(x);
        if (x!=Py_None) /* it worked => adjust input pointer */
            ++p;
        else { /* untranslatable character */
            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
            Py_ssize_t repsize;
            Py_ssize_t newpos;
            Py_UNICODE *uni2;
            /* startpos for collecting untranslatable chars */
            const Py_UNICODE *collstart = p;
            const Py_UNICODE *collend = p+1;
            const Py_UNICODE *coll;

            /* find all untranslatable characters */
            while (collend < endp) {
                if (charmaptranslate_lookup(*collend, mapping, &x))
                    goto onError;
                Py_XDECREF(x);
                if (x!=Py_None)
                    break;
                ++collend;
            }
            /* cache callback name lookup
             * (if not done yet, i.e. it's the first error) */
            if (known_errorHandler==-1) {
                if ((errors==NULL) || (!strcmp(errors, "strict")))
                    known_errorHandler = 1;
                else if (!strcmp(errors, "replace"))
                    known_errorHandler = 2;
                else if (!strcmp(errors, "ignore"))
                    known_errorHandler = 3;
                else if (!strcmp(errors, "xmlcharrefreplace"))
                    known_errorHandler = 4;
                else
                    known_errorHandler = 0;
            }
            switch (known_errorHandler) {
            case 1: /* strict */
                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
                goto onError;
            case 2: /* replace */
                /* No need to check for space, this is a 1:1 replacement */
                for (coll = collstart; coll<collend; ++coll)
                    *str++ = '?';
                /* fall through */
            case 3: /* ignore */
                p = collend;
                break;
            case 4: /* xmlcharrefreplace */
                /* generate replacement (temporarily (mis)uses p) */
                for (p = collstart; p < collend;) {
                    char buffer[2+29+1+1];
                    char *cp;
                    Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
                    sprintf(buffer, "&#%d;", (int)ch);
                    /* room needed: output so far + this reference +
                       the input that follows the collected run */
                    if (charmaptranslate_makespace(&res, &str,
                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
                        goto onError;
                    for (cp = buffer; *cp; ++cp)
                        *str++ = *cp;
                }
                p = collend;
                break;
            default:
                /* custom handler: returns the replacement string and the
                   input position (newpos) at which to continue */
                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
                                                                 reason, startp, size, &exc,
                                                                 collstart-startp, collend-startp, &newpos);
                if (repunicode == NULL)
                    goto onError;
                /* generate replacement  */
                repsize = PyUnicode_GET_SIZE(repunicode);
                if (charmaptranslate_makespace(&res, &str,
                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
                    Py_DECREF(repunicode);
                    goto onError;
                }
                /* the replacement is copied verbatim, it is not looked up
                   in the mapping again */
                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
                    *str++ = *uni2;
                p = startp + newpos;
                Py_DECREF(repunicode);
            }
        }
    }
    /* Resize if we allocated too much */
    respos = str-PyUnicode_AS_UNICODE(res);
    if (respos<PyUnicode_GET_SIZE(res)) {
        if (PyUnicode_Resize(&res, respos) < 0)
            goto onError;
    }
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return res;

  onError:
    Py_XDECREF(res);
    Py_XDECREF(exc);
    Py_XDECREF(errorHandler);
    return NULL;
}
5188
5189PyObject *PyUnicode_Translate(PyObject *str,
5190                              PyObject *mapping,
5191                              const char *errors)
5192{
5193    PyObject *result;
5194
5195    str = PyUnicode_FromObject(str);
5196    if (str == NULL)
5197        goto onError;
5198    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5199                                        PyUnicode_GET_SIZE(str),
5200                                        mapping,
5201                                        errors);
5202    Py_DECREF(str);
5203    return result;
5204
5205  onError:
5206    Py_XDECREF(str);
5207    return NULL;
5208}
5209
5210/* --- Decimal Encoder ---------------------------------------------------- */
5211
5212int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5213                            Py_ssize_t length,
5214                            char *output,
5215                            const char *errors)
5216{
5217    Py_UNICODE *p, *end;
5218    PyObject *errorHandler = NULL;
5219    PyObject *exc = NULL;
5220    const char *encoding = "decimal";
5221    const char *reason = "invalid decimal Unicode string";
5222    /* the following variable is used for caching string comparisons
5223     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5224    int known_errorHandler = -1;
5225
5226    if (output == NULL) {
5227        PyErr_BadArgument();
5228        return -1;
5229    }
5230
5231    p = s;
5232    end = s + length;
5233    while (p < end) {
5234        register Py_UNICODE ch = *p;
5235        int decimal;
5236        PyObject *repunicode;
5237        Py_ssize_t repsize;
5238        Py_ssize_t newpos;
5239        Py_UNICODE *uni2;
5240        Py_UNICODE *collstart;
5241        Py_UNICODE *collend;
5242
5243        if (Py_UNICODE_ISSPACE(ch)) {
5244            *output++ = ' ';
5245            ++p;
5246            continue;
5247        }
5248        decimal = Py_UNICODE_TODECIMAL(ch);
5249        if (decimal >= 0) {
5250            *output++ = '0' + decimal;
5251            ++p;
5252            continue;
5253        }
5254        if (0 < ch && ch < 256) {
5255            *output++ = (char)ch;
5256            ++p;
5257            continue;
5258        }
5259        /* All other characters are considered unencodable */
5260        collstart = p;
5261        for (collend = p+1; collend < end; collend++) {
5262            if ((0 < *collend && *collend < 256) ||
5263                Py_UNICODE_ISSPACE(*collend) ||
5264                0 <= Py_UNICODE_TODECIMAL(*collend))
5265                break;
5266        }
5267        /* cache callback name lookup
5268         * (if not done yet, i.e. it's the first error) */
5269        if (known_errorHandler==-1) {
5270            if ((errors==NULL) || (!strcmp(errors, "strict")))
5271                known_errorHandler = 1;
5272            else if (!strcmp(errors, "replace"))
5273                known_errorHandler = 2;
5274            else if (!strcmp(errors, "ignore"))
5275                known_errorHandler = 3;
5276            else if (!strcmp(errors, "xmlcharrefreplace"))
5277                known_errorHandler = 4;
5278            else
5279                known_errorHandler = 0;
5280        }
5281        switch (known_errorHandler) {
5282        case 1: /* strict */
5283            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5284            goto onError;
5285        case 2: /* replace */
5286            for (p = collstart; p < collend; ++p)
5287                *output++ = '?';
5288            /* fall through */
5289        case 3: /* ignore */
5290            p = collend;
5291            break;
5292        case 4: /* xmlcharrefreplace */
5293            /* generate replacement (temporarily (mis)uses p) */
5294            for (p = collstart; p < collend;) {
5295                Py_UCS4 ch = _Py_UNICODE_NEXT(p, collend);
5296                output += sprintf(output, "&#%d;", ch);
5297            }
5298            p = collend;
5299            break;
5300        default:
5301            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5302                                                          encoding, reason, s, length, &exc,
5303                                                          collstart-s, collend-s, &newpos);
5304            if (repunicode == NULL)
5305                goto onError;
5306            /* generate replacement  */
5307            repsize = PyUnicode_GET_SIZE(repunicode);
5308            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5309                Py_UNICODE ch = *uni2;
5310                if (Py_UNICODE_ISSPACE(ch))
5311                    *output++ = ' ';
5312                else {
5313                    decimal = Py_UNICODE_TODECIMAL(ch);
5314                    if (decimal >= 0)
5315                        *output++ = '0' + decimal;
5316                    else if (0 < ch && ch < 256)
5317                        *output++ = (char)ch;
5318                    else {
5319                        Py_DECREF(repunicode);
5320                        raise_encode_exception(&exc, encoding,
5321                                               s, length, collstart-s, collend-s, reason);
5322                        goto onError;
5323                    }
5324                }
5325            }
5326            p = s + newpos;
5327            Py_DECREF(repunicode);
5328        }
5329    }
5330    /* 0-terminate the output string */
5331    *output++ = '\0';
5332    Py_XDECREF(exc);
5333    Py_XDECREF(errorHandler);
5334    return 0;
5335
5336  onError:
5337    Py_XDECREF(exc);
5338    Py_XDECREF(errorHandler);
5339    return -1;
5340}
5341
5342/* --- Helpers ------------------------------------------------------------ */
5343
5344#include "stringlib/unicodedefs.h"
5345#include "stringlib/fastsearch.h"
5346
5347#include "stringlib/count.h"
5348#include "stringlib/find.h"
5349#include "stringlib/partition.h"
5350#include "stringlib/split.h"
5351
/* helper macro to fixup start/end slice values:
   end is clamped to len; a negative start or end has len added to it
   and is then clamped at 0.  start is not clamped against len.
   NOTE: the expansion is a sequence of bare if statements (it is not
   wrapped in do { } while (0)) and evaluates its arguments more than
   once; start and end must be assignable lvalues. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
5366
5367Py_ssize_t PyUnicode_Count(PyObject *str,
5368                           PyObject *substr,
5369                           Py_ssize_t start,
5370                           Py_ssize_t end)
5371{
5372    Py_ssize_t result;
5373    PyUnicodeObject* str_obj;
5374    PyUnicodeObject* sub_obj;
5375
5376    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5377    if (!str_obj)
5378        return -1;
5379    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5380    if (!sub_obj) {
5381        Py_DECREF(str_obj);
5382        return -1;
5383    }
5384
5385    ADJUST_INDICES(start, end, str_obj->length);
5386    result = stringlib_count(
5387        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5388        PY_SSIZE_T_MAX
5389        );
5390
5391    Py_DECREF(sub_obj);
5392    Py_DECREF(str_obj);
5393
5394    return result;
5395}
5396
5397Py_ssize_t PyUnicode_Find(PyObject *str,
5398                          PyObject *sub,
5399                          Py_ssize_t start,
5400                          Py_ssize_t end,
5401                          int direction)
5402{
5403    Py_ssize_t result;
5404
5405    str = PyUnicode_FromObject(str);
5406    if (!str)
5407        return -2;
5408    sub = PyUnicode_FromObject(sub);
5409    if (!sub) {
5410        Py_DECREF(str);
5411        return -2;
5412    }
5413
5414    if (direction > 0)
5415        result = stringlib_find_slice(
5416            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5417            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5418            start, end
5419            );
5420    else
5421        result = stringlib_rfind_slice(
5422            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5423            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5424            start, end
5425            );
5426
5427    Py_DECREF(str);
5428    Py_DECREF(sub);
5429
5430    return result;
5431}
5432
5433static
5434int tailmatch(PyUnicodeObject *self,
5435              PyUnicodeObject *substring,
5436              Py_ssize_t start,
5437              Py_ssize_t end,
5438              int direction)
5439{
5440    if (substring->length == 0)
5441        return 1;
5442
5443    ADJUST_INDICES(start, end, self->length);
5444    end -= substring->length;
5445    if (end < start)
5446        return 0;
5447
5448    if (direction > 0) {
5449        if (Py_UNICODE_MATCH(self, end, substring))
5450            return 1;
5451    } else {
5452        if (Py_UNICODE_MATCH(self, start, substring))
5453            return 1;
5454    }
5455
5456    return 0;
5457}
5458
5459Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5460                               PyObject *substr,
5461                               Py_ssize_t start,
5462                               Py_ssize_t end,
5463                               int direction)
5464{
5465    Py_ssize_t result;
5466
5467    str = PyUnicode_FromObject(str);
5468    if (str == NULL)
5469        return -1;
5470    substr = PyUnicode_FromObject(substr);
5471    if (substr == NULL) {
5472        Py_DECREF(str);
5473        return -1;
5474    }
5475
5476    result = tailmatch((PyUnicodeObject *)str,
5477                       (PyUnicodeObject *)substr,
5478                       start, end, direction);
5479    Py_DECREF(str);
5480    Py_DECREF(substr);
5481    return result;
5482}
5483
5484/* Apply fixfct filter to the Unicode object self and return a
5485   reference to the modified object */
5486
5487static
5488PyObject *fixup(PyUnicodeObject *self,
5489                int (*fixfct)(PyUnicodeObject *s))
5490{
5491
5492    PyUnicodeObject *u;
5493
5494    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5495    if (u == NULL)
5496        return NULL;
5497
5498    Py_UNICODE_COPY(u->str, self->str, self->length);
5499
5500    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5501        /* fixfct should return TRUE if it modified the buffer. If
5502           FALSE, return a reference to the original buffer instead
5503           (to save space, not time) */
5504        Py_INCREF(self);
5505        Py_DECREF(u);
5506        return (PyObject*) self;
5507    }
5508    return (PyObject*) u;
5509}
5510
5511static
5512int fixupper(PyUnicodeObject *self)
5513{
5514    Py_ssize_t len = self->length;
5515    Py_UNICODE *s = self->str;
5516    int status = 0;
5517
5518    while (len-- > 0) {
5519        register Py_UNICODE ch;
5520
5521        ch = Py_UNICODE_TOUPPER(*s);
5522        if (ch != *s) {
5523            status = 1;
5524            *s = ch;
5525        }
5526        s++;
5527    }
5528
5529    return status;
5530}
5531
5532static
5533int fixlower(PyUnicodeObject *self)
5534{
5535    Py_ssize_t len = self->length;
5536    Py_UNICODE *s = self->str;
5537    int status = 0;
5538
5539    while (len-- > 0) {
5540        register Py_UNICODE ch;
5541
5542        ch = Py_UNICODE_TOLOWER(*s);
5543        if (ch != *s) {
5544            status = 1;
5545            *s = ch;
5546        }
5547        s++;
5548    }
5549
5550    return status;
5551}
5552
5553static
5554int fixswapcase(PyUnicodeObject *self)
5555{
5556    Py_ssize_t len = self->length;
5557    Py_UNICODE *s = self->str;
5558    int status = 0;
5559
5560    while (len-- > 0) {
5561        if (Py_UNICODE_ISUPPER(*s)) {
5562            *s = Py_UNICODE_TOLOWER(*s);
5563            status = 1;
5564        } else if (Py_UNICODE_ISLOWER(*s)) {
5565            *s = Py_UNICODE_TOUPPER(*s);
5566            status = 1;
5567        }
5568        s++;
5569    }
5570
5571    return status;
5572}
5573
5574static
5575int fixcapitalize(PyUnicodeObject *self)
5576{
5577    Py_ssize_t len = self->length;
5578    Py_UNICODE *s = self->str;
5579    int status = 0;
5580
5581    if (len == 0)
5582        return 0;
5583    if (!Py_UNICODE_ISUPPER(*s)) {
5584        *s = Py_UNICODE_TOUPPER(*s);
5585        status = 1;
5586    }
5587    s++;
5588    while (--len > 0) {
5589        if (!Py_UNICODE_ISLOWER(*s)) {
5590            *s = Py_UNICODE_TOLOWER(*s);
5591            status = 1;
5592        }
5593        s++;
5594    }
5595    return status;
5596}
5597
5598static
5599int fixtitle(PyUnicodeObject *self)
5600{
5601    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5602    register Py_UNICODE *e;
5603    int previous_is_cased;
5604
5605    /* Shortcut for single character strings */
5606    if (PyUnicode_GET_SIZE(self) == 1) {
5607        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5608        if (*p != ch) {
5609            *p = ch;
5610            return 1;
5611        }
5612        else
5613            return 0;
5614    }
5615
5616    e = p + PyUnicode_GET_SIZE(self);
5617    previous_is_cased = 0;
5618    for (; p < e; p++) {
5619        register const Py_UNICODE ch = *p;
5620
5621        if (previous_is_cased)
5622            *p = Py_UNICODE_TOLOWER(ch);
5623        else
5624            *p = Py_UNICODE_TOTITLE(ch);
5625
5626        if (Py_UNICODE_ISLOWER(ch) ||
5627            Py_UNICODE_ISUPPER(ch) ||
5628            Py_UNICODE_ISTITLE(ch))
5629            previous_is_cased = 1;
5630        else
5631            previous_is_cased = 0;
5632    }
5633    return 1;
5634}
5635
5636PyObject *
5637PyUnicode_Join(PyObject *separator, PyObject *seq)
5638{
5639    PyObject *internal_separator = NULL;
5640    const Py_UNICODE blank = ' ';
5641    const Py_UNICODE *sep = &blank;
5642    Py_ssize_t seplen = 1;
5643    PyUnicodeObject *res = NULL; /* the result */
5644    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5645    Py_ssize_t res_used;         /* # used bytes */
5646    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5647    PyObject *fseq;          /* PySequence_Fast(seq) */
5648    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5649    PyObject *item;
5650    Py_ssize_t i;
5651
5652    fseq = PySequence_Fast(seq, "can only join an iterable");
5653    if (fseq == NULL) {
5654        return NULL;
5655    }
5656
5657    /* Grrrr.  A codec may be invoked to convert str objects to
5658     * Unicode, and so it's possible to call back into Python code
5659     * during PyUnicode_FromObject(), and so it's possible for a sick
5660     * codec to change the size of fseq (if seq is a list).  Therefore
5661     * we have to keep refetching the size -- can't assume seqlen
5662     * is invariant.
5663     */
5664    seqlen = PySequence_Fast_GET_SIZE(fseq);
5665    /* If empty sequence, return u"". */
5666    if (seqlen == 0) {
5667        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5668        goto Done;
5669    }
5670    /* If singleton sequence with an exact Unicode, return that. */
5671    if (seqlen == 1) {
5672        item = PySequence_Fast_GET_ITEM(fseq, 0);
5673        if (PyUnicode_CheckExact(item)) {
5674            Py_INCREF(item);
5675            res = (PyUnicodeObject *)item;
5676            goto Done;
5677        }
5678    }
5679
5680    /* At least two items to join, or one that isn't exact Unicode. */
5681    if (seqlen > 1) {
5682        /* Set up sep and seplen -- they're needed. */
5683        if (separator == NULL) {
5684            sep = &blank;
5685            seplen = 1;
5686        }
5687        else {
5688            internal_separator = PyUnicode_FromObject(separator);
5689            if (internal_separator == NULL)
5690                goto onError;
5691            sep = PyUnicode_AS_UNICODE(internal_separator);
5692            seplen = PyUnicode_GET_SIZE(internal_separator);
5693            /* In case PyUnicode_FromObject() mutated seq. */
5694            seqlen = PySequence_Fast_GET_SIZE(fseq);
5695        }
5696    }
5697
5698    /* Get space. */
5699    res = _PyUnicode_New(res_alloc);
5700    if (res == NULL)
5701        goto onError;
5702    res_p = PyUnicode_AS_UNICODE(res);
5703    res_used = 0;
5704
5705    for (i = 0; i < seqlen; ++i) {
5706        Py_ssize_t itemlen;
5707        Py_ssize_t new_res_used;
5708
5709        item = PySequence_Fast_GET_ITEM(fseq, i);
5710        /* Convert item to Unicode. */
5711        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5712            PyErr_Format(PyExc_TypeError,
5713                         "sequence item %zd: expected string or Unicode,"
5714                         " %.80s found",
5715                         i, Py_TYPE(item)->tp_name);
5716            goto onError;
5717        }
5718        item = PyUnicode_FromObject(item);
5719        if (item == NULL)
5720            goto onError;
5721        /* We own a reference to item from here on. */
5722
5723        /* In case PyUnicode_FromObject() mutated seq. */
5724        seqlen = PySequence_Fast_GET_SIZE(fseq);
5725
5726        /* Make sure we have enough space for the separator and the item. */
5727        itemlen = PyUnicode_GET_SIZE(item);
5728        new_res_used = res_used + itemlen;
5729        if (new_res_used < 0)
5730            goto Overflow;
5731        if (i < seqlen - 1) {
5732            new_res_used += seplen;
5733            if (new_res_used < 0)
5734                goto Overflow;
5735        }
5736        if (new_res_used > res_alloc) {
5737            /* double allocated size until it's big enough */
5738            do {
5739                res_alloc += res_alloc;
5740                if (res_alloc <= 0)
5741                    goto Overflow;
5742            } while (new_res_used > res_alloc);
5743            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5744                Py_DECREF(item);
5745                goto onError;
5746            }
5747            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5748        }
5749
5750        /* Copy item, and maybe the separator. */
5751        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5752        res_p += itemlen;
5753        if (i < seqlen - 1) {
5754            Py_UNICODE_COPY(res_p, sep, seplen);
5755            res_p += seplen;
5756        }
5757        Py_DECREF(item);
5758        res_used = new_res_used;
5759    }
5760
5761    /* Shrink res to match the used area; this probably can't fail,
5762     * but it's cheap to check.
5763     */
5764    if (_PyUnicode_Resize(&res, res_used) < 0)
5765        goto onError;
5766
5767  Done:
5768    Py_XDECREF(internal_separator);
5769    Py_DECREF(fseq);
5770    return (PyObject *)res;
5771
5772  Overflow:
5773    PyErr_SetString(PyExc_OverflowError,
5774                    "join() result is too long for a Python string");
5775    Py_DECREF(item);
5776    /* fall through */
5777
5778  onError:
5779    Py_XDECREF(internal_separator);
5780    Py_DECREF(fseq);
5781    Py_XDECREF(res);
5782    return NULL;
5783}
5784
5785static
5786PyUnicodeObject *pad(PyUnicodeObject *self,
5787                     Py_ssize_t left,
5788                     Py_ssize_t right,
5789                     Py_UNICODE fill)
5790{
5791    PyUnicodeObject *u;
5792
5793    if (left < 0)
5794        left = 0;
5795    if (right < 0)
5796        right = 0;
5797
5798    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5799        Py_INCREF(self);
5800        return self;
5801    }
5802
5803    if (left > PY_SSIZE_T_MAX - self->length ||
5804        right > PY_SSIZE_T_MAX - (left + self->length)) {
5805        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5806        return NULL;
5807    }
5808    u = _PyUnicode_New(left + self->length + right);
5809    if (u) {
5810        if (left)
5811            Py_UNICODE_FILL(u->str, fill, left);
5812        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5813        if (right)
5814            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5815    }
5816
5817    return u;
5818}
5819
5820PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5821{
5822    PyObject *list;
5823
5824    string = PyUnicode_FromObject(string);
5825    if (string == NULL)
5826        return NULL;
5827
5828    list = stringlib_splitlines(
5829        (PyObject*) string, PyUnicode_AS_UNICODE(string),
5830        PyUnicode_GET_SIZE(string), keepends);
5831
5832    Py_DECREF(string);
5833    return list;
5834}
5835
5836static
5837PyObject *split(PyUnicodeObject *self,
5838                PyUnicodeObject *substring,
5839                Py_ssize_t maxcount)
5840{
5841    if (maxcount < 0)
5842        maxcount = PY_SSIZE_T_MAX;
5843
5844    if (substring == NULL)
5845        return stringlib_split_whitespace(
5846            (PyObject*) self,  self->str, self->length, maxcount
5847            );
5848
5849    return stringlib_split(
5850        (PyObject*) self,  self->str, self->length,
5851        substring->str, substring->length,
5852        maxcount
5853        );
5854}
5855
5856static
5857PyObject *rsplit(PyUnicodeObject *self,
5858                 PyUnicodeObject *substring,
5859                 Py_ssize_t maxcount)
5860{
5861    if (maxcount < 0)
5862        maxcount = PY_SSIZE_T_MAX;
5863
5864    if (substring == NULL)
5865        return stringlib_rsplit_whitespace(
5866            (PyObject*) self,  self->str, self->length, maxcount
5867            );
5868
5869    return stringlib_rsplit(
5870        (PyObject*) self,  self->str, self->length,
5871        substring->str, substring->length,
5872        maxcount
5873        );
5874}
5875
5876static
5877PyObject *replace(PyUnicodeObject *self,
5878                  PyUnicodeObject *str1,
5879                  PyUnicodeObject *str2,
5880                  Py_ssize_t maxcount)
5881{
5882    PyUnicodeObject *u;
5883
5884    if (maxcount < 0)
5885        maxcount = PY_SSIZE_T_MAX;
5886    else if (maxcount == 0 || self->length == 0)
5887        goto nothing;
5888
5889    if (str1->length == str2->length) {
5890        Py_ssize_t i;
5891        /* same length */
5892        if (str1->length == 0)
5893            goto nothing;
5894        if (str1->length == 1) {
5895            /* replace characters */
5896            Py_UNICODE u1, u2;
5897            if (!findchar(self->str, self->length, str1->str[0]))
5898                goto nothing;
5899            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5900            if (!u)
5901                return NULL;
5902            Py_UNICODE_COPY(u->str, self->str, self->length);
5903            u1 = str1->str[0];
5904            u2 = str2->str[0];
5905            for (i = 0; i < u->length; i++)
5906                if (u->str[i] == u1) {
5907                    if (--maxcount < 0)
5908                        break;
5909                    u->str[i] = u2;
5910                }
5911        } else {
5912            i = stringlib_find(
5913                self->str, self->length, str1->str, str1->length, 0
5914                );
5915            if (i < 0)
5916                goto nothing;
5917            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5918            if (!u)
5919                return NULL;
5920            Py_UNICODE_COPY(u->str, self->str, self->length);
5921
5922            /* change everything in-place, starting with this one */
5923            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5924            i += str1->length;
5925
5926            while ( --maxcount > 0) {
5927                i = stringlib_find(self->str+i, self->length-i,
5928                                   str1->str, str1->length,
5929                                   i);
5930                if (i == -1)
5931                    break;
5932                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5933                i += str1->length;
5934            }
5935        }
5936    } else {
5937
5938        Py_ssize_t n, i, j;
5939        Py_ssize_t product, new_size, delta;
5940        Py_UNICODE *p;
5941
5942        /* replace strings */
5943        n = stringlib_count(self->str, self->length, str1->str, str1->length,
5944                            maxcount);
5945        if (n == 0)
5946            goto nothing;
5947        /* new_size = self->length + n * (str2->length - str1->length)); */
5948        delta = (str2->length - str1->length);
5949        if (delta == 0) {
5950            new_size = self->length;
5951        } else {
5952            product = n * (str2->length - str1->length);
5953            if ((product / (str2->length - str1->length)) != n) {
5954                PyErr_SetString(PyExc_OverflowError,
5955                                "replace string is too long");
5956                return NULL;
5957            }
5958            new_size = self->length + product;
5959            if (new_size < 0) {
5960                PyErr_SetString(PyExc_OverflowError,
5961                                "replace string is too long");
5962                return NULL;
5963            }
5964        }
5965        u = _PyUnicode_New(new_size);
5966        if (!u)
5967            return NULL;
5968        i = 0;
5969        p = u->str;
5970        if (str1->length > 0) {
5971            while (n-- > 0) {
5972                /* look for next match */
5973                j = stringlib_find(self->str+i, self->length-i,
5974                                   str1->str, str1->length,
5975                                   i);
5976                if (j == -1)
5977                    break;
5978                else if (j > i) {
5979                    /* copy unchanged part [i:j] */
5980                    Py_UNICODE_COPY(p, self->str+i, j-i);
5981                    p += j - i;
5982                }
5983                /* copy substitution string */
5984                if (str2->length > 0) {
5985                    Py_UNICODE_COPY(p, str2->str, str2->length);
5986                    p += str2->length;
5987                }
5988                i = j + str1->length;
5989            }
5990            if (i < self->length)
5991                /* copy tail [i:] */
5992                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5993        } else {
5994            /* interleave */
5995            while (n > 0) {
5996                Py_UNICODE_COPY(p, str2->str, str2->length);
5997                p += str2->length;
5998                if (--n <= 0)
5999                    break;
6000                *p++ = self->str[i++];
6001            }
6002            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6003        }
6004    }
6005    return (PyObject *) u;
6006
6007  nothing:
6008    /* nothing to replace; return original string (when possible) */
6009    if (PyUnicode_CheckExact(self)) {
6010        Py_INCREF(self);
6011        return (PyObject *) self;
6012    }
6013    return PyUnicode_FromUnicode(self->str, self->length);
6014}
6015
6016/* --- Unicode Object Methods --------------------------------------------- */
6017
6018PyDoc_STRVAR(title__doc__,
6019             "S.title() -> unicode\n\
6020\n\
6021Return a titlecased version of S, i.e. words start with title case\n\
6022characters, all remaining cased characters have lower case.");
6023
6024static PyObject*
6025unicode_title(PyUnicodeObject *self)
6026{
6027    return fixup(self, fixtitle);
6028}
6029
6030PyDoc_STRVAR(capitalize__doc__,
6031             "S.capitalize() -> unicode\n\
6032\n\
6033Return a capitalized version of S, i.e. make the first character\n\
6034have upper case and the rest lower case.");
6035
6036static PyObject*
6037unicode_capitalize(PyUnicodeObject *self)
6038{
6039    return fixup(self, fixcapitalize);
6040}
6041
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Disabled: split self on whitespace, capitalize each word through
   fixup()/fixcapitalize, and join the words again with single blanks. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, replacing the list entries in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
6079
6080/* Argument converter.  Coerces to a single unicode character */
6081
6082static int
6083convert_uc(PyObject *obj, void *addr)
6084{
6085    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6086    PyObject *uniobj;
6087    Py_UNICODE *unistr;
6088
6089    uniobj = PyUnicode_FromObject(obj);
6090    if (uniobj == NULL) {
6091        PyErr_SetString(PyExc_TypeError,
6092                        "The fill character cannot be converted to Unicode");
6093        return 0;
6094    }
6095    if (PyUnicode_GET_SIZE(uniobj) != 1) {
6096        PyErr_SetString(PyExc_TypeError,
6097                        "The fill character must be exactly one character long");
6098        Py_DECREF(uniobj);
6099        return 0;
6100    }
6101    unistr = PyUnicode_AS_UNICODE(uniobj);
6102    *fillcharloc = unistr[0];
6103    Py_DECREF(uniobj);
6104    return 1;
6105}
6106
6107PyDoc_STRVAR(center__doc__,
6108             "S.center(width[, fillchar]) -> unicode\n\
6109\n\
6110Return S centered in a Unicode string of length width. Padding is\n\
6111done using the specified fill character (default is a space)");
6112
6113static PyObject *
6114unicode_center(PyUnicodeObject *self, PyObject *args)
6115{
6116    Py_ssize_t marg, left;
6117    Py_ssize_t width;
6118    Py_UNICODE fillchar = ' ';
6119
6120    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6121        return NULL;
6122
6123    if (self->length >= width && PyUnicode_CheckExact(self)) {
6124        Py_INCREF(self);
6125        return (PyObject*) self;
6126    }
6127
6128    marg = width - self->length;
6129    left = marg / 2 + (marg & width & 1);
6130
6131    return (PyObject*) pad(self, left, marg - left, fillchar);
6132}
6133
6134#if 0
6135
6136/* This code should go into some future Unicode collation support
6137   module. The basic comparison should compare ordinals on a naive
6138   basis (this is what Java does and thus Jython too). */
6139
6140/* speedy UTF-16 code point order comparison */
6141/* gleaned from: */
6142/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6143
6144static short utf16Fixup[32] =
6145{
6146    0, 0, 0, 0, 0, 0, 0, 0,
6147    0, 0, 0, 0, 0, 0, 0, 0,
6148    0, 0, 0, 0, 0, 0, 0, 0,
6149    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6150};
6151
6152static int
6153unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6154{
6155    Py_ssize_t len1, len2;
6156
6157    Py_UNICODE *s1 = str1->str;
6158    Py_UNICODE *s2 = str2->str;
6159
6160    len1 = str1->length;
6161    len2 = str2->length;
6162
6163    while (len1 > 0 && len2 > 0) {
6164        Py_UNICODE c1, c2;
6165
6166        c1 = *s1++;
6167        c2 = *s2++;
6168
6169        if (c1 > (1<<11) * 26)
6170            c1 += utf16Fixup[c1>>11];
6171        if (c2 > (1<<11) * 26)
6172            c2 += utf16Fixup[c2>>11];
6173        /* now c1 and c2 are in UTF-32-compatible order */
6174
6175        if (c1 != c2)
6176            return (c1 < c2) ? -1 : 1;
6177
6178        len1--; len2--;
6179    }
6180
6181    return (len1 < len2) ? -1 : (len1 != len2);
6182}
6183
6184#else
6185
6186static int
6187unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6188{
6189    register Py_ssize_t len1, len2;
6190
6191    Py_UNICODE *s1 = str1->str;
6192    Py_UNICODE *s2 = str2->str;
6193
6194    len1 = str1->length;
6195    len2 = str2->length;
6196
6197    while (len1 > 0 && len2 > 0) {
6198        Py_UNICODE c1, c2;
6199
6200        c1 = *s1++;
6201        c2 = *s2++;
6202
6203        if (c1 != c2)
6204            return (c1 < c2) ? -1 : 1;
6205
6206        len1--; len2--;
6207    }
6208
6209    return (len1 < len2) ? -1 : (len1 != len2);
6210}
6211
6212#endif
6213
6214int PyUnicode_Compare(PyObject *left,
6215                      PyObject *right)
6216{
6217    PyUnicodeObject *u = NULL, *v = NULL;
6218    int result;
6219
6220    /* Coerce the two arguments */
6221    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6222    if (u == NULL)
6223        goto onError;
6224    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6225    if (v == NULL)
6226        goto onError;
6227
6228    /* Shortcut for empty or interned objects */
6229    if (v == u) {
6230        Py_DECREF(u);
6231        Py_DECREF(v);
6232        return 0;
6233    }
6234
6235    result = unicode_compare(u, v);
6236
6237    Py_DECREF(u);
6238    Py_DECREF(v);
6239    return result;
6240
6241  onError:
6242    Py_XDECREF(u);
6243    Py_XDECREF(v);
6244    return -1;
6245}
6246
6247PyObject *PyUnicode_RichCompare(PyObject *left,
6248                                PyObject *right,
6249                                int op)
6250{
6251    int result;
6252
6253    result = PyUnicode_Compare(left, right);
6254    if (result == -1 && PyErr_Occurred())
6255        goto onError;
6256
6257    /* Convert the return value to a Boolean */
6258    switch (op) {
6259    case Py_EQ:
6260        result = (result == 0);
6261        break;
6262    case Py_NE:
6263        result = (result != 0);
6264        break;
6265    case Py_LE:
6266        result = (result <= 0);
6267        break;
6268    case Py_GE:
6269        result = (result >= 0);
6270        break;
6271    case Py_LT:
6272        result = (result == -1);
6273        break;
6274    case Py_GT:
6275        result = (result == 1);
6276        break;
6277    }
6278    return PyBool_FromLong(result);
6279
6280  onError:
6281
6282    /* Standard case
6283
6284       Type errors mean that PyUnicode_FromObject() could not convert
6285       one of the arguments (usually the right hand side) to Unicode,
6286       ie. we can't handle the comparison request. However, it is
6287       possible that the other object knows a comparison method, which
6288       is why we return Py_NotImplemented to give the other object a
6289       chance.
6290
6291    */
6292    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6293        PyErr_Clear();
6294        Py_INCREF(Py_NotImplemented);
6295        return Py_NotImplemented;
6296    }
6297    if (op != Py_EQ && op != Py_NE)
6298        return NULL;
6299
6300    /* Equality comparison.
6301
6302       This is a special case: we silence any PyExc_UnicodeDecodeError
6303       and instead turn it into a PyErr_UnicodeWarning.
6304
6305    */
6306    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6307        return NULL;
6308    PyErr_Clear();
6309    if (PyErr_Warn(PyExc_UnicodeWarning,
6310                   (op == Py_EQ) ?
6311                   "Unicode equal comparison "
6312                   "failed to convert both arguments to Unicode - "
6313                   "interpreting them as being unequal" :
6314                   "Unicode unequal comparison "
6315                   "failed to convert both arguments to Unicode - "
6316                   "interpreting them as being unequal"
6317            ) < 0)
6318        return NULL;
6319    result = (op == Py_NE);
6320    return PyBool_FromLong(result);
6321}
6322
6323int PyUnicode_Contains(PyObject *container,
6324                       PyObject *element)
6325{
6326    PyObject *str, *sub;
6327    int result;
6328
6329    /* Coerce the two arguments */
6330    sub = PyUnicode_FromObject(element);
6331    if (!sub) {
6332        return -1;
6333    }
6334
6335    str = PyUnicode_FromObject(container);
6336    if (!str) {
6337        Py_DECREF(sub);
6338        return -1;
6339    }
6340
6341    result = stringlib_contains_obj(str, sub);
6342
6343    Py_DECREF(str);
6344    Py_DECREF(sub);
6345
6346    return result;
6347}
6348
6349/* Concat to string or Unicode object giving a new Unicode object. */
6350
6351PyObject *PyUnicode_Concat(PyObject *left,
6352                           PyObject *right)
6353{
6354    PyUnicodeObject *u = NULL, *v = NULL, *w;
6355
6356    /* Coerce the two arguments */
6357    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6358    if (u == NULL)
6359        goto onError;
6360    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6361    if (v == NULL)
6362        goto onError;
6363
6364    /* Shortcuts */
6365    if (v == unicode_empty) {
6366        Py_DECREF(v);
6367        return (PyObject *)u;
6368    }
6369    if (u == unicode_empty) {
6370        Py_DECREF(u);
6371        return (PyObject *)v;
6372    }
6373
6374    /* Concat the two Unicode strings */
6375    w = _PyUnicode_New(u->length + v->length);
6376    if (w == NULL)
6377        goto onError;
6378    Py_UNICODE_COPY(w->str, u->str, u->length);
6379    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6380
6381    Py_DECREF(u);
6382    Py_DECREF(v);
6383    return (PyObject *)w;
6384
6385  onError:
6386    Py_XDECREF(u);
6387    Py_XDECREF(v);
6388    return NULL;
6389}
6390
6391PyDoc_STRVAR(count__doc__,
6392             "S.count(sub[, start[, end]]) -> int\n\
6393\n\
6394Return the number of non-overlapping occurrences of substring sub in\n\
6395Unicode string S[start:end].  Optional arguments start and end are\n\
6396interpreted as in slice notation.");
6397
6398static PyObject *
6399unicode_count(PyUnicodeObject *self, PyObject *args)
6400{
6401    PyUnicodeObject *substring;
6402    Py_ssize_t start = 0;
6403    Py_ssize_t end = PY_SSIZE_T_MAX;
6404    PyObject *result;
6405
6406    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6407                                            &start, &end))
6408        return NULL;
6409
6410    ADJUST_INDICES(start, end, self->length);
6411    result = PyInt_FromSsize_t(
6412        stringlib_count(self->str + start, end - start,
6413                        substring->str, substring->length,
6414                        PY_SSIZE_T_MAX)
6415        );
6416
6417    Py_DECREF(substring);
6418
6419    return result;
6420}
6421
6422PyDoc_STRVAR(encode__doc__,
6423             "S.encode([encoding[,errors]]) -> string or unicode\n\
6424\n\
6425Encodes S using the codec registered for encoding. encoding defaults\n\
6426to the default encoding. errors may be given to set a different error\n\
6427handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6428a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6429'xmlcharrefreplace' as well as any other name registered with\n\
6430codecs.register_error that can handle UnicodeEncodeErrors.");
6431
6432static PyObject *
6433unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6434{
6435    static char *kwlist[] = {"encoding", "errors", 0};
6436    char *encoding = NULL;
6437    char *errors = NULL;
6438    PyObject *v;
6439
6440    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6441                                     kwlist, &encoding, &errors))
6442        return NULL;
6443    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6444    if (v == NULL)
6445        goto onError;
6446    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6447        PyErr_Format(PyExc_TypeError,
6448                     "encoder did not return a string/unicode object "
6449                     "(type=%.400s)",
6450                     Py_TYPE(v)->tp_name);
6451        Py_DECREF(v);
6452        return NULL;
6453    }
6454    return v;
6455
6456  onError:
6457    return NULL;
6458}
6459
6460PyDoc_STRVAR(decode__doc__,
6461             "S.decode([encoding[,errors]]) -> string or unicode\n\
6462\n\
6463Decodes S using the codec registered for encoding. encoding defaults\n\
6464to the default encoding. errors may be given to set a different error\n\
6465handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6466a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6467as well as any other name registered with codecs.register_error that is\n\
6468able to handle UnicodeDecodeErrors.");
6469
6470static PyObject *
6471unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6472{
6473    static char *kwlist[] = {"encoding", "errors", 0};
6474    char *encoding = NULL;
6475    char *errors = NULL;
6476    PyObject *v;
6477
6478    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6479                                     kwlist, &encoding, &errors))
6480        return NULL;
6481    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6482    if (v == NULL)
6483        goto onError;
6484    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6485        PyErr_Format(PyExc_TypeError,
6486                     "decoder did not return a string/unicode object "
6487                     "(type=%.400s)",
6488                     Py_TYPE(v)->tp_name);
6489        Py_DECREF(v);
6490        return NULL;
6491    }
6492    return v;
6493
6494  onError:
6495    return NULL;
6496}
6497
6498PyDoc_STRVAR(expandtabs__doc__,
6499             "S.expandtabs([tabsize]) -> unicode\n\
6500\n\
6501Return a copy of S where all tab characters are expanded using spaces.\n\
6502If tabsize is not given, a tab size of 8 characters is assumed.");
6503
6504static PyObject*
6505unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6506{
6507    Py_UNICODE *e;
6508    Py_UNICODE *p;
6509    Py_UNICODE *q;
6510    Py_UNICODE *qe;
6511    Py_ssize_t i, j, incr;
6512    PyUnicodeObject *u;
6513    int tabsize = 8;
6514
6515    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6516        return NULL;
6517
6518    /* First pass: determine size of output string */
6519    i = 0; /* chars up to and including most recent \n or \r */
6520    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6521    e = self->str + self->length; /* end of input */
6522    for (p = self->str; p < e; p++)
6523        if (*p == '\t') {
6524            if (tabsize > 0) {
6525                incr = tabsize - (j % tabsize); /* cannot overflow */
6526                if (j > PY_SSIZE_T_MAX - incr)
6527                    goto overflow1;
6528                j += incr;
6529            }
6530        }
6531        else {
6532            if (j > PY_SSIZE_T_MAX - 1)
6533                goto overflow1;
6534            j++;
6535            if (*p == '\n' || *p == '\r') {
6536                if (i > PY_SSIZE_T_MAX - j)
6537                    goto overflow1;
6538                i += j;
6539                j = 0;
6540            }
6541        }
6542
6543    if (i > PY_SSIZE_T_MAX - j)
6544        goto overflow1;
6545
6546    /* Second pass: create output string and fill it */
6547    u = _PyUnicode_New(i + j);
6548    if (!u)
6549        return NULL;
6550
6551    j = 0; /* same as in first pass */
6552    q = u->str; /* next output char */
6553    qe = u->str + u->length; /* end of output */
6554
6555    for (p = self->str; p < e; p++)
6556        if (*p == '\t') {
6557            if (tabsize > 0) {
6558                i = tabsize - (j % tabsize);
6559                j += i;
6560                while (i--) {
6561                    if (q >= qe)
6562                        goto overflow2;
6563                    *q++ = ' ';
6564                }
6565            }
6566        }
6567        else {
6568            if (q >= qe)
6569                goto overflow2;
6570            *q++ = *p;
6571            j++;
6572            if (*p == '\n' || *p == '\r')
6573                j = 0;
6574        }
6575
6576    return (PyObject*) u;
6577
6578  overflow2:
6579    Py_DECREF(u);
6580  overflow1:
6581    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6582    return NULL;
6583}
6584
6585PyDoc_STRVAR(find__doc__,
6586             "S.find(sub [,start [,end]]) -> int\n\
6587\n\
6588Return the lowest index in S where substring sub is found,\n\
6589such that sub is contained within S[start:end].  Optional\n\
6590arguments start and end are interpreted as in slice notation.\n\
6591\n\
6592Return -1 on failure.");
6593
6594static PyObject *
6595unicode_find(PyUnicodeObject *self, PyObject *args)
6596{
6597    PyUnicodeObject *substring;
6598    Py_ssize_t start;
6599    Py_ssize_t end;
6600    Py_ssize_t result;
6601
6602    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6603                                            &start, &end))
6604        return NULL;
6605
6606    result = stringlib_find_slice(
6607        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6608        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6609        start, end
6610        );
6611
6612    Py_DECREF(substring);
6613
6614    return PyInt_FromSsize_t(result);
6615}
6616
6617static PyObject *
6618unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6619{
6620    if (index < 0 || index >= self->length) {
6621        PyErr_SetString(PyExc_IndexError, "string index out of range");
6622        return NULL;
6623    }
6624
6625    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6626}
6627
6628static long
6629unicode_hash(PyUnicodeObject *self)
6630{
6631    /* Since Unicode objects compare equal to their ASCII string
6632       counterparts, they should use the individual character values
6633       as basis for their hash value.  This is needed to assure that
6634       strings and Unicode objects behave in the same way as
6635       dictionary keys. */
6636
6637    register Py_ssize_t len;
6638    register Py_UNICODE *p;
6639    register long x;
6640
6641#ifdef Py_DEBUG
6642    assert(_Py_HashSecret_Initialized);
6643#endif
6644    if (self->hash != -1)
6645        return self->hash;
6646    len = PyUnicode_GET_SIZE(self);
6647    /*
6648      We make the hash of the empty string be 0, rather than using
6649      (prefix ^ suffix), since this slightly obfuscates the hash secret
6650    */
6651    if (len == 0) {
6652        self->hash = 0;
6653        return 0;
6654    }
6655    p = PyUnicode_AS_UNICODE(self);
6656    x = _Py_HashSecret.prefix;
6657    x ^= *p << 7;
6658    while (--len >= 0)
6659        x = (1000003*x) ^ *p++;
6660    x ^= PyUnicode_GET_SIZE(self);
6661    x ^= _Py_HashSecret.suffix;
6662    if (x == -1)
6663        x = -2;
6664    self->hash = x;
6665    return x;
6666}
6667
6668PyDoc_STRVAR(index__doc__,
6669             "S.index(sub [,start [,end]]) -> int\n\
6670\n\
6671Like S.find() but raise ValueError when the substring is not found.");
6672
6673static PyObject *
6674unicode_index(PyUnicodeObject *self, PyObject *args)
6675{
6676    Py_ssize_t result;
6677    PyUnicodeObject *substring;
6678    Py_ssize_t start;
6679    Py_ssize_t end;
6680
6681    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6682                                            &start, &end))
6683        return NULL;
6684
6685    result = stringlib_find_slice(
6686        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6687        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6688        start, end
6689        );
6690
6691    Py_DECREF(substring);
6692
6693    if (result < 0) {
6694        PyErr_SetString(PyExc_ValueError, "substring not found");
6695        return NULL;
6696    }
6697
6698    return PyInt_FromSsize_t(result);
6699}
6700
6701PyDoc_STRVAR(islower__doc__,
6702             "S.islower() -> bool\n\
6703\n\
6704Return True if all cased characters in S are lowercase and there is\n\
6705at least one cased character in S, False otherwise.");
6706
6707static PyObject*
6708unicode_islower(PyUnicodeObject *self)
6709{
6710    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6711    register const Py_UNICODE *e;
6712    int cased;
6713
6714    /* Shortcut for single character strings */
6715    if (PyUnicode_GET_SIZE(self) == 1)
6716        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6717
6718    /* Special case for empty strings */
6719    if (PyUnicode_GET_SIZE(self) == 0)
6720        return PyBool_FromLong(0);
6721
6722    e = p + PyUnicode_GET_SIZE(self);
6723    cased = 0;
6724    for (; p < e; p++) {
6725        register const Py_UNICODE ch = *p;
6726
6727        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6728            return PyBool_FromLong(0);
6729        else if (!cased && Py_UNICODE_ISLOWER(ch))
6730            cased = 1;
6731    }
6732    return PyBool_FromLong(cased);
6733}
6734
6735PyDoc_STRVAR(isupper__doc__,
6736             "S.isupper() -> bool\n\
6737\n\
6738Return True if all cased characters in S are uppercase and there is\n\
6739at least one cased character in S, False otherwise.");
6740
6741static PyObject*
6742unicode_isupper(PyUnicodeObject *self)
6743{
6744    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6745    register const Py_UNICODE *e;
6746    int cased;
6747
6748    /* Shortcut for single character strings */
6749    if (PyUnicode_GET_SIZE(self) == 1)
6750        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6751
6752    /* Special case for empty strings */
6753    if (PyUnicode_GET_SIZE(self) == 0)
6754        return PyBool_FromLong(0);
6755
6756    e = p + PyUnicode_GET_SIZE(self);
6757    cased = 0;
6758    for (; p < e; p++) {
6759        register const Py_UNICODE ch = *p;
6760
6761        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6762            return PyBool_FromLong(0);
6763        else if (!cased && Py_UNICODE_ISUPPER(ch))
6764            cased = 1;
6765    }
6766    return PyBool_FromLong(cased);
6767}
6768
6769PyDoc_STRVAR(istitle__doc__,
6770             "S.istitle() -> bool\n\
6771\n\
6772Return True if S is a titlecased string and there is at least one\n\
6773character in S, i.e. upper- and titlecase characters may only\n\
6774follow uncased characters and lowercase characters only cased ones.\n\
6775Return False otherwise.");
6776
6777static PyObject*
6778unicode_istitle(PyUnicodeObject *self)
6779{
6780    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6781    register const Py_UNICODE *e;
6782    int cased, previous_is_cased;
6783
6784    /* Shortcut for single character strings */
6785    if (PyUnicode_GET_SIZE(self) == 1)
6786        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6787                               (Py_UNICODE_ISUPPER(*p) != 0));
6788
6789    /* Special case for empty strings */
6790    if (PyUnicode_GET_SIZE(self) == 0)
6791        return PyBool_FromLong(0);
6792
6793    e = p + PyUnicode_GET_SIZE(self);
6794    cased = 0;
6795    previous_is_cased = 0;
6796    for (; p < e; p++) {
6797        register const Py_UNICODE ch = *p;
6798
6799        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6800            if (previous_is_cased)
6801                return PyBool_FromLong(0);
6802            previous_is_cased = 1;
6803            cased = 1;
6804        }
6805        else if (Py_UNICODE_ISLOWER(ch)) {
6806            if (!previous_is_cased)
6807                return PyBool_FromLong(0);
6808            previous_is_cased = 1;
6809            cased = 1;
6810        }
6811        else
6812            previous_is_cased = 0;
6813    }
6814    return PyBool_FromLong(cased);
6815}
6816
6817PyDoc_STRVAR(isspace__doc__,
6818             "S.isspace() -> bool\n\
6819\n\
6820Return True if all characters in S are whitespace\n\
6821and there is at least one character in S, False otherwise.");
6822
6823static PyObject*
6824unicode_isspace(PyUnicodeObject *self)
6825{
6826    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6827    register const Py_UNICODE *e;
6828
6829    /* Shortcut for single character strings */
6830    if (PyUnicode_GET_SIZE(self) == 1 &&
6831        Py_UNICODE_ISSPACE(*p))
6832        return PyBool_FromLong(1);
6833
6834    /* Special case for empty strings */
6835    if (PyUnicode_GET_SIZE(self) == 0)
6836        return PyBool_FromLong(0);
6837
6838    e = p + PyUnicode_GET_SIZE(self);
6839    for (; p < e; p++) {
6840        if (!Py_UNICODE_ISSPACE(*p))
6841            return PyBool_FromLong(0);
6842    }
6843    return PyBool_FromLong(1);
6844}
6845
6846PyDoc_STRVAR(isalpha__doc__,
6847             "S.isalpha() -> bool\n\
6848\n\
6849Return True if all characters in S are alphabetic\n\
6850and there is at least one character in S, False otherwise.");
6851
6852static PyObject*
6853unicode_isalpha(PyUnicodeObject *self)
6854{
6855    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6856    register const Py_UNICODE *e;
6857
6858    /* Shortcut for single character strings */
6859    if (PyUnicode_GET_SIZE(self) == 1 &&
6860        Py_UNICODE_ISALPHA(*p))
6861        return PyBool_FromLong(1);
6862
6863    /* Special case for empty strings */
6864    if (PyUnicode_GET_SIZE(self) == 0)
6865        return PyBool_FromLong(0);
6866
6867    e = p + PyUnicode_GET_SIZE(self);
6868    for (; p < e; p++) {
6869        if (!Py_UNICODE_ISALPHA(*p))
6870            return PyBool_FromLong(0);
6871    }
6872    return PyBool_FromLong(1);
6873}
6874
6875PyDoc_STRVAR(isalnum__doc__,
6876             "S.isalnum() -> bool\n\
6877\n\
6878Return True if all characters in S are alphanumeric\n\
6879and there is at least one character in S, False otherwise.");
6880
6881static PyObject*
6882unicode_isalnum(PyUnicodeObject *self)
6883{
6884    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6885    register const Py_UNICODE *e;
6886
6887    /* Shortcut for single character strings */
6888    if (PyUnicode_GET_SIZE(self) == 1 &&
6889        Py_UNICODE_ISALNUM(*p))
6890        return PyBool_FromLong(1);
6891
6892    /* Special case for empty strings */
6893    if (PyUnicode_GET_SIZE(self) == 0)
6894        return PyBool_FromLong(0);
6895
6896    e = p + PyUnicode_GET_SIZE(self);
6897    for (; p < e; p++) {
6898        if (!Py_UNICODE_ISALNUM(*p))
6899            return PyBool_FromLong(0);
6900    }
6901    return PyBool_FromLong(1);
6902}
6903
6904PyDoc_STRVAR(isdecimal__doc__,
6905             "S.isdecimal() -> bool\n\
6906\n\
6907Return True if there are only decimal characters in S,\n\
6908False otherwise.");
6909
6910static PyObject*
6911unicode_isdecimal(PyUnicodeObject *self)
6912{
6913    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6914    register const Py_UNICODE *e;
6915
6916    /* Shortcut for single character strings */
6917    if (PyUnicode_GET_SIZE(self) == 1 &&
6918        Py_UNICODE_ISDECIMAL(*p))
6919        return PyBool_FromLong(1);
6920
6921    /* Special case for empty strings */
6922    if (PyUnicode_GET_SIZE(self) == 0)
6923        return PyBool_FromLong(0);
6924
6925    e = p + PyUnicode_GET_SIZE(self);
6926    for (; p < e; p++) {
6927        if (!Py_UNICODE_ISDECIMAL(*p))
6928            return PyBool_FromLong(0);
6929    }
6930    return PyBool_FromLong(1);
6931}
6932
6933PyDoc_STRVAR(isdigit__doc__,
6934             "S.isdigit() -> bool\n\
6935\n\
6936Return True if all characters in S are digits\n\
6937and there is at least one character in S, False otherwise.");
6938
6939static PyObject*
6940unicode_isdigit(PyUnicodeObject *self)
6941{
6942    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6943    register const Py_UNICODE *e;
6944
6945    /* Shortcut for single character strings */
6946    if (PyUnicode_GET_SIZE(self) == 1 &&
6947        Py_UNICODE_ISDIGIT(*p))
6948        return PyBool_FromLong(1);
6949
6950    /* Special case for empty strings */
6951    if (PyUnicode_GET_SIZE(self) == 0)
6952        return PyBool_FromLong(0);
6953
6954    e = p + PyUnicode_GET_SIZE(self);
6955    for (; p < e; p++) {
6956        if (!Py_UNICODE_ISDIGIT(*p))
6957            return PyBool_FromLong(0);
6958    }
6959    return PyBool_FromLong(1);
6960}
6961
6962PyDoc_STRVAR(isnumeric__doc__,
6963             "S.isnumeric() -> bool\n\
6964\n\
6965Return True if there are only numeric characters in S,\n\
6966False otherwise.");
6967
6968static PyObject*
6969unicode_isnumeric(PyUnicodeObject *self)
6970{
6971    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6972    register const Py_UNICODE *e;
6973
6974    /* Shortcut for single character strings */
6975    if (PyUnicode_GET_SIZE(self) == 1 &&
6976        Py_UNICODE_ISNUMERIC(*p))
6977        return PyBool_FromLong(1);
6978
6979    /* Special case for empty strings */
6980    if (PyUnicode_GET_SIZE(self) == 0)
6981        return PyBool_FromLong(0);
6982
6983    e = p + PyUnicode_GET_SIZE(self);
6984    for (; p < e; p++) {
6985        if (!Py_UNICODE_ISNUMERIC(*p))
6986            return PyBool_FromLong(0);
6987    }
6988    return PyBool_FromLong(1);
6989}
6990
6991PyDoc_STRVAR(join__doc__,
6992             "S.join(iterable) -> unicode\n\
6993\n\
6994Return a string which is the concatenation of the strings in the\n\
6995iterable.  The separator between elements is S.");
6996
6997static PyObject*
6998unicode_join(PyObject *self, PyObject *data)
6999{
7000    return PyUnicode_Join(self, data);
7001}
7002
7003static Py_ssize_t
7004unicode_length(PyUnicodeObject *self)
7005{
7006    return self->length;
7007}
7008
7009PyDoc_STRVAR(ljust__doc__,
7010             "S.ljust(width[, fillchar]) -> int\n\
7011\n\
7012Return S left-justified in a Unicode string of length width. Padding is\n\
7013done using the specified fill character (default is a space).");
7014
7015static PyObject *
7016unicode_ljust(PyUnicodeObject *self, PyObject *args)
7017{
7018    Py_ssize_t width;
7019    Py_UNICODE fillchar = ' ';
7020
7021    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7022        return NULL;
7023
7024    if (self->length >= width && PyUnicode_CheckExact(self)) {
7025        Py_INCREF(self);
7026        return (PyObject*) self;
7027    }
7028
7029    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7030}
7031
7032PyDoc_STRVAR(lower__doc__,
7033             "S.lower() -> unicode\n\
7034\n\
7035Return a copy of the string S converted to lowercase.");
7036
7037static PyObject*
7038unicode_lower(PyUnicodeObject *self)
7039{
7040    return fixup(self, fixlower);
7041}
7042
/* Strip-mode selectors; they double as indexes into stripformat[] below. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Argument format string for each strip mode, indexed by the selectors
   above. */
static const char *stripformat[] = {
    "|O:lstrip",    /* LEFTSTRIP */
    "|O:rstrip",    /* RIGHTSTRIP */
    "|O:strip"      /* BOTHSTRIP */
};

/* Method name for strip mode i: the format string with its three-character
   "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
7051
7052/* externally visible for str.strip(unicode) */
7053PyObject *
7054_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7055{
7056    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7057    Py_ssize_t len = PyUnicode_GET_SIZE(self);
7058    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7059    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7060    Py_ssize_t i, j;
7061
7062    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7063
7064    i = 0;
7065    if (striptype != RIGHTSTRIP) {
7066        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7067            i++;
7068        }
7069    }
7070
7071    j = len;
7072    if (striptype != LEFTSTRIP) {
7073        do {
7074            j--;
7075        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7076        j++;
7077    }
7078
7079    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7080        Py_INCREF(self);
7081        return (PyObject*)self;
7082    }
7083    else
7084        return PyUnicode_FromUnicode(s+i, j-i);
7085}
7086
7087
7088static PyObject *
7089do_strip(PyUnicodeObject *self, int striptype)
7090{
7091    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7092    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7093
7094    i = 0;
7095    if (striptype != RIGHTSTRIP) {
7096        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7097            i++;
7098        }
7099    }
7100
7101    j = len;
7102    if (striptype != LEFTSTRIP) {
7103        do {
7104            j--;
7105        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7106        j++;
7107    }
7108
7109    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7110        Py_INCREF(self);
7111        return (PyObject*)self;
7112    }
7113    else
7114        return PyUnicode_FromUnicode(s+i, j-i);
7115}
7116
7117
7118static PyObject *
7119do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7120{
7121    PyObject *sep = NULL;
7122
7123    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7124        return NULL;
7125
7126    if (sep != NULL && sep != Py_None) {
7127        if (PyUnicode_Check(sep))
7128            return _PyUnicode_XStrip(self, striptype, sep);
7129        else if (PyString_Check(sep)) {
7130            PyObject *res;
7131            sep = PyUnicode_FromObject(sep);
7132            if (sep==NULL)
7133                return NULL;
7134            res = _PyUnicode_XStrip(self, striptype, sep);
7135            Py_DECREF(sep);
7136            return res;
7137        }
7138        else {
7139            PyErr_Format(PyExc_TypeError,
7140                         "%s arg must be None, unicode or str",
7141                         STRIPNAME(striptype));
7142            return NULL;
7143        }
7144    }
7145
7146    return do_strip(self, striptype);
7147}
7148
7149
7150PyDoc_STRVAR(strip__doc__,
7151             "S.strip([chars]) -> unicode\n\
7152\n\
7153Return a copy of the string S with leading and trailing\n\
7154whitespace removed.\n\
7155If chars is given and not None, remove characters in chars instead.\n\
7156If chars is a str, it will be converted to unicode before stripping");
7157
7158static PyObject *
7159unicode_strip(PyUnicodeObject *self, PyObject *args)
7160{
7161    if (PyTuple_GET_SIZE(args) == 0)
7162        return do_strip(self, BOTHSTRIP); /* Common case */
7163    else
7164        return do_argstrip(self, BOTHSTRIP, args);
7165}
7166
7167
7168PyDoc_STRVAR(lstrip__doc__,
7169             "S.lstrip([chars]) -> unicode\n\
7170\n\
7171Return a copy of the string S with leading whitespace removed.\n\
7172If chars is given and not None, remove characters in chars instead.\n\
7173If chars is a str, it will be converted to unicode before stripping");
7174
7175static PyObject *
7176unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7177{
7178    if (PyTuple_GET_SIZE(args) == 0)
7179        return do_strip(self, LEFTSTRIP); /* Common case */
7180    else
7181        return do_argstrip(self, LEFTSTRIP, args);
7182}
7183
7184
7185PyDoc_STRVAR(rstrip__doc__,
7186             "S.rstrip([chars]) -> unicode\n\
7187\n\
7188Return a copy of the string S with trailing whitespace removed.\n\
7189If chars is given and not None, remove characters in chars instead.\n\
7190If chars is a str, it will be converted to unicode before stripping");
7191
7192static PyObject *
7193unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7194{
7195    if (PyTuple_GET_SIZE(args) == 0)
7196        return do_strip(self, RIGHTSTRIP); /* Common case */
7197    else
7198        return do_argstrip(self, RIGHTSTRIP, args);
7199}
7200
7201
7202static PyObject*
7203unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7204{
7205    PyUnicodeObject *u;
7206    Py_UNICODE *p;
7207    Py_ssize_t nchars;
7208    size_t nbytes;
7209
7210    if (len < 0)
7211        len = 0;
7212
7213    if (len == 1 && PyUnicode_CheckExact(str)) {
7214        /* no repeat, return original string */
7215        Py_INCREF(str);
7216        return (PyObject*) str;
7217    }
7218
7219    /* ensure # of chars needed doesn't overflow int and # of bytes
7220     * needed doesn't overflow size_t
7221     */
7222    nchars = len * str->length;
7223    if (len && nchars / len != str->length) {
7224        PyErr_SetString(PyExc_OverflowError,
7225                        "repeated string is too long");
7226        return NULL;
7227    }
7228    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7229    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7230        PyErr_SetString(PyExc_OverflowError,
7231                        "repeated string is too long");
7232        return NULL;
7233    }
7234    u = _PyUnicode_New(nchars);
7235    if (!u)
7236        return NULL;
7237
7238    p = u->str;
7239
7240    if (str->length == 1 && len > 0) {
7241        Py_UNICODE_FILL(p, str->str[0], len);
7242    } else {
7243        Py_ssize_t done = 0; /* number of characters copied this far */
7244        if (done < nchars) {
7245            Py_UNICODE_COPY(p, str->str, str->length);
7246            done = str->length;
7247        }
7248        while (done < nchars) {
7249            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7250            Py_UNICODE_COPY(p+done, p, n);
7251            done += n;
7252        }
7253    }
7254
7255    return (PyObject*) u;
7256}
7257
7258PyObject *PyUnicode_Replace(PyObject *obj,
7259                            PyObject *subobj,
7260                            PyObject *replobj,
7261                            Py_ssize_t maxcount)
7262{
7263    PyObject *self;
7264    PyObject *str1;
7265    PyObject *str2;
7266    PyObject *result;
7267
7268    self = PyUnicode_FromObject(obj);
7269    if (self == NULL)
7270        return NULL;
7271    str1 = PyUnicode_FromObject(subobj);
7272    if (str1 == NULL) {
7273        Py_DECREF(self);
7274        return NULL;
7275    }
7276    str2 = PyUnicode_FromObject(replobj);
7277    if (str2 == NULL) {
7278        Py_DECREF(self);
7279        Py_DECREF(str1);
7280        return NULL;
7281    }
7282    result = replace((PyUnicodeObject *)self,
7283                     (PyUnicodeObject *)str1,
7284                     (PyUnicodeObject *)str2,
7285                     maxcount);
7286    Py_DECREF(self);
7287    Py_DECREF(str1);
7288    Py_DECREF(str2);
7289    return result;
7290}
7291
7292PyDoc_STRVAR(replace__doc__,
7293             "S.replace(old, new[, count]) -> unicode\n\
7294\n\
7295Return a copy of S with all occurrences of substring\n\
7296old replaced by new.  If the optional argument count is\n\
7297given, only the first count occurrences are replaced.");
7298
7299static PyObject*
7300unicode_replace(PyUnicodeObject *self, PyObject *args)
7301{
7302    PyUnicodeObject *str1;
7303    PyUnicodeObject *str2;
7304    Py_ssize_t maxcount = -1;
7305    PyObject *result;
7306
7307    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7308        return NULL;
7309    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7310    if (str1 == NULL)
7311        return NULL;
7312    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7313    if (str2 == NULL) {
7314        Py_DECREF(str1);
7315        return NULL;
7316    }
7317
7318    result = replace(self, str1, str2, maxcount);
7319
7320    Py_DECREF(str1);
7321    Py_DECREF(str2);
7322    return result;
7323}
7324
7325static
7326PyObject *unicode_repr(PyObject *unicode)
7327{
7328    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7329                                PyUnicode_GET_SIZE(unicode),
7330                                1);
7331}
7332
7333PyDoc_STRVAR(rfind__doc__,
7334             "S.rfind(sub [,start [,end]]) -> int\n\
7335\n\
7336Return the highest index in S where substring sub is found,\n\
7337such that sub is contained within S[start:end].  Optional\n\
7338arguments start and end are interpreted as in slice notation.\n\
7339\n\
7340Return -1 on failure.");
7341
7342static PyObject *
7343unicode_rfind(PyUnicodeObject *self, PyObject *args)
7344{
7345    PyUnicodeObject *substring;
7346    Py_ssize_t start;
7347    Py_ssize_t end;
7348    Py_ssize_t result;
7349
7350    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7351                                            &start, &end))
7352        return NULL;
7353
7354    result = stringlib_rfind_slice(
7355        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7356        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7357        start, end
7358        );
7359
7360    Py_DECREF(substring);
7361
7362    return PyInt_FromSsize_t(result);
7363}
7364
7365PyDoc_STRVAR(rindex__doc__,
7366             "S.rindex(sub [,start [,end]]) -> int\n\
7367\n\
7368Like S.rfind() but raise ValueError when the substring is not found.");
7369
7370static PyObject *
7371unicode_rindex(PyUnicodeObject *self, PyObject *args)
7372{
7373    PyUnicodeObject *substring;
7374    Py_ssize_t start;
7375    Py_ssize_t end;
7376    Py_ssize_t result;
7377
7378    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7379                                            &start, &end))
7380        return NULL;
7381
7382    result = stringlib_rfind_slice(
7383        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7384        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7385        start, end
7386        );
7387
7388    Py_DECREF(substring);
7389
7390    if (result < 0) {
7391        PyErr_SetString(PyExc_ValueError, "substring not found");
7392        return NULL;
7393    }
7394    return PyInt_FromSsize_t(result);
7395}
7396
7397PyDoc_STRVAR(rjust__doc__,
7398             "S.rjust(width[, fillchar]) -> unicode\n\
7399\n\
7400Return S right-justified in a Unicode string of length width. Padding is\n\
7401done using the specified fill character (default is a space).");
7402
7403static PyObject *
7404unicode_rjust(PyUnicodeObject *self, PyObject *args)
7405{
7406    Py_ssize_t width;
7407    Py_UNICODE fillchar = ' ';
7408
7409    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7410        return NULL;
7411
7412    if (self->length >= width && PyUnicode_CheckExact(self)) {
7413        Py_INCREF(self);
7414        return (PyObject*) self;
7415    }
7416
7417    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7418}
7419
7420static PyObject*
7421unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7422{
7423    /* standard clamping */
7424    if (start < 0)
7425        start = 0;
7426    if (end < 0)
7427        end = 0;
7428    if (end > self->length)
7429        end = self->length;
7430    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7431        /* full slice, return original string */
7432        Py_INCREF(self);
7433        return (PyObject*) self;
7434    }
7435    if (start > end)
7436        start = end;
7437    /* copy slice */
7438    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7439                                             end - start);
7440}
7441
7442PyObject *PyUnicode_Split(PyObject *s,
7443                          PyObject *sep,
7444                          Py_ssize_t maxsplit)
7445{
7446    PyObject *result;
7447
7448    s = PyUnicode_FromObject(s);
7449    if (s == NULL)
7450        return NULL;
7451    if (sep != NULL) {
7452        sep = PyUnicode_FromObject(sep);
7453        if (sep == NULL) {
7454            Py_DECREF(s);
7455            return NULL;
7456        }
7457    }
7458
7459    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7460
7461    Py_DECREF(s);
7462    Py_XDECREF(sep);
7463    return result;
7464}
7465
7466PyDoc_STRVAR(split__doc__,
7467             "S.split([sep [,maxsplit]]) -> list of strings\n\
7468\n\
7469Return a list of the words in S, using sep as the\n\
7470delimiter string.  If maxsplit is given, at most maxsplit\n\
7471splits are done. If sep is not specified or is None, any\n\
7472whitespace string is a separator and empty strings are\n\
7473removed from the result.");
7474
7475static PyObject*
7476unicode_split(PyUnicodeObject *self, PyObject *args)
7477{
7478    PyObject *substring = Py_None;
7479    Py_ssize_t maxcount = -1;
7480
7481    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7482        return NULL;
7483
7484    if (substring == Py_None)
7485        return split(self, NULL, maxcount);
7486    else if (PyUnicode_Check(substring))
7487        return split(self, (PyUnicodeObject *)substring, maxcount);
7488    else
7489        return PyUnicode_Split((PyObject *)self, substring, maxcount);
7490}
7491
7492PyObject *
7493PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7494{
7495    PyObject* str_obj;
7496    PyObject* sep_obj;
7497    PyObject* out;
7498
7499    str_obj = PyUnicode_FromObject(str_in);
7500    if (!str_obj)
7501        return NULL;
7502    sep_obj = PyUnicode_FromObject(sep_in);
7503    if (!sep_obj) {
7504        Py_DECREF(str_obj);
7505        return NULL;
7506    }
7507
7508    out = stringlib_partition(
7509        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7510        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7511        );
7512
7513    Py_DECREF(sep_obj);
7514    Py_DECREF(str_obj);
7515
7516    return out;
7517}
7518
7519
7520PyObject *
7521PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7522{
7523    PyObject* str_obj;
7524    PyObject* sep_obj;
7525    PyObject* out;
7526
7527    str_obj = PyUnicode_FromObject(str_in);
7528    if (!str_obj)
7529        return NULL;
7530    sep_obj = PyUnicode_FromObject(sep_in);
7531    if (!sep_obj) {
7532        Py_DECREF(str_obj);
7533        return NULL;
7534    }
7535
7536    out = stringlib_rpartition(
7537        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7538        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7539        );
7540
7541    Py_DECREF(sep_obj);
7542    Py_DECREF(str_obj);
7543
7544    return out;
7545}
7546
7547PyDoc_STRVAR(partition__doc__,
7548             "S.partition(sep) -> (head, sep, tail)\n\
7549\n\
7550Search for the separator sep in S, and return the part before it,\n\
7551the separator itself, and the part after it.  If the separator is not\n\
7552found, return S and two empty strings.");
7553
7554static PyObject*
7555unicode_partition(PyUnicodeObject *self, PyObject *separator)
7556{
7557    return PyUnicode_Partition((PyObject *)self, separator);
7558}
7559
7560PyDoc_STRVAR(rpartition__doc__,
7561             "S.rpartition(sep) -> (head, sep, tail)\n\
7562\n\
7563Search for the separator sep in S, starting at the end of S, and return\n\
7564the part before it, the separator itself, and the part after it.  If the\n\
7565separator is not found, return two empty strings and S.");
7566
7567static PyObject*
7568unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7569{
7570    return PyUnicode_RPartition((PyObject *)self, separator);
7571}
7572
7573PyObject *PyUnicode_RSplit(PyObject *s,
7574                           PyObject *sep,
7575                           Py_ssize_t maxsplit)
7576{
7577    PyObject *result;
7578
7579    s = PyUnicode_FromObject(s);
7580    if (s == NULL)
7581        return NULL;
7582    if (sep != NULL) {
7583        sep = PyUnicode_FromObject(sep);
7584        if (sep == NULL) {
7585            Py_DECREF(s);
7586            return NULL;
7587        }
7588    }
7589
7590    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7591
7592    Py_DECREF(s);
7593    Py_XDECREF(sep);
7594    return result;
7595}
7596
7597PyDoc_STRVAR(rsplit__doc__,
7598             "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7599\n\
7600Return a list of the words in S, using sep as the\n\
7601delimiter string, starting at the end of the string and\n\
7602working to the front.  If maxsplit is given, at most maxsplit\n\
7603splits are done. If sep is not specified, any whitespace string\n\
7604is a separator.");
7605
7606static PyObject*
7607unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7608{
7609    PyObject *substring = Py_None;
7610    Py_ssize_t maxcount = -1;
7611
7612    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7613        return NULL;
7614
7615    if (substring == Py_None)
7616        return rsplit(self, NULL, maxcount);
7617    else if (PyUnicode_Check(substring))
7618        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7619    else
7620        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7621}
7622
7623PyDoc_STRVAR(splitlines__doc__,
7624             "S.splitlines(keepends=False) -> list of strings\n\
7625\n\
7626Return a list of the lines in S, breaking at line boundaries.\n\
7627Line breaks are not included in the resulting list unless keepends\n\
7628is given and true.");
7629
7630static PyObject*
7631unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7632{
7633    int keepends = 0;
7634
7635    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7636        return NULL;
7637
7638    return PyUnicode_Splitlines((PyObject *)self, keepends);
7639}
7640
7641static
7642PyObject *unicode_str(PyUnicodeObject *self)
7643{
7644    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7645}
7646
7647PyDoc_STRVAR(swapcase__doc__,
7648             "S.swapcase() -> unicode\n\
7649\n\
7650Return a copy of S with uppercase characters converted to lowercase\n\
7651and vice versa.");
7652
7653static PyObject*
7654unicode_swapcase(PyUnicodeObject *self)
7655{
7656    return fixup(self, fixswapcase);
7657}
7658
7659PyDoc_STRVAR(translate__doc__,
7660             "S.translate(table) -> unicode\n\
7661\n\
7662Return a copy of the string S, where all characters have been mapped\n\
7663through the given translation table, which must be a mapping of\n\
7664Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7665Unmapped characters are left untouched. Characters mapped to None\n\
7666are deleted.");
7667
7668static PyObject*
7669unicode_translate(PyUnicodeObject *self, PyObject *table)
7670{
7671    return PyUnicode_TranslateCharmap(self->str,
7672                                      self->length,
7673                                      table,
7674                                      "ignore");
7675}
7676
7677PyDoc_STRVAR(upper__doc__,
7678             "S.upper() -> unicode\n\
7679\n\
7680Return a copy of S converted to uppercase.");
7681
7682static PyObject*
7683unicode_upper(PyUnicodeObject *self)
7684{
7685    return fixup(self, fixupper);
7686}
7687
7688PyDoc_STRVAR(zfill__doc__,
7689             "S.zfill(width) -> unicode\n\
7690\n\
7691Pad a numeric string S with zeros on the left, to fill a field\n\
7692of the specified width. The string S is never truncated.");
7693
7694static PyObject *
7695unicode_zfill(PyUnicodeObject *self, PyObject *args)
7696{
7697    Py_ssize_t fill;
7698    PyUnicodeObject *u;
7699
7700    Py_ssize_t width;
7701    if (!PyArg_ParseTuple(args, "n:zfill", &width))
7702        return NULL;
7703
7704    if (self->length >= width) {
7705        if (PyUnicode_CheckExact(self)) {
7706            Py_INCREF(self);
7707            return (PyObject*) self;
7708        }
7709        else
7710            return PyUnicode_FromUnicode(
7711                PyUnicode_AS_UNICODE(self),
7712                PyUnicode_GET_SIZE(self)
7713                );
7714    }
7715
7716    fill = width - self->length;
7717
7718    u = pad(self, fill, 0, '0');
7719
7720    if (u == NULL)
7721        return NULL;
7722
7723    if (u->str[fill] == '+' || u->str[fill] == '-') {
7724        /* move sign to beginning of string */
7725        u->str[0] = u->str[fill];
7726        u->str[fill] = '0';
7727    }
7728
7729    return (PyObject*) u;
7730}
7731
#if 0
/* Compiled out.  Debugging helper that returns the value of `numfree`
   as a Python int; its (also disabled) entry in unicode_methods is
   labelled as being used only for debugging the implementation.
   NOTE(review): `numfree` is defined outside this section -- presumably
   the unicode free-list counter; confirm before re-enabling. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7739
7740PyDoc_STRVAR(startswith__doc__,
7741             "S.startswith(prefix[, start[, end]]) -> bool\n\
7742\n\
7743Return True if S starts with the specified prefix, False otherwise.\n\
7744With optional start, test S beginning at that position.\n\
7745With optional end, stop comparing S at that position.\n\
7746prefix can also be a tuple of strings to try.");
7747
7748static PyObject *
7749unicode_startswith(PyUnicodeObject *self,
7750                   PyObject *args)
7751{
7752    PyObject *subobj;
7753    PyUnicodeObject *substring;
7754    Py_ssize_t start = 0;
7755    Py_ssize_t end = PY_SSIZE_T_MAX;
7756    int result;
7757
7758    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7759        return NULL;
7760    if (PyTuple_Check(subobj)) {
7761        Py_ssize_t i;
7762        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7763            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7764                PyTuple_GET_ITEM(subobj, i));
7765            if (substring == NULL)
7766                return NULL;
7767            result = tailmatch(self, substring, start, end, -1);
7768            Py_DECREF(substring);
7769            if (result) {
7770                Py_RETURN_TRUE;
7771            }
7772        }
7773        /* nothing matched */
7774        Py_RETURN_FALSE;
7775    }
7776    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7777    if (substring == NULL) {
7778        if (PyErr_ExceptionMatches(PyExc_TypeError))
7779            PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7780                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7781        return NULL;
7782    }
7783    result = tailmatch(self, substring, start, end, -1);
7784    Py_DECREF(substring);
7785    return PyBool_FromLong(result);
7786}
7787
7788
7789PyDoc_STRVAR(endswith__doc__,
7790             "S.endswith(suffix[, start[, end]]) -> bool\n\
7791\n\
7792Return True if S ends with the specified suffix, False otherwise.\n\
7793With optional start, test S beginning at that position.\n\
7794With optional end, stop comparing S at that position.\n\
7795suffix can also be a tuple of strings to try.");
7796
7797static PyObject *
7798unicode_endswith(PyUnicodeObject *self,
7799                 PyObject *args)
7800{
7801    PyObject *subobj;
7802    PyUnicodeObject *substring;
7803    Py_ssize_t start = 0;
7804    Py_ssize_t end = PY_SSIZE_T_MAX;
7805    int result;
7806
7807    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7808        return NULL;
7809    if (PyTuple_Check(subobj)) {
7810        Py_ssize_t i;
7811        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7812            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7813                PyTuple_GET_ITEM(subobj, i));
7814            if (substring == NULL)
7815                return NULL;
7816            result = tailmatch(self, substring, start, end, +1);
7817            Py_DECREF(substring);
7818            if (result) {
7819                Py_RETURN_TRUE;
7820            }
7821        }
7822        Py_RETURN_FALSE;
7823    }
7824    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7825    if (substring == NULL) {
7826        if (PyErr_ExceptionMatches(PyExc_TypeError))
7827            PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7828                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7829        return NULL;
7830    }
7831    result = tailmatch(self, substring, start, end, +1);
7832    Py_DECREF(substring);
7833    return PyBool_FromLong(result);
7834}
7835
7836
7837/* Implements do_string_format, which is unicode because of stringlib */
7838#include "stringlib/string_format.h"
7839
7840PyDoc_STRVAR(format__doc__,
7841             "S.format(*args, **kwargs) -> unicode\n\
7842\n\
7843Return a formatted version of S, using substitutions from args and kwargs.\n\
7844The substitutions are identified by braces ('{' and '}').");
7845
7846static PyObject *
7847unicode__format__(PyObject *self, PyObject *args)
7848{
7849    PyObject *format_spec;
7850    PyObject *result = NULL;
7851    PyObject *tmp = NULL;
7852
7853    /* If 2.x, convert format_spec to the same type as value */
7854    /* This is to allow things like u''.format('') */
7855    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7856        goto done;
7857    if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7858        PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7859                     "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7860        goto done;
7861    }
7862    tmp = PyObject_Unicode(format_spec);
7863    if (tmp == NULL)
7864        goto done;
7865    format_spec = tmp;
7866
7867    result = _PyUnicode_FormatAdvanced(self,
7868                                       PyUnicode_AS_UNICODE(format_spec),
7869                                       PyUnicode_GET_SIZE(format_spec));
7870  done:
7871    Py_XDECREF(tmp);
7872    return result;
7873}
7874
7875PyDoc_STRVAR(p_format__doc__,
7876             "S.__format__(format_spec) -> unicode\n\
7877\n\
7878Return a formatted version of S as described by format_spec.");
7879
7880static PyObject *
7881unicode__sizeof__(PyUnicodeObject *v)
7882{
7883    return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7884                             sizeof(Py_UNICODE) * (v->length + 1));
7885}
7886
7887PyDoc_STRVAR(sizeof__doc__,
7888             "S.__sizeof__() -> size of S in memory, in bytes\n\
7889\n\
7890");
7891
7892static PyObject *
7893unicode_getnewargs(PyUnicodeObject *v)
7894{
7895    return Py_BuildValue("(u#)", v->str, v->length);
7896}
7897
7898
/* Method table of the unicode type.

   Each entry lists the Python-visible method name, the C function that
   implements it, the calling-convention flags (METH_NOARGS, METH_O,
   METH_VARARGS, optionally combined with METH_KEYWORDS) and, where one
   exists, the docstring.  Entries written with only three fields
   (_formatter_field_name_split, _formatter_parser, __getnewargs__)
   carry no docstring.  The {NULL, NULL} entry terminates the table. */
static PyMethodDef unicode_methods[] = {
    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    /* "format" is implemented by do_string_format, pulled in through
       stringlib/string_format.h */
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7957
7958static PyObject *
7959unicode_mod(PyObject *v, PyObject *w)
7960{
7961    if (!PyUnicode_Check(v)) {
7962        Py_INCREF(Py_NotImplemented);
7963        return Py_NotImplemented;
7964    }
7965    return PyUnicode_Format(v, w);
7966}
7967
/* Number protocol table of the unicode type.  Only the remainder slot is
   populated (with unicode_mod, i.e. the `%` operator); the initializer
   stops after nb_remainder, so every later slot is zero-initialized. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
7975
/* Sequence protocol table of the unicode type.  The two assignment
   slots are left empty (0); length, concatenation, repetition, item
   access, slicing and containment are wired to the functions named
   below. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
7986
7987static PyObject*
7988unicode_subscript(PyUnicodeObject* self, PyObject* item)
7989{
7990    if (PyIndex_Check(item)) {
7991        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7992        if (i == -1 && PyErr_Occurred())
7993            return NULL;
7994        if (i < 0)
7995            i += PyUnicode_GET_SIZE(self);
7996        return unicode_getitem(self, i);
7997    } else if (PySlice_Check(item)) {
7998        Py_ssize_t start, stop, step, slicelength, cur, i;
7999        Py_UNICODE* source_buf;
8000        Py_UNICODE* result_buf;
8001        PyObject* result;
8002
8003        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8004                                 &start, &stop, &step, &slicelength) < 0) {
8005            return NULL;
8006        }
8007
8008        if (slicelength <= 0) {
8009            return PyUnicode_FromUnicode(NULL, 0);
8010        } else if (start == 0 && step == 1 && slicelength == self->length &&
8011                   PyUnicode_CheckExact(self)) {
8012            Py_INCREF(self);
8013            return (PyObject *)self;
8014        } else if (step == 1) {
8015            return PyUnicode_FromUnicode(self->str + start, slicelength);
8016        } else {
8017            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8018            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8019                                                       sizeof(Py_UNICODE));
8020
8021            if (result_buf == NULL)
8022                return PyErr_NoMemory();
8023
8024            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8025                result_buf[i] = source_buf[cur];
8026            }
8027
8028            result = PyUnicode_FromUnicode(result_buf, slicelength);
8029            PyObject_FREE(result_buf);
8030            return result;
8031        }
8032    } else {
8033        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8034        return NULL;
8035    }
8036}
8037
/* Mapping protocol table of the unicode type: length and subscription
   are provided, item assignment is not (the slot is 0). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
8043
8044static Py_ssize_t
8045unicode_buffer_getreadbuf(PyUnicodeObject *self,
8046                          Py_ssize_t index,
8047                          const void **ptr)
8048{
8049    if (index != 0) {
8050        PyErr_SetString(PyExc_SystemError,
8051                        "accessing non-existent unicode segment");
8052        return -1;
8053    }
8054    *ptr = (void *) self->str;
8055    return PyUnicode_GET_DATA_SIZE(self);
8056}
8057
8058static Py_ssize_t
8059unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
8060                           const void **ptr)
8061{
8062    PyErr_SetString(PyExc_TypeError,
8063                    "cannot use unicode as modifiable buffer");
8064    return -1;
8065}
8066
8067static int
8068unicode_buffer_getsegcount(PyUnicodeObject *self,
8069                           Py_ssize_t *lenp)
8070{
8071    if (lenp)
8072        *lenp = PyUnicode_GET_DATA_SIZE(self);
8073    return 1;
8074}
8075
8076static Py_ssize_t
8077unicode_buffer_getcharbuf(PyUnicodeObject *self,
8078                          Py_ssize_t index,
8079                          const void **ptr)
8080{
8081    PyObject *str;
8082
8083    if (index != 0) {
8084        PyErr_SetString(PyExc_SystemError,
8085                        "accessing non-existent unicode segment");
8086        return -1;
8087    }
8088    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
8089    if (str == NULL)
8090        return -1;
8091    *ptr = (void *) PyString_AS_STRING(str);
8092    return PyString_GET_SIZE(str);
8093}
8094
8095/* Helpers for PyUnicode_Format() */
8096
8097static PyObject *
8098getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8099{
8100    Py_ssize_t argidx = *p_argidx;
8101    if (argidx < arglen) {
8102        (*p_argidx)++;
8103        if (arglen < 0)
8104            return args;
8105        else
8106            return PyTuple_GetItem(args, argidx);
8107    }
8108    PyErr_SetString(PyExc_TypeError,
8109                    "not enough arguments for format string");
8110    return NULL;
8111}
8112
/* Bit flags combined in the `flags` argument of the formatting helpers
   below.  Within this section only F_ALT is consulted: formatfloat()
   and formatint() use it to select the alternate ('#') form.
   NOTE(review): the other names presumably mirror the printf-style
   '-', '+', ' ' and '0' flags -- confirm against PyUnicode_Format(). */
#define F_LJUST (1<<0)
#define F_SIGN  (1<<1)
#define F_BLANK (1<<2)
#define F_ALT   (1<<3)
#define F_ZERO  (1<<4)
8118
8119static Py_ssize_t
8120strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8121{
8122    register Py_ssize_t i;
8123    Py_ssize_t len = strlen(charbuffer);
8124    for (i = len - 1; i >= 0; i--)
8125        buffer[i] = (Py_UNICODE) charbuffer[i];
8126
8127    return len;
8128}
8129
8130static int
8131longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8132{
8133    Py_ssize_t result;
8134
8135    PyOS_snprintf((char *)buffer, len, format, x);
8136    result = strtounicode(buffer, (char *)buffer);
8137    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8138}
8139
8140/* XXX To save some code duplication, formatfloat/long/int could have been
8141   shared with stringobject.c, converting from 8-bit to Unicode after the
8142   formatting is done. */
8143
8144/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8145
8146static PyObject *
8147formatfloat(PyObject *v, int flags, int prec, int type)
8148{
8149    char *p;
8150    PyObject *result;
8151    double x;
8152
8153    x = PyFloat_AsDouble(v);
8154    if (x == -1.0 && PyErr_Occurred())
8155        return NULL;
8156
8157    if (prec < 0)
8158        prec = 6;
8159
8160    p = PyOS_double_to_string(x, type, prec,
8161                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8162    if (p == NULL)
8163        return NULL;
8164    result = PyUnicode_FromStringAndSize(p, strlen(p));
8165    PyMem_Free(p);
8166    return result;
8167}
8168
8169static PyObject*
8170formatlong(PyObject *val, int flags, int prec, int type)
8171{
8172    char *buf;
8173    int i, len;
8174    PyObject *str; /* temporary string object. */
8175    PyUnicodeObject *result;
8176
8177    str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8178    if (!str)
8179        return NULL;
8180    result = _PyUnicode_New(len);
8181    if (!result) {
8182        Py_DECREF(str);
8183        return NULL;
8184    }
8185    for (i = 0; i < len; i++)
8186        result->str[i] = buf[i];
8187    result->str[len] = 0;
8188    Py_DECREF(str);
8189    return (PyObject*)result;
8190}
8191
8192static int
8193formatint(Py_UNICODE *buf,
8194          size_t buflen,
8195          int flags,
8196          int prec,
8197          int type,
8198          PyObject *v)
8199{
8200    /* fmt = '%#.' + `prec` + 'l' + `type`
8201     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8202     *                     + 1 + 1
8203     *                   = 24
8204     */
8205    char fmt[64]; /* plenty big enough! */
8206    char *sign;
8207    long x;
8208
8209    x = PyInt_AsLong(v);
8210    if (x == -1 && PyErr_Occurred())
8211        return -1;
8212    if (x < 0 && type == 'u') {
8213        type = 'd';
8214    }
8215    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8216        sign = "-";
8217    else
8218        sign = "";
8219    if (prec < 0)
8220        prec = 1;
8221
8222    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8223     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8224     */
8225    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8226        PyErr_SetString(PyExc_OverflowError,
8227                        "formatted integer is too long (precision too large?)");
8228        return -1;
8229    }
8230
8231    if ((flags & F_ALT) &&
8232        (type == 'x' || type == 'X')) {
8233        /* When converting under %#x or %#X, there are a number
8234         * of issues that cause pain:
8235         * - when 0 is being converted, the C standard leaves off
8236         *   the '0x' or '0X', which is inconsistent with other
8237         *   %#x/%#X conversions and inconsistent with Python's
8238         *   hex() function
8239         * - there are platforms that violate the standard and
8240         *   convert 0 with the '0x' or '0X'
8241         *   (Metrowerks, Compaq Tru64)
8242         * - there are platforms that give '0x' when converting
8243         *   under %#X, but convert 0 in accordance with the
8244         *   standard (OS/2 EMX)
8245         *
8246         * We can achieve the desired consistency by inserting our
8247         * own '0x' or '0X' prefix, and substituting %x/%X in place
8248         * of %#x/%#X.
8249         *
8250         * Note that this is the same approach as used in
8251         * formatint() in stringobject.c
8252         */
8253        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8254                      sign, type, prec, type);
8255    }
8256    else {
8257        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8258                      sign, (flags&F_ALT) ? "#" : "",
8259                      prec, type);
8260    }
8261    if (sign[0])
8262        return longtounicode(buf, buflen, fmt, -x);
8263    else
8264        return longtounicode(buf, buflen, fmt, x);
8265}
8266
8267static int
8268formatchar(Py_UNICODE *buf,
8269           size_t buflen,
8270           PyObject *v)
8271{
8272    PyObject *unistr;
8273    char *str;
8274    /* presume that the buffer is at least 2 characters long */
8275    if (PyUnicode_Check(v)) {
8276        if (PyUnicode_GET_SIZE(v) != 1)
8277            goto onError;
8278        buf[0] = PyUnicode_AS_UNICODE(v)[0];
8279    }
8280
8281    else if (PyString_Check(v)) {
8282        if (PyString_GET_SIZE(v) != 1)
8283            goto onError;
8284        /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8285           with a UnicodeDecodeError if 'char' is not decodable with the
8286           default encoding (usually ASCII, but it might be something else) */
8287        str = PyString_AS_STRING(v);
8288        if ((unsigned char)str[0] > 0x7F) {
8289            /* the char is not ASCII; try to decode the string using the
8290               default encoding and return -1 to let the UnicodeDecodeError
8291               be raised if the string can't be decoded */
8292            unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8293            if (unistr == NULL)
8294                return -1;
8295            buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8296            Py_DECREF(unistr);
8297        }
8298        else
8299            buf[0] = (Py_UNICODE)str[0];
8300    }
8301
8302    else {
8303        /* Integer input truncated to a character */
8304        long x;
8305        x = PyInt_AsLong(v);
8306        if (x == -1 && PyErr_Occurred())
8307            goto onError;
8308#ifdef Py_UNICODE_WIDE
8309        if (x < 0 || x > 0x10ffff) {
8310            PyErr_SetString(PyExc_OverflowError,
8311                            "%c arg not in range(0x110000) "
8312                            "(wide Python build)");
8313            return -1;
8314        }
8315#else
8316        if (x < 0 || x > 0xffff) {
8317            PyErr_SetString(PyExc_OverflowError,
8318                            "%c arg not in range(0x10000) "
8319                            "(narrow Python build)");
8320            return -1;
8321        }
8322#endif
8323        buf[0] = (Py_UNICODE) x;
8324    }
8325    buf[1] = '\0';
8326    return 1;
8327
8328  onError:
8329    PyErr_SetString(PyExc_TypeError,
8330                    "%c requires int or char");
8331    return -1;
8332}
8333
8334/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
8335
8336   FORMATBUFLEN is the length of the buffer in which the ints &
8337   chars are formatted. XXX This is a magic number. Each formatting
8338   routine does bounds checking to ensure no overflow, but a better
8339   solution may be to malloc a buffer of appropriate size for each
8340   format. For now, the current solution is sufficient.
8341*/
8342#define FORMATBUFLEN (size_t)120
8343
8344PyObject *PyUnicode_Format(PyObject *format,
8345                           PyObject *args)
8346{
8347    Py_UNICODE *fmt, *res;
8348    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8349    int args_owned = 0;
8350    PyUnicodeObject *result = NULL;
8351    PyObject *dict = NULL;
8352    PyObject *uformat;
8353
8354    if (format == NULL || args == NULL) {
8355        PyErr_BadInternalCall();
8356        return NULL;
8357    }
8358    uformat = PyUnicode_FromObject(format);
8359    if (uformat == NULL)
8360        return NULL;
8361    fmt = PyUnicode_AS_UNICODE(uformat);
8362    fmtcnt = PyUnicode_GET_SIZE(uformat);
8363
8364    reslen = rescnt = fmtcnt + 100;
8365    result = _PyUnicode_New(reslen);
8366    if (result == NULL)
8367        goto onError;
8368    res = PyUnicode_AS_UNICODE(result);
8369
8370    if (PyTuple_Check(args)) {
8371        arglen = PyTuple_Size(args);
8372        argidx = 0;
8373    }
8374    else {
8375        arglen = -1;
8376        argidx = -2;
8377    }
8378    if (Py_TYPE(args)->tp_as_mapping && Py_TYPE(args)->tp_as_mapping->mp_subscript &&
8379        !PyTuple_Check(args) && !PyObject_TypeCheck(args, &PyBaseString_Type))
8380        dict = args;
8381
8382    while (--fmtcnt >= 0) {
8383        if (*fmt != '%') {
8384            if (--rescnt < 0) {
8385                rescnt = fmtcnt + 100;
8386                reslen += rescnt;
8387                if (_PyUnicode_Resize(&result, reslen) < 0)
8388                    goto onError;
8389                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8390                --rescnt;
8391            }
8392            *res++ = *fmt++;
8393        }
8394        else {
8395            /* Got a format specifier */
8396            int flags = 0;
8397            Py_ssize_t width = -1;
8398            int prec = -1;
8399            Py_UNICODE c = '\0';
8400            Py_UNICODE fill;
8401            int isnumok;
8402            PyObject *v = NULL;
8403            PyObject *temp = NULL;
8404            Py_UNICODE *pbuf;
8405            Py_UNICODE sign;
8406            Py_ssize_t len;
8407            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8408
8409            fmt++;
8410            if (*fmt == '(') {
8411                Py_UNICODE *keystart;
8412                Py_ssize_t keylen;
8413                PyObject *key;
8414                int pcount = 1;
8415
8416                if (dict == NULL) {
8417                    PyErr_SetString(PyExc_TypeError,
8418                                    "format requires a mapping");
8419                    goto onError;
8420                }
8421                ++fmt;
8422                --fmtcnt;
8423                keystart = fmt;
8424                /* Skip over balanced parentheses */
8425                while (pcount > 0 && --fmtcnt >= 0) {
8426                    if (*fmt == ')')
8427                        --pcount;
8428                    else if (*fmt == '(')
8429                        ++pcount;
8430                    fmt++;
8431                }
8432                keylen = fmt - keystart - 1;
8433                if (fmtcnt < 0 || pcount > 0) {
8434                    PyErr_SetString(PyExc_ValueError,
8435                                    "incomplete format key");
8436                    goto onError;
8437                }
8438#if 0
8439                /* keys are converted to strings using UTF-8 and
8440                   then looked up since Python uses strings to hold
8441                   variables names etc. in its namespaces and we
8442                   wouldn't want to break common idioms. */
8443                key = PyUnicode_EncodeUTF8(keystart,
8444                                           keylen,
8445                                           NULL);
8446#else
8447                key = PyUnicode_FromUnicode(keystart, keylen);
8448#endif
8449                if (key == NULL)
8450                    goto onError;
8451                if (args_owned) {
8452                    Py_DECREF(args);
8453                    args_owned = 0;
8454                }
8455                args = PyObject_GetItem(dict, key);
8456                Py_DECREF(key);
8457                if (args == NULL) {
8458                    goto onError;
8459                }
8460                args_owned = 1;
8461                arglen = -1;
8462                argidx = -2;
8463            }
8464            while (--fmtcnt >= 0) {
8465                switch (c = *fmt++) {
8466                case '-': flags |= F_LJUST; continue;
8467                case '+': flags |= F_SIGN; continue;
8468                case ' ': flags |= F_BLANK; continue;
8469                case '#': flags |= F_ALT; continue;
8470                case '0': flags |= F_ZERO; continue;
8471                }
8472                break;
8473            }
8474            if (c == '*') {
8475                v = getnextarg(args, arglen, &argidx);
8476                if (v == NULL)
8477                    goto onError;
8478                if (!PyInt_Check(v)) {
8479                    PyErr_SetString(PyExc_TypeError,
8480                                    "* wants int");
8481                    goto onError;
8482                }
8483                width = PyInt_AsSsize_t(v);
8484                if (width == -1 && PyErr_Occurred())
8485                    goto onError;
8486                if (width < 0) {
8487                    flags |= F_LJUST;
8488                    width = -width;
8489                }
8490                if (--fmtcnt >= 0)
8491                    c = *fmt++;
8492            }
8493            else if (c >= '0' && c <= '9') {
8494                width = c - '0';
8495                while (--fmtcnt >= 0) {
8496                    c = *fmt++;
8497                    if (c < '0' || c > '9')
8498                        break;
8499                    if (width > (PY_SSIZE_T_MAX - ((int)c - '0')) / 10) {
8500                        PyErr_SetString(PyExc_ValueError,
8501                                        "width too big");
8502                        goto onError;
8503                    }
8504                    width = width*10 + (c - '0');
8505                }
8506            }
8507            if (c == '.') {
8508                prec = 0;
8509                if (--fmtcnt >= 0)
8510                    c = *fmt++;
8511                if (c == '*') {
8512                    v = getnextarg(args, arglen, &argidx);
8513                    if (v == NULL)
8514                        goto onError;
8515                    if (!PyInt_Check(v)) {
8516                        PyErr_SetString(PyExc_TypeError,
8517                                        "* wants int");
8518                        goto onError;
8519                    }
8520                    prec = _PyInt_AsInt(v);
8521                    if (prec == -1 && PyErr_Occurred())
8522                        goto onError;
8523                    if (prec < 0)
8524                        prec = 0;
8525                    if (--fmtcnt >= 0)
8526                        c = *fmt++;
8527                }
8528                else if (c >= '0' && c <= '9') {
8529                    prec = c - '0';
8530                    while (--fmtcnt >= 0) {
8531                        c = *fmt++;
8532                        if (c < '0' || c > '9')
8533                            break;
8534                        if (prec > (INT_MAX - ((int)c - '0')) / 10) {
8535                            PyErr_SetString(PyExc_ValueError,
8536                                            "prec too big");
8537                            goto onError;
8538                        }
8539                        prec = prec*10 + (c - '0');
8540                    }
8541                }
8542            } /* prec */
8543            if (fmtcnt >= 0) {
8544                if (c == 'h' || c == 'l' || c == 'L') {
8545                    if (--fmtcnt >= 0)
8546                        c = *fmt++;
8547                }
8548            }
8549            if (fmtcnt < 0) {
8550                PyErr_SetString(PyExc_ValueError,
8551                                "incomplete format");
8552                goto onError;
8553            }
8554            if (c != '%') {
8555                v = getnextarg(args, arglen, &argidx);
8556                if (v == NULL)
8557                    goto onError;
8558            }
8559            sign = 0;
8560            fill = ' ';
8561            switch (c) {
8562
8563            case '%':
8564                pbuf = formatbuf;
8565                /* presume that buffer length is at least 1 */
8566                pbuf[0] = '%';
8567                len = 1;
8568                break;
8569
8570            case 's':
8571            case 'r':
8572                if (PyUnicode_CheckExact(v) && c == 's') {
8573                    temp = v;
8574                    Py_INCREF(temp);
8575                }
8576                else {
8577                    PyObject *unicode;
8578                    if (c == 's')
8579                        temp = PyObject_Unicode(v);
8580                    else
8581                        temp = PyObject_Repr(v);
8582                    if (temp == NULL)
8583                        goto onError;
8584                    if (PyUnicode_Check(temp))
8585                        /* nothing to do */;
8586                    else if (PyString_Check(temp)) {
8587                        /* convert to string to Unicode */
8588                        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8589                                                   PyString_GET_SIZE(temp),
8590                                                   NULL,
8591                                                   "strict");
8592                        Py_DECREF(temp);
8593                        temp = unicode;
8594                        if (temp == NULL)
8595                            goto onError;
8596                    }
8597                    else {
8598                        Py_DECREF(temp);
8599                        PyErr_SetString(PyExc_TypeError,
8600                                        "%s argument has non-string str()");
8601                        goto onError;
8602                    }
8603                }
8604                pbuf = PyUnicode_AS_UNICODE(temp);
8605                len = PyUnicode_GET_SIZE(temp);
8606                if (prec >= 0 && len > prec)
8607                    len = prec;
8608                break;
8609
8610            case 'i':
8611            case 'd':
8612            case 'u':
8613            case 'o':
8614            case 'x':
8615            case 'X':
8616                if (c == 'i')
8617                    c = 'd';
8618                isnumok = 0;
8619                if (PyNumber_Check(v)) {
8620                    PyObject *iobj=NULL;
8621
8622                    if (PyInt_Check(v) || (PyLong_Check(v))) {
8623                        iobj = v;
8624                        Py_INCREF(iobj);
8625                    }
8626                    else {
8627                        iobj = PyNumber_Int(v);
8628                        if (iobj==NULL) iobj = PyNumber_Long(v);
8629                    }
8630                    if (iobj!=NULL) {
8631                        if (PyInt_Check(iobj)) {
8632                            isnumok = 1;
8633                            pbuf = formatbuf;
8634                            len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8635                                            flags, prec, c, iobj);
8636                            Py_DECREF(iobj);
8637                            if (len < 0)
8638                                goto onError;
8639                            sign = 1;
8640                        }
8641                        else if (PyLong_Check(iobj)) {
8642                            isnumok = 1;
8643                            temp = formatlong(iobj, flags, prec, c);
8644                            Py_DECREF(iobj);
8645                            if (!temp)
8646                                goto onError;
8647                            pbuf = PyUnicode_AS_UNICODE(temp);
8648                            len = PyUnicode_GET_SIZE(temp);
8649                            sign = 1;
8650                        }
8651                        else {
8652                            Py_DECREF(iobj);
8653                        }
8654                    }
8655                }
8656                if (!isnumok) {
8657                    PyErr_Format(PyExc_TypeError,
8658                                 "%%%c format: a number is required, "
8659                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8660                    goto onError;
8661                }
8662                if (flags & F_ZERO)
8663                    fill = '0';
8664                break;
8665
8666            case 'e':
8667            case 'E':
8668            case 'f':
8669            case 'F':
8670            case 'g':
8671            case 'G':
8672                temp = formatfloat(v, flags, prec, c);
8673                if (temp == NULL)
8674                    goto onError;
8675                pbuf = PyUnicode_AS_UNICODE(temp);
8676                len = PyUnicode_GET_SIZE(temp);
8677                sign = 1;
8678                if (flags & F_ZERO)
8679                    fill = '0';
8680                break;
8681
8682            case 'c':
8683                pbuf = formatbuf;
8684                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8685                if (len < 0)
8686                    goto onError;
8687                break;
8688
8689            default:
8690                PyErr_Format(PyExc_ValueError,
8691                             "unsupported format character '%c' (0x%x) "
8692                             "at index %zd",
8693                             (31<=c && c<=126) ? (char)c : '?',
8694                             (int)c,
8695                             (Py_ssize_t)(fmt - 1 -
8696                                          PyUnicode_AS_UNICODE(uformat)));
8697                goto onError;
8698            }
8699            if (sign) {
8700                if (*pbuf == '-' || *pbuf == '+') {
8701                    sign = *pbuf++;
8702                    len--;
8703                }
8704                else if (flags & F_SIGN)
8705                    sign = '+';
8706                else if (flags & F_BLANK)
8707                    sign = ' ';
8708                else
8709                    sign = 0;
8710            }
8711            if (width < len)
8712                width = len;
8713            if (rescnt - (sign != 0) < width) {
8714                reslen -= rescnt;
8715                rescnt = width + fmtcnt + 100;
8716                reslen += rescnt;
8717                if (reslen < 0) {
8718                    Py_XDECREF(temp);
8719                    PyErr_NoMemory();
8720                    goto onError;
8721                }
8722                if (_PyUnicode_Resize(&result, reslen) < 0) {
8723                    Py_XDECREF(temp);
8724                    goto onError;
8725                }
8726                res = PyUnicode_AS_UNICODE(result)
8727                    + reslen - rescnt;
8728            }
8729            if (sign) {
8730                if (fill != ' ')
8731                    *res++ = sign;
8732                rescnt--;
8733                if (width > len)
8734                    width--;
8735            }
8736            if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8737                assert(pbuf[0] == '0');
8738                assert(pbuf[1] == c);
8739                if (fill != ' ') {
8740                    *res++ = *pbuf++;
8741                    *res++ = *pbuf++;
8742                }
8743                rescnt -= 2;
8744                width -= 2;
8745                if (width < 0)
8746                    width = 0;
8747                len -= 2;
8748            }
8749            if (width > len && !(flags & F_LJUST)) {
8750                do {
8751                    --rescnt;
8752                    *res++ = fill;
8753                } while (--width > len);
8754            }
8755            if (fill == ' ') {
8756                if (sign)
8757                    *res++ = sign;
8758                if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8759                    assert(pbuf[0] == '0');
8760                    assert(pbuf[1] == c);
8761                    *res++ = *pbuf++;
8762                    *res++ = *pbuf++;
8763                }
8764            }
8765            Py_UNICODE_COPY(res, pbuf, len);
8766            res += len;
8767            rescnt -= len;
8768            while (--width >= len) {
8769                --rescnt;
8770                *res++ = ' ';
8771            }
8772            if (dict && (argidx < arglen) && c != '%') {
8773                PyErr_SetString(PyExc_TypeError,
8774                                "not all arguments converted during string formatting");
8775                Py_XDECREF(temp);
8776                goto onError;
8777            }
8778            Py_XDECREF(temp);
8779        } /* '%' */
8780    } /* until end */
8781    if (argidx < arglen && !dict) {
8782        PyErr_SetString(PyExc_TypeError,
8783                        "not all arguments converted during string formatting");
8784        goto onError;
8785    }
8786
8787    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8788        goto onError;
8789    if (args_owned) {
8790        Py_DECREF(args);
8791    }
8792    Py_DECREF(uformat);
8793    return (PyObject *)result;
8794
8795  onError:
8796    Py_XDECREF(result);
8797    Py_DECREF(uformat);
8798    if (args_owned) {
8799        Py_DECREF(args);
8800    }
8801    return NULL;
8802}
8803
/* Buffer-interface slot table for the unicode type; it is installed as
   tp_as_buffer in PyUnicode_Type.  The four entries are, in order, the
   read-buffer, write-buffer, segment-count and char-buffer callbacks. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,
    (writebufferproc) unicode_buffer_getwritebuf,
    (segcountproc) unicode_buffer_getsegcount,
    (charbufferproc) unicode_buffer_getcharbuf,
};
8810
8811static PyObject *
8812unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8813
8814static PyObject *
8815unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8816{
8817    PyObject *x = NULL;
8818    static char *kwlist[] = {"string", "encoding", "errors", 0};
8819    char *encoding = NULL;
8820    char *errors = NULL;
8821
8822    if (type != &PyUnicode_Type)
8823        return unicode_subtype_new(type, args, kwds);
8824    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8825                                     kwlist, &x, &encoding, &errors))
8826        return NULL;
8827    if (x == NULL)
8828        return (PyObject *)_PyUnicode_New(0);
8829    if (encoding == NULL && errors == NULL)
8830        return PyObject_Unicode(x);
8831    else
8832        return PyUnicode_FromEncodedObject(x, encoding, errors);
8833}
8834
8835static PyObject *
8836unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8837{
8838    PyUnicodeObject *tmp, *pnew;
8839    Py_ssize_t n;
8840
8841    assert(PyType_IsSubtype(type, &PyUnicode_Type));
8842    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8843    if (tmp == NULL)
8844        return NULL;
8845    assert(PyUnicode_Check(tmp));
8846    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8847    if (pnew == NULL) {
8848        Py_DECREF(tmp);
8849        return NULL;
8850    }
8851    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8852    if (pnew->str == NULL) {
8853        _Py_ForgetReference((PyObject *)pnew);
8854        PyObject_Del(pnew);
8855        Py_DECREF(tmp);
8856        return PyErr_NoMemory();
8857    }
8858    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8859    pnew->length = n;
8860    pnew->hash = tmp->hash;
8861    Py_DECREF(tmp);
8862    return (PyObject *)pnew;
8863}
8864
/* Docstring of the unicode type (used as tp_doc of PyUnicode_Type). */
PyDoc_STRVAR(unicode_doc,
             "unicode(object='') -> unicode object\n\
unicode(string[, encoding[, errors]]) -> unicode object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8872
/* Type object for "unicode".  It derives from PyBaseString_Type, may itself
   be subclassed (Py_TPFLAGS_BASETYPE), and creates instances through
   unicode_new().  Slots left at 0 are not set explicitly here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8916
8917/* Initialize the Unicode implementation */
8918
8919void _PyUnicode_Init(void)
8920{
8921    /* XXX - move this array to unicodectype.c ? */
8922    Py_UNICODE linebreak[] = {
8923        0x000A, /* LINE FEED */
8924        0x000D, /* CARRIAGE RETURN */
8925        0x001C, /* FILE SEPARATOR */
8926        0x001D, /* GROUP SEPARATOR */
8927        0x001E, /* RECORD SEPARATOR */
8928        0x0085, /* NEXT LINE */
8929        0x2028, /* LINE SEPARATOR */
8930        0x2029, /* PARAGRAPH SEPARATOR */
8931    };
8932
8933    /* Init the implementation */
8934    if (!unicode_empty) {
8935        unicode_empty = _PyUnicode_New(0);
8936        if (!unicode_empty)
8937            return;
8938    }
8939
8940    if (PyType_Ready(&PyUnicode_Type) < 0)
8941        Py_FatalError("Can't initialize 'unicode'");
8942
8943    /* initialize the linebreak bloom filter */
8944    bloom_linebreak = make_bloom_mask(
8945        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8946        );
8947
8948    PyType_Ready(&EncodingMapType);
8949
8950    if (PyType_Ready(&PyFieldNameIter_Type) < 0)
8951        Py_FatalError("Can't initialize field name iterator type");
8952
8953    if (PyType_Ready(&PyFormatterIter_Type) < 0)
8954        Py_FatalError("Can't initialize formatter iter type");
8955}
8956
8957/* Finalize the Unicode implementation */
8958
8959int
8960PyUnicode_ClearFreeList(void)
8961{
8962    int freelist_size = numfree;
8963    PyUnicodeObject *u;
8964
8965    for (u = free_list; u != NULL;) {
8966        PyUnicodeObject *v = u;
8967        u = *(PyUnicodeObject **)u;
8968        if (v->str)
8969            PyObject_DEL(v->str);
8970        Py_XDECREF(v->defenc);
8971        PyObject_Del(v);
8972        numfree--;
8973    }
8974    free_list = NULL;
8975    assert(numfree == 0);
8976    return freelist_size;
8977}
8978
8979void
8980_PyUnicode_Fini(void)
8981{
8982    int i;
8983
8984    Py_CLEAR(unicode_empty);
8985
8986    for (i = 0; i < 256; i++)
8987        Py_CLEAR(unicode_latin1[i]);
8988
8989    (void)PyUnicode_ClearFreeList();
8990}
8991
8992#ifdef __cplusplus
8993}
8994#endif
8995