1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44
45#include "unicodeobject.h"
46#include "ucnhash.h"
47
48#ifdef MS_WINDOWS
49#include <windows.h>
50#endif
51
/* Limit for the Unicode object free list: the deallocator stops adding
   objects to the free list once it holds this many entries. */

#define PyUnicode_MAXFREELIST       1024
55
/* Limit for the Unicode object free list stay alive optimization.

   When an object is put on the free list, its character buffer is kept
   allocated if the object's length (counted in Py_UNICODE units, not
   bytes) is less than this limit; longer buffers are released. This
   reduces malloc() overhead for small Unicode objects.

   At worst this will result in roughly PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT * sizeof(Py_UNICODE) +
   malloc()-overhead) bytes of unused garbage.

   Setting the limit to 0 effectively turns the feature off, since every
   buffer is then released.

   Note: This is an experimental feature! If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9
74
/* Endianness switches; defaults to little endian.

   Exactly one of BYTEORDER_IS_BIG_ENDIAN / BYTEORDER_IS_LITTLE_ENDIAN is
   defined, chosen by whether the build defines WORDS_BIGENDIAN. */

#ifdef WORDS_BIGENDIAN
# define BYTEORDER_IS_BIG_ENDIAN
#else
# define BYTEORDER_IS_LITTLE_ENDIAN
#endif
82
83/* --- Globals ------------------------------------------------------------
84
85   The globals are initialized by the _PyUnicode_Init() API and should
86   not be used before calling that API.
87
88*/
89
90
91#ifdef __cplusplus
92extern "C" {
93#endif
94
/* Free list for Unicode objects.

   free_list is the head of a singly linked list of recycled objects; the
   link to the next entry is stored in the first pointer-sized word of
   each freed object.  numfree counts the entries currently on the list. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.
   While this is still NULL the constructors allocate a fresh object
   for length 0 instead of sharing. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The table is indexed by code point; entries are
   filled in on first use by the constructors. */
static PyUnicodeObject *unicode_latin1[256];

/* Default encoding to use and assume when NULL is passed as encoding
   parameter; it is initialized by _PyUnicode_Init().

   Always use the PyUnicode_SetDefaultEncoding() and
   PyUnicode_GetDefaultEncoding() APIs to access this global.

*/
static char unicode_default_encoding[100];
114
/* Fast detection of the most frequent whitespace characters.

   Lookup table over the ASCII range: entry i is 1 when code point i is
   treated as whitespace and 0 otherwise.  One row per 16 code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
145
/* Fast detection of line break characters in the ASCII range.

   Entry i is 1 when code point i is treated as a line break and 0
   otherwise.  The table is only ever read, so it is declared const
   (like _Py_ascii_whitespace) to keep it out of writable data. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
173
174
175Py_UNICODE
176PyUnicode_GetMax(void)
177{
178#ifdef Py_UNICODE_WIDE
179    return 0x10FFFF;
180#else
181    /* This is actually an illegal character, so it should
182       not be passed to unichr. */
183    return 0xFFFF;
184#endif
185}
186
187/* --- Bloom Filters ----------------------------------------------------- */
188
189/* stuff to implement simple "bloom filters" for Unicode characters.
190   to keep things simple, we use a single bitmask, using the least 5
191   bits from each unicode characters as the bit index. */
192
193/* the linebreak mask is set up by Unicode_Init below */
194
195#if LONG_BIT >= 128
196#define BLOOM_WIDTH 128
197#elif LONG_BIT >= 64
198#define BLOOM_WIDTH 64
199#elif LONG_BIT >= 32
200#define BLOOM_WIDTH 32
201#else
202#error "LONG_BIT is smaller than 32"
203#endif
204
205#define BLOOM_MASK unsigned long
206
207static BLOOM_MASK bloom_linebreak;
208
209#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
210#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
211
212#define BLOOM_LINEBREAK(ch)                                             \
213    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
214     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
215
216Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
217{
218    /* calculate simple bloom-style bitmask for a given unicode string */
219
220    BLOOM_MASK mask;
221    Py_ssize_t i;
222
223    mask = 0;
224    for (i = 0; i < len; i++)
225        BLOOM_ADD(mask, ptr[i]);
226
227    return mask;
228}
229
230Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
231{
232    Py_ssize_t i;
233
234    for (i = 0; i < setlen; i++)
235        if (set[i] == chr)
236            return 1;
237
238    return 0;
239}
240
/* Non-zero when chr is a member of set: the bloom mask rejects most
   non-members cheaply and unicode_member() confirms the remaining hits.
   The expansion is fully parenthesized so the macro composes correctly
   with surrounding operators (e.g. `!BLOOM_MEMBER(...)`).
   Note that chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
243
244/* --- Unicode Object ----------------------------------------------------- */
245
246static
247int unicode_resize(register PyUnicodeObject *unicode,
248                   Py_ssize_t length)
249{
250    void *oldstr;
251
252    /* Shortcut if there's nothing much to do. */
253    if (unicode->length == length)
254        goto reset;
255
256    /* Resizing shared object (unicode_empty or single character
257       objects) in-place is not allowed. Use PyUnicode_Resize()
258       instead ! */
259
260    if (unicode == unicode_empty ||
261        (unicode->length == 1 &&
262         unicode->str[0] < 256U &&
263         unicode_latin1[unicode->str[0]] == unicode)) {
264        PyErr_SetString(PyExc_SystemError,
265                        "can't resize shared unicode objects");
266        return -1;
267    }
268
269    /* We allocate one more byte to make sure the string is Ux0000 terminated.
270       The overallocation is also used by fastsearch, which assumes that it's
271       safe to look at str[length] (without making any assumptions about what
272       it contains). */
273
274    oldstr = unicode->str;
275    unicode->str = PyObject_REALLOC(unicode->str,
276                                    sizeof(Py_UNICODE) * (length + 1));
277    if (!unicode->str) {
278        unicode->str = (Py_UNICODE *)oldstr;
279        PyErr_NoMemory();
280        return -1;
281    }
282    unicode->str[length] = 0;
283    unicode->length = length;
284
285  reset:
286    /* Reset the object caches */
287    if (unicode->defenc) {
288        Py_CLEAR(unicode->defenc);
289    }
290    unicode->hash = -1;
291
292    return 0;
293}
294
295/* We allocate one more byte to make sure the string is
296   Ux0000 terminated; some code relies on that.
297
298   XXX This allocator could further be enhanced by assuring that the
299   free list never reduces its size below 1.
300
301*/
302
303static
304PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
305{
306    register PyUnicodeObject *unicode;
307
308    /* Optimization for empty strings */
309    if (length == 0 && unicode_empty != NULL) {
310        Py_INCREF(unicode_empty);
311        return unicode_empty;
312    }
313
314    /* Ensure we won't overflow the size. */
315    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
316        return (PyUnicodeObject *)PyErr_NoMemory();
317    }
318
319    /* Unicode freelist & memory allocation */
320    if (free_list) {
321        unicode = free_list;
322        free_list = *(PyUnicodeObject **)unicode;
323        numfree--;
324        if (unicode->str) {
325            /* Keep-Alive optimization: we only upsize the buffer,
326               never downsize it. */
327            if ((unicode->length < length) &&
328                unicode_resize(unicode, length) < 0) {
329                PyObject_DEL(unicode->str);
330                unicode->str = NULL;
331            }
332        }
333        else {
334            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
335            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
336        }
337        PyObject_INIT(unicode, &PyUnicode_Type);
338    }
339    else {
340        size_t new_size;
341        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
342        if (unicode == NULL)
343            return NULL;
344        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
345        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
346    }
347
348    if (!unicode->str) {
349        PyErr_NoMemory();
350        goto onError;
351    }
352    /* Initialize the first element to guard against cases where
353     * the caller fails before initializing str -- unicode_resize()
354     * reads str[0], and the Keep-Alive optimization can keep memory
355     * allocated for str alive across a call to unicode_dealloc(unicode).
356     * We don't want unicode_resize to read uninitialized memory in
357     * that case.
358     */
359    unicode->str[0] = 0;
360    unicode->str[length] = 0;
361    unicode->length = length;
362    unicode->hash = -1;
363    unicode->defenc = NULL;
364    return unicode;
365
366  onError:
367    /* XXX UNREF/NEWREF interface should be more symmetrical */
368    _Py_DEC_REFTOTAL;
369    _Py_ForgetReference((PyObject *)unicode);
370    PyObject_Del(unicode);
371    return NULL;
372}
373
374static
375void unicode_dealloc(register PyUnicodeObject *unicode)
376{
377    if (PyUnicode_CheckExact(unicode) &&
378        numfree < PyUnicode_MAXFREELIST) {
379        /* Keep-Alive optimization */
380        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
381            PyObject_DEL(unicode->str);
382            unicode->str = NULL;
383            unicode->length = 0;
384        }
385        if (unicode->defenc) {
386            Py_CLEAR(unicode->defenc);
387        }
388        /* Add to free list */
389        *(PyUnicodeObject **)unicode = free_list;
390        free_list = unicode;
391        numfree++;
392    }
393    else {
394        PyObject_DEL(unicode->str);
395        Py_XDECREF(unicode->defenc);
396        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
397    }
398}
399
400static
401int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
402{
403    register PyUnicodeObject *v;
404
405    /* Argument checks */
406    if (unicode == NULL) {
407        PyErr_BadInternalCall();
408        return -1;
409    }
410    v = *unicode;
411    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
412        PyErr_BadInternalCall();
413        return -1;
414    }
415
416    /* Resizing unicode_empty and single character objects is not
417       possible since these are being shared. We simply return a fresh
418       copy with the same Unicode content. */
419    if (v->length != length &&
420        (v == unicode_empty || v->length == 1)) {
421        PyUnicodeObject *w = _PyUnicode_New(length);
422        if (w == NULL)
423            return -1;
424        Py_UNICODE_COPY(w->str, v->str,
425                        length < v->length ? length : v->length);
426        Py_DECREF(*unicode);
427        *unicode = w;
428        return 0;
429    }
430
431    /* Note that we don't have to modify *unicode for unshared Unicode
432       objects, since we can modify them in-place. */
433    return unicode_resize(v, length);
434}
435
436int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
437{
438    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
439}
440
441PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
442                                Py_ssize_t size)
443{
444    PyUnicodeObject *unicode;
445
446    /* If the Unicode data is known at construction time, we can apply
447       some optimizations which share commonly used objects. */
448    if (u != NULL) {
449
450        /* Optimization for empty strings */
451        if (size == 0 && unicode_empty != NULL) {
452            Py_INCREF(unicode_empty);
453            return (PyObject *)unicode_empty;
454        }
455
456        /* Single character Unicode objects in the Latin-1 range are
457           shared when using this constructor */
458        if (size == 1 && *u < 256) {
459            unicode = unicode_latin1[*u];
460            if (!unicode) {
461                unicode = _PyUnicode_New(1);
462                if (!unicode)
463                    return NULL;
464                unicode->str[0] = *u;
465                unicode_latin1[*u] = unicode;
466            }
467            Py_INCREF(unicode);
468            return (PyObject *)unicode;
469        }
470    }
471
472    unicode = _PyUnicode_New(size);
473    if (!unicode)
474        return NULL;
475
476    /* Copy the Unicode data into the new object */
477    if (u != NULL)
478        Py_UNICODE_COPY(unicode->str, u, size);
479
480    return (PyObject *)unicode;
481}
482
483PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
484{
485    PyUnicodeObject *unicode;
486
487    if (size < 0) {
488        PyErr_SetString(PyExc_SystemError,
489                        "Negative size passed to PyUnicode_FromStringAndSize");
490        return NULL;
491    }
492
493    /* If the Unicode data is known at construction time, we can apply
494       some optimizations which share commonly used objects.
495       Also, this means the input must be UTF-8, so fall back to the
496       UTF-8 decoder at the end. */
497    if (u != NULL) {
498
499        /* Optimization for empty strings */
500        if (size == 0 && unicode_empty != NULL) {
501            Py_INCREF(unicode_empty);
502            return (PyObject *)unicode_empty;
503        }
504
505        /* Single characters are shared when using this constructor.
506           Restrict to ASCII, since the input must be UTF-8. */
507        if (size == 1 && Py_CHARMASK(*u) < 128) {
508            unicode = unicode_latin1[Py_CHARMASK(*u)];
509            if (!unicode) {
510                unicode = _PyUnicode_New(1);
511                if (!unicode)
512                    return NULL;
513                unicode->str[0] = Py_CHARMASK(*u);
514                unicode_latin1[Py_CHARMASK(*u)] = unicode;
515            }
516            Py_INCREF(unicode);
517            return (PyObject *)unicode;
518        }
519
520        return PyUnicode_DecodeUTF8(u, size, NULL);
521    }
522
523    unicode = _PyUnicode_New(size);
524    if (!unicode)
525        return NULL;
526
527    return (PyObject *)unicode;
528}
529
530PyObject *PyUnicode_FromString(const char *u)
531{
532    size_t size = strlen(u);
533    if (size > PY_SSIZE_T_MAX) {
534        PyErr_SetString(PyExc_OverflowError, "input too long");
535        return NULL;
536    }
537
538    return PyUnicode_FromStringAndSize(u, size);
539}
540
541#ifdef HAVE_WCHAR_H
542
543#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
544# define CONVERT_WCHAR_TO_SURROGATES
545#endif
546
547#ifdef CONVERT_WCHAR_TO_SURROGATES
548
549/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
550   to convert from UTF32 to UTF16. */
551
552PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
553                                 Py_ssize_t size)
554{
555    PyUnicodeObject *unicode;
556    register Py_ssize_t i;
557    Py_ssize_t alloc;
558    const wchar_t *orig_w;
559
560    if (w == NULL) {
561        PyErr_BadInternalCall();
562        return NULL;
563    }
564
565    alloc = size;
566    orig_w = w;
567    for (i = size; i > 0; i--) {
568        if (*w > 0xFFFF)
569            alloc++;
570        w++;
571    }
572    w = orig_w;
573    unicode = _PyUnicode_New(alloc);
574    if (!unicode)
575        return NULL;
576
577    /* Copy the wchar_t data into the new object */
578    {
579        register Py_UNICODE *u;
580        u = PyUnicode_AS_UNICODE(unicode);
581        for (i = size; i > 0; i--) {
582            if (*w > 0xFFFF) {
583                wchar_t ordinal = *w++;
584                ordinal -= 0x10000;
585                *u++ = 0xD800 | (ordinal >> 10);
586                *u++ = 0xDC00 | (ordinal & 0x3FF);
587            }
588            else
589                *u++ = *w++;
590        }
591    }
592    return (PyObject *)unicode;
593}
594
595#else
596
597PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
598                                 Py_ssize_t size)
599{
600    PyUnicodeObject *unicode;
601
602    if (w == NULL) {
603        PyErr_BadInternalCall();
604        return NULL;
605    }
606
607    unicode = _PyUnicode_New(size);
608    if (!unicode)
609        return NULL;
610
611    /* Copy the wchar_t data into the new object */
612#ifdef HAVE_USABLE_WCHAR_T
613    memcpy(unicode->str, w, size * sizeof(wchar_t));
614#else
615    {
616        register Py_UNICODE *u;
617        register Py_ssize_t i;
618        u = PyUnicode_AS_UNICODE(unicode);
619        for (i = size; i > 0; i--)
620            *u++ = *w++;
621    }
622#endif
623
624    return (PyObject *)unicode;
625}
626
627#endif /* CONVERT_WCHAR_TO_SURROGATES */
628
629#undef CONVERT_WCHAR_TO_SURROGATES
630
631static void
632makefmt(char *fmt, int longflag, int size_tflag, int zeropad, int width, int precision, char c)
633{
634    *fmt++ = '%';
635    if (width) {
636        if (zeropad)
637            *fmt++ = '0';
638        fmt += sprintf(fmt, "%d", width);
639    }
640    if (precision)
641        fmt += sprintf(fmt, ".%d", precision);
642    if (longflag)
643        *fmt++ = 'l';
644    else if (size_tflag) {
645        char *f = PY_FORMAT_SIZE_T;
646        while (*f)
647            *fmt++ = *f++;
648    }
649    *fmt++ = c;
650    *fmt = '\0';
651}
652
653#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
654
655PyObject *
656PyUnicode_FromFormatV(const char *format, va_list vargs)
657{
658    va_list count;
659    Py_ssize_t callcount = 0;
660    PyObject **callresults = NULL;
661    PyObject **callresult = NULL;
662    Py_ssize_t n = 0;
663    int width = 0;
664    int precision = 0;
665    int zeropad;
666    const char* f;
667    Py_UNICODE *s;
668    PyObject *string;
669    /* used by sprintf */
670    char buffer[21];
671    /* use abuffer instead of buffer, if we need more space
672     * (which can happen if there's a format specifier with width). */
673    char *abuffer = NULL;
674    char *realbuffer;
675    Py_ssize_t abuffersize = 0;
676    char fmt[60]; /* should be enough for %0width.precisionld */
677    const char *copy;
678
679#ifdef VA_LIST_IS_ARRAY
680    Py_MEMCPY(count, vargs, sizeof(va_list));
681#else
682#ifdef  __va_copy
683    __va_copy(count, vargs);
684#else
685    count = vargs;
686#endif
687#endif
688     /* step 1: count the number of %S/%R/%s format specifications
689      * (we call PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() for these
690      * objects once during step 3 and put the result in an array) */
691    for (f = format; *f; f++) {
692         if (*f == '%') {
693             if (*(f+1)=='%')
694                 continue;
695             if (*(f+1)=='S' || *(f+1)=='R')
696                 ++callcount;
697             while (isdigit((unsigned)*f))
698                 width = (width*10) + *f++ - '0';
699             while (*++f && *f != '%' && !isalpha((unsigned)*f))
700                 ;
701             if (*f == 's')
702                 ++callcount;
703         }
704    }
705    /* step 2: allocate memory for the results of
706     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
707    if (callcount) {
708        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
709        if (!callresults) {
710            PyErr_NoMemory();
711            return NULL;
712        }
713        callresult = callresults;
714    }
715    /* step 3: figure out how large a buffer we need */
716    for (f = format; *f; f++) {
717        if (*f == '%') {
718            const char* p = f;
719            width = 0;
720            while (isdigit((unsigned)*f))
721                width = (width*10) + *f++ - '0';
722            while (*++f && *f != '%' && !isalpha((unsigned)*f))
723                ;
724
725            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
726             * they don't affect the amount of space we reserve.
727             */
728            if ((*f == 'l' || *f == 'z') &&
729                (f[1] == 'd' || f[1] == 'u'))
730                ++f;
731
732            switch (*f) {
733            case 'c':
734                (void)va_arg(count, int);
735                /* fall through... */
736            case '%':
737                n++;
738                break;
739            case 'd': case 'u': case 'i': case 'x':
740                (void) va_arg(count, int);
741                /* 20 bytes is enough to hold a 64-bit
742                   integer.  Decimal takes the most space.
743                   This isn't enough for octal.
744                   If a width is specified we need more
745                   (which we allocate later). */
746                if (width < 20)
747                    width = 20;
748                n += width;
749                if (abuffersize < width)
750                    abuffersize = width;
751                break;
752            case 's':
753            {
754                /* UTF-8 */
755                const char *s = va_arg(count, const char*);
756                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
757                if (!str)
758                    goto fail;
759                n += PyUnicode_GET_SIZE(str);
760                /* Remember the str and switch to the next slot */
761                *callresult++ = str;
762                break;
763            }
764            case 'U':
765            {
766                PyObject *obj = va_arg(count, PyObject *);
767                assert(obj && PyUnicode_Check(obj));
768                n += PyUnicode_GET_SIZE(obj);
769                break;
770            }
771            case 'V':
772            {
773                PyObject *obj = va_arg(count, PyObject *);
774                const char *str = va_arg(count, const char *);
775                assert(obj || str);
776                assert(!obj || PyUnicode_Check(obj));
777                if (obj)
778                    n += PyUnicode_GET_SIZE(obj);
779                else
780                    n += strlen(str);
781                break;
782            }
783            case 'S':
784            {
785                PyObject *obj = va_arg(count, PyObject *);
786                PyObject *str;
787                assert(obj);
788                str = PyObject_Str(obj);
789                if (!str)
790                    goto fail;
791                n += PyUnicode_GET_SIZE(str);
792                /* Remember the str and switch to the next slot */
793                *callresult++ = str;
794                break;
795            }
796            case 'R':
797            {
798                PyObject *obj = va_arg(count, PyObject *);
799                PyObject *repr;
800                assert(obj);
801                repr = PyObject_Repr(obj);
802                if (!repr)
803                    goto fail;
804                n += PyUnicode_GET_SIZE(repr);
805                /* Remember the repr and switch to the next slot */
806                *callresult++ = repr;
807                break;
808            }
809            case 'p':
810                (void) va_arg(count, int);
811                /* maximum 64-bit pointer representation:
812                 * 0xffffffffffffffff
813                 * so 19 characters is enough.
814                 * XXX I count 18 -- what's the extra for?
815                 */
816                n += 19;
817                break;
818            default:
819                /* if we stumble upon an unknown
820                   formatting code, copy the rest of
821                   the format string to the output
822                   string. (we cannot just skip the
823                   code, since there's no way to know
824                   what's in the argument list) */
825                n += strlen(p);
826                goto expand;
827            }
828        } else
829            n++;
830    }
831  expand:
832    if (abuffersize > 20) {
833        abuffer = PyObject_Malloc(abuffersize);
834        if (!abuffer) {
835            PyErr_NoMemory();
836            goto fail;
837        }
838        realbuffer = abuffer;
839    }
840    else
841        realbuffer = buffer;
842    /* step 4: fill the buffer */
843    /* Since we've analyzed how much space we need for the worst case,
844       we don't have to resize the string.
845       There can be no errors beyond this point. */
846    string = PyUnicode_FromUnicode(NULL, n);
847    if (!string)
848        goto fail;
849
850    s = PyUnicode_AS_UNICODE(string);
851    callresult = callresults;
852
853    for (f = format; *f; f++) {
854        if (*f == '%') {
855            const char* p = f++;
856            int longflag = 0;
857            int size_tflag = 0;
858            zeropad = (*f == '0');
859            /* parse the width.precision part */
860            width = 0;
861            while (isdigit((unsigned)*f))
862                width = (width*10) + *f++ - '0';
863            precision = 0;
864            if (*f == '.') {
865                f++;
866                while (isdigit((unsigned)*f))
867                    precision = (precision*10) + *f++ - '0';
868            }
869            /* handle the long flag, but only for %ld and %lu.
870               others can be added when necessary. */
871            if (*f == 'l' && (f[1] == 'd' || f[1] == 'u')) {
872                longflag = 1;
873                ++f;
874            }
875            /* handle the size_t flag. */
876            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
877                size_tflag = 1;
878                ++f;
879            }
880
881            switch (*f) {
882            case 'c':
883                *s++ = va_arg(vargs, int);
884                break;
885            case 'd':
886                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'd');
887                if (longflag)
888                    sprintf(realbuffer, fmt, va_arg(vargs, long));
889                else if (size_tflag)
890                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
891                else
892                    sprintf(realbuffer, fmt, va_arg(vargs, int));
893                appendstring(realbuffer);
894                break;
895            case 'u':
896                makefmt(fmt, longflag, size_tflag, zeropad, width, precision, 'u');
897                if (longflag)
898                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
899                else if (size_tflag)
900                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
901                else
902                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
903                appendstring(realbuffer);
904                break;
905            case 'i':
906                makefmt(fmt, 0, 0, zeropad, width, precision, 'i');
907                sprintf(realbuffer, fmt, va_arg(vargs, int));
908                appendstring(realbuffer);
909                break;
910            case 'x':
911                makefmt(fmt, 0, 0, zeropad, width, precision, 'x');
912                sprintf(realbuffer, fmt, va_arg(vargs, int));
913                appendstring(realbuffer);
914                break;
915            case 's':
916            {
917                /* unused, since we already have the result */
918                (void) va_arg(vargs, char *);
919                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
920                                PyUnicode_GET_SIZE(*callresult));
921                s += PyUnicode_GET_SIZE(*callresult);
922                /* We're done with the unicode()/repr() => forget it */
923                Py_DECREF(*callresult);
924                /* switch to next unicode()/repr() result */
925                ++callresult;
926                break;
927            }
928            case 'U':
929            {
930                PyObject *obj = va_arg(vargs, PyObject *);
931                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
932                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
933                s += size;
934                break;
935            }
936            case 'V':
937            {
938                PyObject *obj = va_arg(vargs, PyObject *);
939                const char *str = va_arg(vargs, const char *);
940                if (obj) {
941                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
942                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
943                    s += size;
944                } else {
945                    appendstring(str);
946                }
947                break;
948            }
949            case 'S':
950            case 'R':
951            {
952                Py_UNICODE *ucopy;
953                Py_ssize_t usize;
954                Py_ssize_t upos;
955                /* unused, since we already have the result */
956                (void) va_arg(vargs, PyObject *);
957                ucopy = PyUnicode_AS_UNICODE(*callresult);
958                usize = PyUnicode_GET_SIZE(*callresult);
959                for (upos = 0; upos<usize;)
960                    *s++ = ucopy[upos++];
961                /* We're done with the unicode()/repr() => forget it */
962                Py_DECREF(*callresult);
963                /* switch to next unicode()/repr() result */
964                ++callresult;
965                break;
966            }
967            case 'p':
968                sprintf(buffer, "%p", va_arg(vargs, void*));
969                /* %p is ill-defined:  ensure leading 0x. */
970                if (buffer[1] == 'X')
971                    buffer[1] = 'x';
972                else if (buffer[1] != 'x') {
973                    memmove(buffer+2, buffer, strlen(buffer)+1);
974                    buffer[0] = '0';
975                    buffer[1] = 'x';
976                }
977                appendstring(buffer);
978                break;
979            case '%':
980                *s++ = '%';
981                break;
982            default:
983                appendstring(p);
984                goto end;
985            }
986        } else
987            *s++ = *f;
988    }
989
990  end:
991    if (callresults)
992        PyObject_Free(callresults);
993    if (abuffer)
994        PyObject_Free(abuffer);
995    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
996    return string;
997  fail:
998    if (callresults) {
999        PyObject **callresult2 = callresults;
1000        while (callresult2 < callresult) {
1001            Py_DECREF(*callresult2);
1002            ++callresult2;
1003        }
1004        PyObject_Free(callresults);
1005    }
1006    if (abuffer)
1007        PyObject_Free(abuffer);
1008    return NULL;
1009}
1010
1011#undef appendstring
1012
1013PyObject *
1014PyUnicode_FromFormat(const char *format, ...)
1015{
1016    PyObject* ret;
1017    va_list vargs;
1018
1019#ifdef HAVE_STDARG_PROTOTYPES
1020    va_start(vargs, format);
1021#else
1022    va_start(vargs);
1023#endif
1024    ret = PyUnicode_FromFormatV(format, vargs);
1025    va_end(vargs);
1026    return ret;
1027}
1028
1029Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1030                                wchar_t *w,
1031                                Py_ssize_t size)
1032{
1033    if (unicode == NULL) {
1034        PyErr_BadInternalCall();
1035        return -1;
1036    }
1037
1038    /* If possible, try to copy the 0-termination as well */
1039    if (size > PyUnicode_GET_SIZE(unicode))
1040        size = PyUnicode_GET_SIZE(unicode) + 1;
1041
1042#ifdef HAVE_USABLE_WCHAR_T
1043    memcpy(w, unicode->str, size * sizeof(wchar_t));
1044#else
1045    {
1046        register Py_UNICODE *u;
1047        register Py_ssize_t i;
1048        u = PyUnicode_AS_UNICODE(unicode);
1049        for (i = size; i > 0; i--)
1050            *w++ = *u++;
1051    }
1052#endif
1053
1054    if (size > PyUnicode_GET_SIZE(unicode))
1055        return PyUnicode_GET_SIZE(unicode);
1056    else
1057        return size;
1058}
1059
1060#endif
1061
1062PyObject *PyUnicode_FromOrdinal(int ordinal)
1063{
1064    Py_UNICODE s[1];
1065
1066#ifdef Py_UNICODE_WIDE
1067    if (ordinal < 0 || ordinal > 0x10ffff) {
1068        PyErr_SetString(PyExc_ValueError,
1069                        "unichr() arg not in range(0x110000) "
1070                        "(wide Python build)");
1071        return NULL;
1072    }
1073#else
1074    if (ordinal < 0 || ordinal > 0xffff) {
1075        PyErr_SetString(PyExc_ValueError,
1076                        "unichr() arg not in range(0x10000) "
1077                        "(narrow Python build)");
1078        return NULL;
1079    }
1080#endif
1081
1082    s[0] = (Py_UNICODE)ordinal;
1083    return PyUnicode_FromUnicode(s, 1);
1084}
1085
1086PyObject *PyUnicode_FromObject(register PyObject *obj)
1087{
1088    /* XXX Perhaps we should make this API an alias of
1089       PyObject_Unicode() instead ?! */
1090    if (PyUnicode_CheckExact(obj)) {
1091        Py_INCREF(obj);
1092        return obj;
1093    }
1094    if (PyUnicode_Check(obj)) {
1095        /* For a Unicode subtype that's not a Unicode object,
1096           return a true Unicode object with the same data. */
1097        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1098                                     PyUnicode_GET_SIZE(obj));
1099    }
1100    return PyUnicode_FromEncodedObject(obj, NULL, "strict");
1101}
1102
1103PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1104                                      const char *encoding,
1105                                      const char *errors)
1106{
1107    const char *s = NULL;
1108    Py_ssize_t len;
1109    PyObject *v;
1110
1111    if (obj == NULL) {
1112        PyErr_BadInternalCall();
1113        return NULL;
1114    }
1115
1116#if 0
1117    /* For b/w compatibility we also accept Unicode objects provided
1118       that no encodings is given and then redirect to
1119       PyObject_Unicode() which then applies the additional logic for
1120       Unicode subclasses.
1121
1122       NOTE: This API should really only be used for object which
1123       represent *encoded* Unicode !
1124
1125    */
1126    if (PyUnicode_Check(obj)) {
1127        if (encoding) {
1128            PyErr_SetString(PyExc_TypeError,
1129                            "decoding Unicode is not supported");
1130            return NULL;
1131        }
1132        return PyObject_Unicode(obj);
1133    }
1134#else
1135    if (PyUnicode_Check(obj)) {
1136        PyErr_SetString(PyExc_TypeError,
1137                        "decoding Unicode is not supported");
1138        return NULL;
1139    }
1140#endif
1141
1142    /* Coerce object */
1143    if (PyString_Check(obj)) {
1144        s = PyString_AS_STRING(obj);
1145        len = PyString_GET_SIZE(obj);
1146    }
1147    else if (PyByteArray_Check(obj)) {
1148        /* Python 2.x specific */
1149        PyErr_Format(PyExc_TypeError,
1150                     "decoding bytearray is not supported");
1151        return NULL;
1152    }
1153    else if (PyObject_AsCharBuffer(obj, &s, &len)) {
1154        /* Overwrite the error message with something more useful in
1155           case of a TypeError. */
1156        if (PyErr_ExceptionMatches(PyExc_TypeError))
1157            PyErr_Format(PyExc_TypeError,
1158                         "coercing to Unicode: need string or buffer, "
1159                         "%.80s found",
1160                         Py_TYPE(obj)->tp_name);
1161        goto onError;
1162    }
1163
1164    /* Convert to Unicode */
1165    if (len == 0) {
1166        Py_INCREF(unicode_empty);
1167        v = (PyObject *)unicode_empty;
1168    }
1169    else
1170        v = PyUnicode_Decode(s, len, encoding, errors);
1171
1172    return v;
1173
1174  onError:
1175    return NULL;
1176}
1177
1178PyObject *PyUnicode_Decode(const char *s,
1179                           Py_ssize_t size,
1180                           const char *encoding,
1181                           const char *errors)
1182{
1183    PyObject *buffer = NULL, *unicode;
1184
1185    if (encoding == NULL)
1186        encoding = PyUnicode_GetDefaultEncoding();
1187
1188    /* Shortcuts for common default encodings */
1189    if (strcmp(encoding, "utf-8") == 0)
1190        return PyUnicode_DecodeUTF8(s, size, errors);
1191    else if (strcmp(encoding, "latin-1") == 0)
1192        return PyUnicode_DecodeLatin1(s, size, errors);
1193#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1194    else if (strcmp(encoding, "mbcs") == 0)
1195        return PyUnicode_DecodeMBCS(s, size, errors);
1196#endif
1197    else if (strcmp(encoding, "ascii") == 0)
1198        return PyUnicode_DecodeASCII(s, size, errors);
1199
1200    /* Decode via the codec registry */
1201    buffer = PyBuffer_FromMemory((void *)s, size);
1202    if (buffer == NULL)
1203        goto onError;
1204    unicode = PyCodec_Decode(buffer, encoding, errors);
1205    if (unicode == NULL)
1206        goto onError;
1207    if (!PyUnicode_Check(unicode)) {
1208        PyErr_Format(PyExc_TypeError,
1209                     "decoder did not return an unicode object (type=%.400s)",
1210                     Py_TYPE(unicode)->tp_name);
1211        Py_DECREF(unicode);
1212        goto onError;
1213    }
1214    Py_DECREF(buffer);
1215    return unicode;
1216
1217  onError:
1218    Py_XDECREF(buffer);
1219    return NULL;
1220}
1221
1222PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1223                                    const char *encoding,
1224                                    const char *errors)
1225{
1226    PyObject *v;
1227
1228    if (!PyUnicode_Check(unicode)) {
1229        PyErr_BadArgument();
1230        goto onError;
1231    }
1232
1233    if (encoding == NULL)
1234        encoding = PyUnicode_GetDefaultEncoding();
1235
1236    /* Decode via the codec registry */
1237    v = PyCodec_Decode(unicode, encoding, errors);
1238    if (v == NULL)
1239        goto onError;
1240    return v;
1241
1242  onError:
1243    return NULL;
1244}
1245
1246PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1247                           Py_ssize_t size,
1248                           const char *encoding,
1249                           const char *errors)
1250{
1251    PyObject *v, *unicode;
1252
1253    unicode = PyUnicode_FromUnicode(s, size);
1254    if (unicode == NULL)
1255        return NULL;
1256    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1257    Py_DECREF(unicode);
1258    return v;
1259}
1260
1261PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1262                                    const char *encoding,
1263                                    const char *errors)
1264{
1265    PyObject *v;
1266
1267    if (!PyUnicode_Check(unicode)) {
1268        PyErr_BadArgument();
1269        goto onError;
1270    }
1271
1272    if (encoding == NULL)
1273        encoding = PyUnicode_GetDefaultEncoding();
1274
1275    /* Encode via the codec registry */
1276    v = PyCodec_Encode(unicode, encoding, errors);
1277    if (v == NULL)
1278        goto onError;
1279    return v;
1280
1281  onError:
1282    return NULL;
1283}
1284
1285PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1286                                    const char *encoding,
1287                                    const char *errors)
1288{
1289    PyObject *v;
1290
1291    if (!PyUnicode_Check(unicode)) {
1292        PyErr_BadArgument();
1293        goto onError;
1294    }
1295
1296    if (encoding == NULL)
1297        encoding = PyUnicode_GetDefaultEncoding();
1298
1299    /* Shortcuts for common default encodings */
1300    if (errors == NULL) {
1301        if (strcmp(encoding, "utf-8") == 0)
1302            return PyUnicode_AsUTF8String(unicode);
1303        else if (strcmp(encoding, "latin-1") == 0)
1304            return PyUnicode_AsLatin1String(unicode);
1305#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1306        else if (strcmp(encoding, "mbcs") == 0)
1307            return PyUnicode_AsMBCSString(unicode);
1308#endif
1309        else if (strcmp(encoding, "ascii") == 0)
1310            return PyUnicode_AsASCIIString(unicode);
1311    }
1312
1313    /* Encode via the codec registry */
1314    v = PyCodec_Encode(unicode, encoding, errors);
1315    if (v == NULL)
1316        goto onError;
1317    if (!PyString_Check(v)) {
1318        PyErr_Format(PyExc_TypeError,
1319                     "encoder did not return a string object (type=%.400s)",
1320                     Py_TYPE(v)->tp_name);
1321        Py_DECREF(v);
1322        goto onError;
1323    }
1324    return v;
1325
1326  onError:
1327    return NULL;
1328}
1329
1330PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1331                                            const char *errors)
1332{
1333    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1334
1335    if (v)
1336        return v;
1337    v = PyUnicode_AsEncodedString(unicode, NULL, errors);
1338    if (v && errors == NULL)
1339        ((PyUnicodeObject *)unicode)->defenc = v;
1340    return v;
1341}
1342
1343Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1344{
1345    if (!PyUnicode_Check(unicode)) {
1346        PyErr_BadArgument();
1347        goto onError;
1348    }
1349    return PyUnicode_AS_UNICODE(unicode);
1350
1351  onError:
1352    return NULL;
1353}
1354
1355Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1356{
1357    if (!PyUnicode_Check(unicode)) {
1358        PyErr_BadArgument();
1359        goto onError;
1360    }
1361    return PyUnicode_GET_SIZE(unicode);
1362
1363  onError:
1364    return -1;
1365}
1366
1367const char *PyUnicode_GetDefaultEncoding(void)
1368{
1369    return unicode_default_encoding;
1370}
1371
1372int PyUnicode_SetDefaultEncoding(const char *encoding)
1373{
1374    PyObject *v;
1375
1376    /* Make sure the encoding is valid. As side effect, this also
1377       loads the encoding into the codec registry cache. */
1378    v = _PyCodec_Lookup(encoding);
1379    if (v == NULL)
1380        goto onError;
1381    Py_DECREF(v);
1382    strncpy(unicode_default_encoding,
1383            encoding,
1384            sizeof(unicode_default_encoding));
1385    return 0;
1386
1387  onError:
1388    return -1;
1389}
1390
1391/* error handling callback helper:
1392   build arguments, call the callback and check the arguments,
1393   if no exception occurred, copy the replacement to the output
1394   and adjust various state variables.
1395   return 0 on success, -1 on error
1396*/
1397
1398static
1399int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1400                                     const char *encoding, const char *reason,
1401                                     const char *input, Py_ssize_t insize, Py_ssize_t *startinpos,
1402                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1403                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1404{
1405    static char *argparse = "O!n;decoding error handler must return (unicode, int) tuple";
1406
1407    PyObject *restuple = NULL;
1408    PyObject *repunicode = NULL;
1409    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1410    Py_ssize_t requiredsize;
1411    Py_ssize_t newpos;
1412    Py_UNICODE *repptr;
1413    Py_ssize_t repsize;
1414    int res = -1;
1415
1416    if (*errorHandler == NULL) {
1417        *errorHandler = PyCodec_LookupError(errors);
1418        if (*errorHandler == NULL)
1419            goto onError;
1420    }
1421
1422    if (*exceptionObject == NULL) {
1423        *exceptionObject = PyUnicodeDecodeError_Create(
1424            encoding, input, insize, *startinpos, *endinpos, reason);
1425        if (*exceptionObject == NULL)
1426            goto onError;
1427    }
1428    else {
1429        if (PyUnicodeDecodeError_SetStart(*exceptionObject, *startinpos))
1430            goto onError;
1431        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, *endinpos))
1432            goto onError;
1433        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1434            goto onError;
1435    }
1436
1437    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1438    if (restuple == NULL)
1439        goto onError;
1440    if (!PyTuple_Check(restuple)) {
1441        PyErr_SetString(PyExc_TypeError, &argparse[4]);
1442        goto onError;
1443    }
1444    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1445        goto onError;
1446    if (newpos<0)
1447        newpos = insize+newpos;
1448    if (newpos<0 || newpos>insize) {
1449        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1450        goto onError;
1451    }
1452
1453    /* need more space? (at least enough for what we
1454       have+the replacement+the rest of the string (starting
1455       at the new input position), so we won't have to check space
1456       when there are no errors in the rest of the string) */
1457    repptr = PyUnicode_AS_UNICODE(repunicode);
1458    repsize = PyUnicode_GET_SIZE(repunicode);
1459    requiredsize = *outpos + repsize + insize-newpos;
1460    if (requiredsize > outsize) {
1461        if (requiredsize<2*outsize)
1462            requiredsize = 2*outsize;
1463        if (_PyUnicode_Resize(output, requiredsize) < 0)
1464            goto onError;
1465        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1466    }
1467    *endinpos = newpos;
1468    *inptr = input + newpos;
1469    Py_UNICODE_COPY(*outptr, repptr, repsize);
1470    *outptr += repsize;
1471    *outpos += repsize;
1472    /* we made it! */
1473    res = 0;
1474
1475  onError:
1476    Py_XDECREF(restuple);
1477    return res;
1478}
1479
1480/* --- UTF-7 Codec -------------------------------------------------------- */
1481
1482/* See RFC2152 for details.  We encode conservatively and decode liberally. */
1483
1484/* Three simple macros defining base-64. */
1485
/* Is c a base-64 character?

   Spelled out as explicit ASCII ranges instead of isalnum(): isalnum()
   depends on the current C locale (it may accept bytes above 127) and is
   undefined for values that do not fit in an unsigned char, whereas the
   base-64 alphabet is fixed.  Note that c is evaluated several times. */

#define IS_BASE64(c)                                                    \
    (((c) >= 'A' && (c) <= 'Z') ||                                      \
     ((c) >= 'a' && (c) <= 'z') ||                                      \
     ((c) >= '0' && (c) <= '9') ||                                      \
     (c) == '+' || (c) == '/')
1490
/* Map the base-64 character c to its 6-bit value.  c must already be
   known to be a base-64 character: anything that is not a letter, a
   digit or '+' is treated as '/' (63).  c is evaluated several times. */

#define FROM_BASE64(c)                                                  \
    (((c) <= '9' && (c) >= '0') ? (c) - '0' + 52 :                      \
     ((c) <= 'Z' && (c) >= 'A') ? (c) - 'A' :                           \
     ((c) <= 'z' && (c) >= 'a') ? (c) - 'a' + 26 :                      \
     (c) == '+' ? 62 : 63)
1498
/* Return the base-64 character for the low 6 bits of n; any higher bits
   of n are masked off. */

#define TO_BASE64(n)                                                    \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"                                       \
     "abcdefghijklmnopqrstuvwxyz"                                       \
     "0123456789+/"[(n) & 0x3f])
1503
/* DECODE_DIRECT: true when a byte found in a UTF-7 string stands for
 * itself.  Decoding is permissive: every ASCII byte (value below 128)
 * qualifies except '+', which opens a base64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1511
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is only ever read, so
 * it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1545
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c must be in 1..127 to qualify at all; Set D characters are always
 * direct, whitespace only if directWS is true, Set O only if directO is
 * true.  The flag arguments are parenthesized so that a compound
 * expression such as (a || b) is used as a single operand.  c is
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
1557
1558PyObject *PyUnicode_DecodeUTF7(const char *s,
1559                               Py_ssize_t size,
1560                               const char *errors)
1561{
1562    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1563}
1564
1565/* The decoder.  The only state we preserve is our read position,
1566 * i.e. how many characters we have consumed.  So if we end in the
1567 * middle of a shift sequence we have to back off the read position
1568 * and the output to the beginning of the sequence, otherwise we lose
1569 * all the shift state (seen bits, number of bits seen, high
1570 * surrogate). */
1571
1572PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
1573                                       Py_ssize_t size,
1574                                       const char *errors,
1575                                       Py_ssize_t *consumed)
1576{
1577    const char *starts = s;
1578    Py_ssize_t startinpos;
1579    Py_ssize_t endinpos;
1580    Py_ssize_t outpos;
1581    const char *e;
1582    PyUnicodeObject *unicode;
1583    Py_UNICODE *p;
1584    const char *errmsg = "";
1585    int inShift = 0;
1586    Py_UNICODE *shiftOutStart;
1587    unsigned int base64bits = 0;
1588    unsigned long base64buffer = 0;
1589    Py_UNICODE surrogate = 0;
1590    PyObject *errorHandler = NULL;
1591    PyObject *exc = NULL;
1592
1593    unicode = _PyUnicode_New(size);
1594    if (!unicode)
1595        return NULL;
1596    if (size == 0) {
1597        if (consumed)
1598            *consumed = 0;
1599        return (PyObject *)unicode;
1600    }
1601
1602    p = unicode->str;
1603    shiftOutStart = p;
1604    e = s + size;
1605
1606    while (s < e) {
1607        Py_UNICODE ch = (unsigned char) *s;
1608
1609        if (inShift) { /* in a base-64 section */
1610            if (IS_BASE64(ch)) { /* consume a base-64 character */
1611                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
1612                base64bits += 6;
1613                s++;
1614                if (base64bits >= 16) {
1615                    /* we have enough bits for a UTF-16 value */
1616                    Py_UNICODE outCh = (Py_UNICODE)
1617                                       (base64buffer >> (base64bits-16));
1618                    base64bits -= 16;
1619                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
1620                    if (surrogate) {
1621                        /* expecting a second surrogate */
1622                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1623#ifdef Py_UNICODE_WIDE
1624                            *p++ = (((surrogate & 0x3FF)<<10)
1625                                    | (outCh & 0x3FF)) + 0x10000;
1626#else
1627                            *p++ = surrogate;
1628                            *p++ = outCh;
1629#endif
1630                            surrogate = 0;
1631                        }
1632                        else {
1633                            surrogate = 0;
1634                            errmsg = "second surrogate missing";
1635                            goto utf7Error;
1636                        }
1637                    }
1638                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
1639                        /* first surrogate */
1640                        surrogate = outCh;
1641                    }
1642                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
1643                        errmsg = "unexpected second surrogate";
1644                        goto utf7Error;
1645                    }
1646                    else {
1647                        *p++ = outCh;
1648                    }
1649                }
1650            }
1651            else { /* now leaving a base-64 section */
1652                inShift = 0;
1653                s++;
1654                if (surrogate) {
1655                    errmsg = "second surrogate missing at end of shift sequence";
1656                    goto utf7Error;
1657                }
1658                if (base64bits > 0) { /* left-over bits */
1659                    if (base64bits >= 6) {
1660                        /* We've seen at least one base-64 character */
1661                        errmsg = "partial character in shift sequence";
1662                        goto utf7Error;
1663                    }
1664                    else {
1665                        /* Some bits remain; they should be zero */
1666                        if (base64buffer != 0) {
1667                            errmsg = "non-zero padding bits in shift sequence";
1668                            goto utf7Error;
1669                        }
1670                    }
1671                }
1672                if (ch != '-') {
1673                    /* '-' is absorbed; other terminating
1674                       characters are preserved */
1675                    *p++ = ch;
1676                }
1677            }
1678        }
1679        else if ( ch == '+' ) {
1680            startinpos = s-starts;
1681            s++; /* consume '+' */
1682            if (s < e && *s == '-') { /* '+-' encodes '+' */
1683                s++;
1684                *p++ = '+';
1685            }
1686            else { /* begin base64-encoded section */
1687                inShift = 1;
1688                shiftOutStart = p;
1689                base64bits = 0;
1690            }
1691        }
1692        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
1693            *p++ = ch;
1694            s++;
1695        }
1696        else {
1697            startinpos = s-starts;
1698            s++;
1699            errmsg = "unexpected special character";
1700            goto utf7Error;
1701        }
1702        continue;
1703utf7Error:
1704        outpos = p-PyUnicode_AS_UNICODE(unicode);
1705        endinpos = s-starts;
1706        if (unicode_decode_call_errorhandler(
1707                errors, &errorHandler,
1708                "utf7", errmsg,
1709                starts, size, &startinpos, &endinpos, &exc, &s,
1710                &unicode, &outpos, &p))
1711            goto onError;
1712    }
1713
1714    /* end of string */
1715
1716    if (inShift && !consumed) { /* in shift sequence, no more to follow */
1717        /* if we're in an inconsistent state, that's an error */
1718        if (surrogate ||
1719                (base64bits >= 6) ||
1720                (base64bits > 0 && base64buffer != 0)) {
1721            outpos = p-PyUnicode_AS_UNICODE(unicode);
1722            endinpos = size;
1723            if (unicode_decode_call_errorhandler(
1724                    errors, &errorHandler,
1725                    "utf7", "unterminated shift sequence",
1726                    starts, size, &startinpos, &endinpos, &exc, &s,
1727                    &unicode, &outpos, &p))
1728                goto onError;
1729        }
1730    }
1731
1732    /* return state */
1733    if (consumed) {
1734        if (inShift) {
1735            p = shiftOutStart; /* back off output */
1736            *consumed = startinpos;
1737        }
1738        else {
1739            *consumed = s-starts;
1740        }
1741    }
1742
1743    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
1744        goto onError;
1745
1746    Py_XDECREF(errorHandler);
1747    Py_XDECREF(exc);
1748    return (PyObject *)unicode;
1749
1750  onError:
1751    Py_XDECREF(errorHandler);
1752    Py_XDECREF(exc);
1753    Py_DECREF(unicode);
1754    return NULL;
1755}
1756
1757
1758PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
1759                               Py_ssize_t size,
1760                               int base64SetO,
1761                               int base64WhiteSpace,
1762                               const char *errors)
1763{
1764    PyObject *v;
1765    /* It might be possible to tighten this worst case */
1766    Py_ssize_t allocated = 8 * size;
1767    int inShift = 0;
1768    Py_ssize_t i = 0;
1769    unsigned int base64bits = 0;
1770    unsigned long base64buffer = 0;
1771    char * out;
1772    char * start;
1773
1774    if (allocated / 8 != size)
1775        return PyErr_NoMemory();
1776
1777    if (size == 0)
1778        return PyString_FromStringAndSize(NULL, 0);
1779
1780    v = PyString_FromStringAndSize(NULL, allocated);
1781    if (v == NULL)
1782        return NULL;
1783
1784    start = out = PyString_AS_STRING(v);
1785    for (;i < size; ++i) {
1786        Py_UNICODE ch = s[i];
1787
1788        if (inShift) {
1789            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1790                /* shifting out */
1791                if (base64bits) { /* output remaining bits */
1792                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
1793                    base64buffer = 0;
1794                    base64bits = 0;
1795                }
1796                inShift = 0;
1797                /* Characters not in the BASE64 set implicitly unshift the sequence
1798                   so no '-' is required, except if the character is itself a '-' */
1799                if (IS_BASE64(ch) || ch == '-') {
1800                    *out++ = '-';
1801                }
1802                *out++ = (char) ch;
1803            }
1804            else {
1805                goto encode_char;
1806            }
1807        }
1808        else { /* not in a shift sequence */
1809            if (ch == '+') {
1810                *out++ = '+';
1811                        *out++ = '-';
1812            }
1813            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
1814                *out++ = (char) ch;
1815            }
1816            else {
1817                *out++ = '+';
1818                inShift = 1;
1819                goto encode_char;
1820            }
1821        }
1822        continue;
1823encode_char:
1824#ifdef Py_UNICODE_WIDE
1825        if (ch >= 0x10000) {
1826            /* code first surrogate */
1827            base64bits += 16;
1828            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
1829            while (base64bits >= 6) {
1830                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1831                base64bits -= 6;
1832            }
1833            /* prepare second surrogate */
1834            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
1835        }
1836#endif
1837        base64bits += 16;
1838        base64buffer = (base64buffer << 16) | ch;
1839        while (base64bits >= 6) {
1840            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
1841            base64bits -= 6;
1842        }
1843    }
1844    if (base64bits)
1845        *out++= TO_BASE64(base64buffer << (6-base64bits) );
1846    if (inShift)
1847        *out++ = '-';
1848
1849    if (_PyString_Resize(&v, out - start))
1850        return NULL;
1851    return v;
1852}
1853
1854#undef IS_BASE64
1855#undef FROM_BASE64
1856#undef TO_BASE64
1857#undef DECODE_DIRECT
1858#undef ENCODE_DIRECT
1859
1860/* --- UTF-8 Codec -------------------------------------------------------- */
1861
/* Map a UTF-8 encoded prefix byte to the length of the sequence it
   starts.  Zero means illegal prefix.  See RFC 3629 for details.
   The table is only ever read, so it is declared const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
1883
1884PyObject *PyUnicode_DecodeUTF8(const char *s,
1885                               Py_ssize_t size,
1886                               const char *errors)
1887{
1888    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
1889}
1890
/* Decode `size` bytes of UTF-8 starting at `s` into a new Unicode object.

   s        - input bytes
   size     - number of input bytes
   errors   - error-handling name, forwarded unchanged to
              unicode_decode_call_errorhandler()
   consumed - may be NULL.  When NULL, a multi-byte sequence that would run
              past the end of the input is reported as an "unexpected end of
              data" error.  When non-NULL, decoding stops in front of such a
              sequence instead and *consumed receives the number of input
              bytes that were processed.

   Returns the new object, or NULL if allocation, the error handler call or
   the final resize fails. */
PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
                                       Py_ssize_t size,
                                       const char *errors,
                                       Py_ssize_t *consumed)
{
    const char *starts = s;     /* start of input, for computing offsets */
    int n;                      /* sequence length from utf8_code_length */
    int k;
    Py_ssize_t startinpos;      /* [startinpos, endinpos) is the input   */
    Py_ssize_t endinpos;        /* range handed to the error handler     */
    Py_ssize_t outpos;
    const char *e;              /* one past the last input byte */
    PyUnicodeObject *unicode;
    Py_UNICODE *p;              /* next free slot in the output buffer */
    const char *errmsg = "";
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;

    /* Note: size will always be longer than the resulting Unicode
       character count */
    unicode = _PyUnicode_New(size);
    if (!unicode)
        return NULL;
    if (size == 0) {
        if (consumed)
            *consumed = 0;
        return (PyObject *)unicode;
    }

    /* Unpack UTF-8 encoded data */
    p = unicode->str;
    e = s + size;

    while (s < e) {
        Py_UCS4 ch = (unsigned char)*s;

        /* Fast path: bytes below 0x80 are copied through unchanged. */
        if (ch < 0x80) {
            *p++ = (Py_UNICODE)ch;
            s++;
            continue;
        }

        /* Expected length of the sequence this byte starts (0 = the byte
           cannot start a sequence). */
        n = utf8_code_length[ch];

        /* The sequence would extend beyond the end of the input. */
        if (s + n > e) {
            if (consumed)
                break;
            else {
                errmsg = "unexpected end of data";
                startinpos = s-starts;
                endinpos = startinpos+1;
                /* Widen the error range over the continuation bytes that
                   are actually present after the start byte. */
                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
                    endinpos++;
                goto utf8Error;
            }
        }

        switch (n) {

        case 0:
            errmsg = "invalid start byte";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 1:
            /* Bytes below 0x80 were already handled by the fast path above,
               and only those have length 1 in the table, so this branch is
               not expected to be taken. */
            errmsg = "internal error";
            startinpos = s-starts;
            endinpos = startinpos+1;
            goto utf8Error;

        case 2:
            if ((s[1] & 0xc0) != 0x80) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                goto utf8Error;
            }
            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
            assert ((ch > 0x007F) && (ch <= 0x07FF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 3:
            /* XXX: surrogates shouldn't be valid UTF-8!
               see http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt
               Uncomment the 2 lines below to make them invalid,
               codepoints: d800-dfff; UTF-8: \xed\xa0\x80-\xed\xbf\xbf. */
            /* The E0 test rejects second bytes below A0, which would decode
               to a value of at most 0x07FF (an overlong form). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xE0 &&
                 (unsigned char)s[1] < 0xA0)/* ||
                ((unsigned char)s[0] == 0xED &&
                 (unsigned char)s[1] > 0x9F)*/) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;

                /* if s[1] first two bits are 1 and 0, then the invalid
                   continuation byte is s[2], so increment endinpos by 1,
                   if not, s[1] is invalid and endinpos doesn't need to
                   be incremented. */
                if ((s[1] & 0xC0) == 0x80)
                    endinpos++;
                goto utf8Error;
            }
            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
            *p++ = (Py_UNICODE)ch;
            break;

        case 4:
            /* Besides malformed continuation bytes, reject F0 followed by a
               byte below 90 (would decode to at most 0xFFFF) and F4 followed
               by a byte above 8F (would decode to 0x110000 or more). */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                ((unsigned char)s[0] == 0xF0 &&
                 (unsigned char)s[1] < 0x90) ||
                ((unsigned char)s[0] == 0xF4 &&
                 (unsigned char)s[1] > 0x8F)) {
                errmsg = "invalid continuation byte";
                startinpos = s-starts;
                endinpos = startinpos + 1;
                /* Extend the error range over each leading byte that has
                   the form of a continuation byte. */
                if ((s[1] & 0xC0) == 0x80) {
                    endinpos++;
                    if ((s[2] & 0xC0) == 0x80)
                        endinpos++;
                }
                goto utf8Error;
            }
            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));

#ifdef Py_UNICODE_WIDE
            *p++ = (Py_UNICODE)ch;
#else
            /*  compute and append the two surrogates: */

            /*  translate from 10000..10FFFF to 0..FFFF */
            ch -= 0x10000;

            /*  high surrogate = top 10 bits added to D800 */
            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));

            /*  low surrogate = bottom 10 bits added to DC00 */
            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
#endif
            break;
        }
        /* Sequence decoded successfully: step over all of its bytes. */
        s += n;
        continue;

      utf8Error:
        /* Hand the offending input range to the error handler.  It is given
           the addresses of s, unicode, outpos and p.
           NOTE(review): presumably it repositions s and may reallocate the
           output -- confirm against unicode_decode_call_errorhandler(). */
        outpos = p-PyUnicode_AS_UNICODE(unicode);
        if (unicode_decode_call_errorhandler(
                errors, &errorHandler,
                "utf8", errmsg,
                starts, size, &startinpos, &endinpos, &exc, &s,
                &unicode, &outpos, &p))
            goto onError;
    }
    if (consumed)
        *consumed = s-starts;

    /* Adjust length */
    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
        goto onError;

    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)unicode;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_DECREF(unicode);
    return NULL;
}
2070
2071/* Allocation strategy:  if the string is short, convert into a stack buffer
2072   and allocate exactly as much space needed at the end.  Else allocate the
2073   maximum possible needed (4 result bytes per Unicode character), and return
2074   the excess memory at the end.
2075*/
2076PyObject *
2077PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2078                     Py_ssize_t size,
2079                     const char *errors)
2080{
2081#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2082
2083    Py_ssize_t i;           /* index into s of next input byte */
2084    PyObject *v;        /* result string object */
2085    char *p;            /* next free byte in output buffer */
2086    Py_ssize_t nallocated;  /* number of result bytes allocated */
2087    Py_ssize_t nneeded;        /* number of result bytes needed */
2088    char stackbuf[MAX_SHORT_UNICHARS * 4];
2089
2090    assert(s != NULL);
2091    assert(size >= 0);
2092
2093    if (size <= MAX_SHORT_UNICHARS) {
2094        /* Write into the stack buffer; nallocated can't overflow.
2095         * At the end, we'll allocate exactly as much heap space as it
2096         * turns out we need.
2097         */
2098        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2099        v = NULL;   /* will allocate after we're done */
2100        p = stackbuf;
2101    }
2102    else {
2103        /* Overallocate on the heap, and give the excess back at the end. */
2104        nallocated = size * 4;
2105        if (nallocated / 4 != size)  /* overflow! */
2106            return PyErr_NoMemory();
2107        v = PyString_FromStringAndSize(NULL, nallocated);
2108        if (v == NULL)
2109            return NULL;
2110        p = PyString_AS_STRING(v);
2111    }
2112
2113    for (i = 0; i < size;) {
2114        Py_UCS4 ch = s[i++];
2115
2116        if (ch < 0x80)
2117            /* Encode ASCII */
2118            *p++ = (char) ch;
2119
2120        else if (ch < 0x0800) {
2121            /* Encode Latin-1 */
2122            *p++ = (char)(0xc0 | (ch >> 6));
2123            *p++ = (char)(0x80 | (ch & 0x3f));
2124        }
2125        else {
2126            /* Encode UCS2 Unicode ordinals */
2127            if (ch < 0x10000) {
2128                /* Special case: check for high surrogate */
2129                if (0xD800 <= ch && ch <= 0xDBFF && i != size) {
2130                    Py_UCS4 ch2 = s[i];
2131                    /* Check for low surrogate and combine the two to
2132                       form a UCS4 value */
2133                    if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2134                        ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2135                        i++;
2136                        goto encodeUCS4;
2137                    }
2138                    /* Fall through: handles isolated high surrogates */
2139                }
2140                *p++ = (char)(0xe0 | (ch >> 12));
2141                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2142                *p++ = (char)(0x80 | (ch & 0x3f));
2143                continue;
2144            }
2145          encodeUCS4:
2146            /* Encode UCS4 Unicode ordinals */
2147            *p++ = (char)(0xf0 | (ch >> 18));
2148            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2149            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2150            *p++ = (char)(0x80 | (ch & 0x3f));
2151        }
2152    }
2153
2154    if (v == NULL) {
2155        /* This was stack allocated. */
2156        nneeded = p - stackbuf;
2157        assert(nneeded <= nallocated);
2158        v = PyString_FromStringAndSize(stackbuf, nneeded);
2159    }
2160    else {
2161        /* Cut back to size actually needed. */
2162        nneeded = p - PyString_AS_STRING(v);
2163        assert(nneeded <= nallocated);
2164        if (_PyString_Resize(&v, nneeded))
2165            return NULL;
2166    }
2167    return v;
2168
2169#undef MAX_SHORT_UNICHARS
2170}
2171
2172PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2173{
2174    if (!PyUnicode_Check(unicode)) {
2175        PyErr_BadArgument();
2176        return NULL;
2177    }
2178    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2179                                PyUnicode_GET_SIZE(unicode),
2180                                NULL);
2181}
2182
2183/* --- UTF-32 Codec ------------------------------------------------------- */
2184
2185PyObject *
2186PyUnicode_DecodeUTF32(const char *s,
2187                      Py_ssize_t size,
2188                      const char *errors,
2189                      int *byteorder)
2190{
2191    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2192}
2193
2194PyObject *
2195PyUnicode_DecodeUTF32Stateful(const char *s,
2196                              Py_ssize_t size,
2197                              const char *errors,
2198                              int *byteorder,
2199                              Py_ssize_t *consumed)
2200{
2201    const char *starts = s;
2202    Py_ssize_t startinpos;
2203    Py_ssize_t endinpos;
2204    Py_ssize_t outpos;
2205    PyUnicodeObject *unicode;
2206    Py_UNICODE *p;
2207#ifndef Py_UNICODE_WIDE
2208    int pairs = 0;
2209    const unsigned char *qq;
2210#else
2211    const int pairs = 0;
2212#endif
2213    const unsigned char *q, *e;
2214    int bo = 0;       /* assume native ordering by default */
2215    const char *errmsg = "";
2216    /* Offsets from q for retrieving bytes in the right order. */
2217#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2218    int iorder[] = {0, 1, 2, 3};
2219#else
2220    int iorder[] = {3, 2, 1, 0};
2221#endif
2222    PyObject *errorHandler = NULL;
2223    PyObject *exc = NULL;
2224
2225    q = (unsigned char *)s;
2226    e = q + size;
2227
2228    if (byteorder)
2229        bo = *byteorder;
2230
2231    /* Check for BOM marks (U+FEFF) in the input and adjust current
2232       byte order setting accordingly. In native mode, the leading BOM
2233       mark is skipped, in all other modes, it is copied to the output
2234       stream as-is (giving a ZWNBSP character). */
2235    if (bo == 0) {
2236        if (size >= 4) {
2237            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2238                (q[iorder[1]] << 8) | q[iorder[0]];
2239#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2240            if (bom == 0x0000FEFF) {
2241                q += 4;
2242                bo = -1;
2243            }
2244            else if (bom == 0xFFFE0000) {
2245                q += 4;
2246                bo = 1;
2247            }
2248#else
2249            if (bom == 0x0000FEFF) {
2250                q += 4;
2251                bo = 1;
2252            }
2253            else if (bom == 0xFFFE0000) {
2254                q += 4;
2255                bo = -1;
2256            }
2257#endif
2258        }
2259    }
2260
2261    if (bo == -1) {
2262        /* force LE */
2263        iorder[0] = 0;
2264        iorder[1] = 1;
2265        iorder[2] = 2;
2266        iorder[3] = 3;
2267    }
2268    else if (bo == 1) {
2269        /* force BE */
2270        iorder[0] = 3;
2271        iorder[1] = 2;
2272        iorder[2] = 1;
2273        iorder[3] = 0;
2274    }
2275
2276    /* On narrow builds we split characters outside the BMP into two
2277       codepoints => count how much extra space we need. */
2278#ifndef Py_UNICODE_WIDE
2279    for (qq = q; qq < e; qq += 4)
2280        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2281            pairs++;
2282#endif
2283
2284    /* This might be one to much, because of a BOM */
2285    unicode = _PyUnicode_New((size+3)/4+pairs);
2286    if (!unicode)
2287        return NULL;
2288    if (size == 0)
2289        return (PyObject *)unicode;
2290
2291    /* Unpack UTF-32 encoded data */
2292    p = unicode->str;
2293
2294    while (q < e) {
2295        Py_UCS4 ch;
2296        /* remaining bytes at the end? (size should be divisible by 4) */
2297        if (e-q<4) {
2298            if (consumed)
2299                break;
2300            errmsg = "truncated data";
2301            startinpos = ((const char *)q)-starts;
2302            endinpos = ((const char *)e)-starts;
2303            goto utf32Error;
2304            /* The remaining input chars are ignored if the callback
2305               chooses to skip the input */
2306        }
2307        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2308            (q[iorder[1]] << 8) | q[iorder[0]];
2309
2310        if (ch >= 0x110000)
2311        {
2312            errmsg = "codepoint not in range(0x110000)";
2313            startinpos = ((const char *)q)-starts;
2314            endinpos = startinpos+4;
2315            goto utf32Error;
2316        }
2317#ifndef Py_UNICODE_WIDE
2318        if (ch >= 0x10000)
2319        {
2320            *p++ = 0xD800 | ((ch-0x10000) >> 10);
2321            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2322        }
2323        else
2324#endif
2325            *p++ = ch;
2326        q += 4;
2327        continue;
2328      utf32Error:
2329        outpos = p-PyUnicode_AS_UNICODE(unicode);
2330        if (unicode_decode_call_errorhandler(
2331                errors, &errorHandler,
2332                "utf32", errmsg,
2333                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2334                &unicode, &outpos, &p))
2335            goto onError;
2336    }
2337
2338    if (byteorder)
2339        *byteorder = bo;
2340
2341    if (consumed)
2342        *consumed = (const char *)q-starts;
2343
2344    /* Adjust length */
2345    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2346        goto onError;
2347
2348    Py_XDECREF(errorHandler);
2349    Py_XDECREF(exc);
2350    return (PyObject *)unicode;
2351
2352  onError:
2353    Py_DECREF(unicode);
2354    Py_XDECREF(errorHandler);
2355    Py_XDECREF(exc);
2356    return NULL;
2357}
2358
2359PyObject *
2360PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2361                      Py_ssize_t size,
2362                      const char *errors,
2363                      int byteorder)
2364{
2365    PyObject *v;
2366    unsigned char *p;
2367    Py_ssize_t nsize, bytesize;
2368#ifndef Py_UNICODE_WIDE
2369    Py_ssize_t i, pairs;
2370#else
2371    const int pairs = 0;
2372#endif
2373    /* Offsets from p for storing byte pairs in the right order. */
2374#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2375    int iorder[] = {0, 1, 2, 3};
2376#else
2377    int iorder[] = {3, 2, 1, 0};
2378#endif
2379
2380#define STORECHAR(CH)                           \
2381    do {                                        \
2382        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2383        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2384        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2385        p[iorder[0]] = (CH) & 0xff;             \
2386        p += 4;                                 \
2387    } while(0)
2388
2389    /* In narrow builds we can output surrogate pairs as one codepoint,
2390       so we need less space. */
2391#ifndef Py_UNICODE_WIDE
2392    for (i = pairs = 0; i < size-1; i++)
2393        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2394            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2395            pairs++;
2396#endif
2397    nsize = (size - pairs + (byteorder == 0));
2398    bytesize = nsize * 4;
2399    if (bytesize / 4 != nsize)
2400        return PyErr_NoMemory();
2401    v = PyString_FromStringAndSize(NULL, bytesize);
2402    if (v == NULL)
2403        return NULL;
2404
2405    p = (unsigned char *)PyString_AS_STRING(v);
2406    if (byteorder == 0)
2407        STORECHAR(0xFEFF);
2408    if (size == 0)
2409        return v;
2410
2411    if (byteorder == -1) {
2412        /* force LE */
2413        iorder[0] = 0;
2414        iorder[1] = 1;
2415        iorder[2] = 2;
2416        iorder[3] = 3;
2417    }
2418    else if (byteorder == 1) {
2419        /* force BE */
2420        iorder[0] = 3;
2421        iorder[1] = 2;
2422        iorder[2] = 1;
2423        iorder[3] = 0;
2424    }
2425
2426    while (size-- > 0) {
2427        Py_UCS4 ch = *s++;
2428#ifndef Py_UNICODE_WIDE
2429        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2430            Py_UCS4 ch2 = *s;
2431            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2432                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2433                s++;
2434                size--;
2435            }
2436        }
2437#endif
2438        STORECHAR(ch);
2439    }
2440    return v;
2441#undef STORECHAR
2442}
2443
2444PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
2445{
2446    if (!PyUnicode_Check(unicode)) {
2447        PyErr_BadArgument();
2448        return NULL;
2449    }
2450    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
2451                                 PyUnicode_GET_SIZE(unicode),
2452                                 NULL,
2453                                 0);
2454}
2455
2456/* --- UTF-16 Codec ------------------------------------------------------- */
2457
2458PyObject *
2459PyUnicode_DecodeUTF16(const char *s,
2460                      Py_ssize_t size,
2461                      const char *errors,
2462                      int *byteorder)
2463{
2464    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
2465}
2466
2467PyObject *
2468PyUnicode_DecodeUTF16Stateful(const char *s,
2469                              Py_ssize_t size,
2470                              const char *errors,
2471                              int *byteorder,
2472                              Py_ssize_t *consumed)
2473{
2474    const char *starts = s;
2475    Py_ssize_t startinpos;
2476    Py_ssize_t endinpos;
2477    Py_ssize_t outpos;
2478    PyUnicodeObject *unicode;
2479    Py_UNICODE *p;
2480    const unsigned char *q, *e;
2481    int bo = 0;       /* assume native ordering by default */
2482    const char *errmsg = "";
2483    /* Offsets from q for retrieving byte pairs in the right order. */
2484#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2485    int ihi = 1, ilo = 0;
2486#else
2487    int ihi = 0, ilo = 1;
2488#endif
2489    PyObject *errorHandler = NULL;
2490    PyObject *exc = NULL;
2491
2492    /* Note: size will always be longer than the resulting Unicode
2493       character count */
2494    unicode = _PyUnicode_New(size);
2495    if (!unicode)
2496        return NULL;
2497    if (size == 0)
2498        return (PyObject *)unicode;
2499
2500    /* Unpack UTF-16 encoded data */
2501    p = unicode->str;
2502    q = (unsigned char *)s;
2503    e = q + size;
2504
2505    if (byteorder)
2506        bo = *byteorder;
2507
2508    /* Check for BOM marks (U+FEFF) in the input and adjust current
2509       byte order setting accordingly. In native mode, the leading BOM
2510       mark is skipped, in all other modes, it is copied to the output
2511       stream as-is (giving a ZWNBSP character). */
2512    if (bo == 0) {
2513        if (size >= 2) {
2514            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
2515#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2516            if (bom == 0xFEFF) {
2517                q += 2;
2518                bo = -1;
2519            }
2520            else if (bom == 0xFFFE) {
2521                q += 2;
2522                bo = 1;
2523            }
2524#else
2525            if (bom == 0xFEFF) {
2526                q += 2;
2527                bo = 1;
2528            }
2529            else if (bom == 0xFFFE) {
2530                q += 2;
2531                bo = -1;
2532            }
2533#endif
2534        }
2535    }
2536
2537    if (bo == -1) {
2538        /* force LE */
2539        ihi = 1;
2540        ilo = 0;
2541    }
2542    else if (bo == 1) {
2543        /* force BE */
2544        ihi = 0;
2545        ilo = 1;
2546    }
2547
2548    while (q < e) {
2549        Py_UNICODE ch;
2550        /* remaining bytes at the end? (size should be even) */
2551        if (e-q<2) {
2552            if (consumed)
2553                break;
2554            errmsg = "truncated data";
2555            startinpos = ((const char *)q)-starts;
2556            endinpos = ((const char *)e)-starts;
2557            goto utf16Error;
2558            /* The remaining input chars are ignored if the callback
2559               chooses to skip the input */
2560        }
2561        ch = (q[ihi] << 8) | q[ilo];
2562
2563        q += 2;
2564
2565        if (ch < 0xD800 || ch > 0xDFFF) {
2566            *p++ = ch;
2567            continue;
2568        }
2569
2570        /* UTF-16 code pair: */
2571        if (q >= e) {
2572            errmsg = "unexpected end of data";
2573            startinpos = (((const char *)q)-2)-starts;
2574            endinpos = ((const char *)e)-starts;
2575            goto utf16Error;
2576        }
2577        if (0xD800 <= ch && ch <= 0xDBFF) {
2578            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
2579            q += 2;
2580            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2581#ifndef Py_UNICODE_WIDE
2582                *p++ = ch;
2583                *p++ = ch2;
2584#else
2585                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2586#endif
2587                continue;
2588            }
2589            else {
2590                errmsg = "illegal UTF-16 surrogate";
2591                startinpos = (((const char *)q)-4)-starts;
2592                endinpos = startinpos+2;
2593                goto utf16Error;
2594            }
2595
2596        }
2597        errmsg = "illegal encoding";
2598        startinpos = (((const char *)q)-2)-starts;
2599        endinpos = startinpos+2;
2600        /* Fall through to report the error */
2601
2602      utf16Error:
2603        outpos = p-PyUnicode_AS_UNICODE(unicode);
2604        if (unicode_decode_call_errorhandler(
2605                errors, &errorHandler,
2606                "utf16", errmsg,
2607                starts, size, &startinpos, &endinpos, &exc, (const char **)&q,
2608                &unicode, &outpos, &p))
2609            goto onError;
2610    }
2611
2612    if (byteorder)
2613        *byteorder = bo;
2614
2615    if (consumed)
2616        *consumed = (const char *)q-starts;
2617
2618    /* Adjust length */
2619    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2620        goto onError;
2621
2622    Py_XDECREF(errorHandler);
2623    Py_XDECREF(exc);
2624    return (PyObject *)unicode;
2625
2626  onError:
2627    Py_DECREF(unicode);
2628    Py_XDECREF(errorHandler);
2629    Py_XDECREF(exc);
2630    return NULL;
2631}
2632
2633PyObject *
2634PyUnicode_EncodeUTF16(const Py_UNICODE *s,
2635                      Py_ssize_t size,
2636                      const char *errors,
2637                      int byteorder)
2638{
2639    PyObject *v;
2640    unsigned char *p;
2641    Py_ssize_t nsize, bytesize;
2642#ifdef Py_UNICODE_WIDE
2643    Py_ssize_t i, pairs;
2644#else
2645    const int pairs = 0;
2646#endif
2647    /* Offsets from p for storing byte pairs in the right order. */
2648#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2649    int ihi = 1, ilo = 0;
2650#else
2651    int ihi = 0, ilo = 1;
2652#endif
2653
2654#define STORECHAR(CH)                           \
2655    do {                                        \
2656        p[ihi] = ((CH) >> 8) & 0xff;            \
2657        p[ilo] = (CH) & 0xff;                   \
2658        p += 2;                                 \
2659    } while(0)
2660
2661#ifdef Py_UNICODE_WIDE
2662    for (i = pairs = 0; i < size; i++)
2663        if (s[i] >= 0x10000)
2664            pairs++;
2665#endif
2666    /* 2 * (size + pairs + (byteorder == 0)) */
2667    if (size > PY_SSIZE_T_MAX ||
2668        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
2669        return PyErr_NoMemory();
2670    nsize = size + pairs + (byteorder == 0);
2671    bytesize = nsize * 2;
2672    if (bytesize / 2 != nsize)
2673        return PyErr_NoMemory();
2674    v = PyString_FromStringAndSize(NULL, bytesize);
2675    if (v == NULL)
2676        return NULL;
2677
2678    p = (unsigned char *)PyString_AS_STRING(v);
2679    if (byteorder == 0)
2680        STORECHAR(0xFEFF);
2681    if (size == 0)
2682        return v;
2683
2684    if (byteorder == -1) {
2685        /* force LE */
2686        ihi = 1;
2687        ilo = 0;
2688    }
2689    else if (byteorder == 1) {
2690        /* force BE */
2691        ihi = 0;
2692        ilo = 1;
2693    }
2694
2695    while (size-- > 0) {
2696        Py_UNICODE ch = *s++;
2697        Py_UNICODE ch2 = 0;
2698#ifdef Py_UNICODE_WIDE
2699        if (ch >= 0x10000) {
2700            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
2701            ch  = 0xD800 | ((ch-0x10000) >> 10);
2702        }
2703#endif
2704        STORECHAR(ch);
2705        if (ch2)
2706            STORECHAR(ch2);
2707    }
2708    return v;
2709#undef STORECHAR
2710}
2711
2712PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
2713{
2714    if (!PyUnicode_Check(unicode)) {
2715        PyErr_BadArgument();
2716        return NULL;
2717    }
2718    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
2719                                 PyUnicode_GET_SIZE(unicode),
2720                                 NULL,
2721                                 0);
2722}
2723
2724/* --- Unicode Escape Codec ----------------------------------------------- */
2725
/* File-local pointer to a _PyUnicode_Name_CAPI, initially NULL.
   NOTE(review): presumably filled in on first use with the character-name
   lookup API declared in ucnhash.h -- confirm where it is assigned. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
2727
2728PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
2729                                        Py_ssize_t size,
2730                                        const char *errors)
2731{
2732    const char *starts = s;
2733    Py_ssize_t startinpos;
2734    Py_ssize_t endinpos;
2735    Py_ssize_t outpos;
2736    int i;
2737    PyUnicodeObject *v;
2738    Py_UNICODE *p;
2739    const char *end;
2740    char* message;
2741    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
2742    PyObject *errorHandler = NULL;
2743    PyObject *exc = NULL;
2744
2745    /* Escaped strings will always be longer than the resulting
2746       Unicode string, so we start with size here and then reduce the
2747       length after conversion to the true value.
2748       (but if the error callback returns a long replacement string
2749       we'll have to allocate more space) */
2750    v = _PyUnicode_New(size);
2751    if (v == NULL)
2752        goto onError;
2753    if (size == 0)
2754        return (PyObject *)v;
2755
2756    p = PyUnicode_AS_UNICODE(v);
2757    end = s + size;
2758
2759    while (s < end) {
2760        unsigned char c;
2761        Py_UNICODE x;
2762        int digits;
2763
2764        /* Non-escape characters are interpreted as Unicode ordinals */
2765        if (*s != '\\') {
2766            *p++ = (unsigned char) *s++;
2767            continue;
2768        }
2769
2770        startinpos = s-starts;
2771        /* \ - Escapes */
2772        s++;
2773        c = *s++;
2774        if (s > end)
2775            c = '\0'; /* Invalid after \ */
2776        switch (c) {
2777
2778            /* \x escapes */
2779        case '\n': break;
2780        case '\\': *p++ = '\\'; break;
2781        case '\'': *p++ = '\''; break;
2782        case '\"': *p++ = '\"'; break;
2783        case 'b': *p++ = '\b'; break;
2784        case 'f': *p++ = '\014'; break; /* FF */
2785        case 't': *p++ = '\t'; break;
2786        case 'n': *p++ = '\n'; break;
2787        case 'r': *p++ = '\r'; break;
2788        case 'v': *p++ = '\013'; break; /* VT */
2789        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
2790
2791            /* \OOO (octal) escapes */
2792        case '0': case '1': case '2': case '3':
2793        case '4': case '5': case '6': case '7':
2794            x = s[-1] - '0';
2795            if (s < end && '0' <= *s && *s <= '7') {
2796                x = (x<<3) + *s++ - '0';
2797                if (s < end && '0' <= *s && *s <= '7')
2798                    x = (x<<3) + *s++ - '0';
2799            }
2800            *p++ = x;
2801            break;
2802
2803            /* hex escapes */
2804            /* \xXX */
2805        case 'x':
2806            digits = 2;
2807            message = "truncated \\xXX escape";
2808            goto hexescape;
2809
2810            /* \uXXXX */
2811        case 'u':
2812            digits = 4;
2813            message = "truncated \\uXXXX escape";
2814            goto hexescape;
2815
2816            /* \UXXXXXXXX */
2817        case 'U':
2818            digits = 8;
2819            message = "truncated \\UXXXXXXXX escape";
2820        hexescape:
2821            chr = 0;
2822            outpos = p-PyUnicode_AS_UNICODE(v);
2823            if (s+digits>end) {
2824                endinpos = size;
2825                if (unicode_decode_call_errorhandler(
2826                        errors, &errorHandler,
2827                        "unicodeescape", "end of string in escape sequence",
2828                        starts, size, &startinpos, &endinpos, &exc, &s,
2829                        &v, &outpos, &p))
2830                    goto onError;
2831                goto nextByte;
2832            }
2833            for (i = 0; i < digits; ++i) {
2834                c = (unsigned char) s[i];
2835                if (!isxdigit(c)) {
2836                    endinpos = (s+i+1)-starts;
2837                    if (unicode_decode_call_errorhandler(
2838                            errors, &errorHandler,
2839                            "unicodeescape", message,
2840                            starts, size, &startinpos, &endinpos, &exc, &s,
2841                            &v, &outpos, &p))
2842                        goto onError;
2843                    goto nextByte;
2844                }
2845                chr = (chr<<4) & ~0xF;
2846                if (c >= '0' && c <= '9')
2847                    chr += c - '0';
2848                else if (c >= 'a' && c <= 'f')
2849                    chr += 10 + c - 'a';
2850                else
2851                    chr += 10 + c - 'A';
2852            }
2853            s += i;
2854            if (chr == 0xffffffff && PyErr_Occurred())
2855                /* _decoding_error will have already written into the
2856                   target buffer. */
2857                break;
2858        store:
2859            /* when we get here, chr is a 32-bit unicode character */
2860            if (chr <= 0xffff)
2861                /* UCS-2 character */
2862                *p++ = (Py_UNICODE) chr;
2863            else if (chr <= 0x10ffff) {
2864                /* UCS-4 character. Either store directly, or as
2865                   surrogate pair. */
2866#ifdef Py_UNICODE_WIDE
2867                *p++ = chr;
2868#else
2869                chr -= 0x10000L;
2870                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
2871                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
2872#endif
2873            } else {
2874                endinpos = s-starts;
2875                outpos = p-PyUnicode_AS_UNICODE(v);
2876                if (unicode_decode_call_errorhandler(
2877                        errors, &errorHandler,
2878                        "unicodeescape", "illegal Unicode character",
2879                        starts, size, &startinpos, &endinpos, &exc, &s,
2880                        &v, &outpos, &p))
2881                    goto onError;
2882            }
2883            break;
2884
2885            /* \N{name} */
2886        case 'N':
2887            message = "malformed \\N character escape";
2888            if (ucnhash_CAPI == NULL) {
2889                /* load the unicode data module */
2890                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
2891                if (ucnhash_CAPI == NULL)
2892                    goto ucnhashError;
2893            }
2894            if (*s == '{') {
2895                const char *start = s+1;
2896                /* look for the closing brace */
2897                while (*s != '}' && s < end)
2898                    s++;
2899                if (s > start && s < end && *s == '}') {
2900                    /* found a name.  look it up in the unicode database */
2901                    message = "unknown Unicode character name";
2902                    s++;
2903                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
2904                        goto store;
2905                }
2906            }
2907            endinpos = s-starts;
2908            outpos = p-PyUnicode_AS_UNICODE(v);
2909            if (unicode_decode_call_errorhandler(
2910                    errors, &errorHandler,
2911                    "unicodeescape", message,
2912                    starts, size, &startinpos, &endinpos, &exc, &s,
2913                    &v, &outpos, &p))
2914                goto onError;
2915            break;
2916
2917        default:
2918            if (s > end) {
2919                message = "\\ at end of string";
2920                s--;
2921                endinpos = s-starts;
2922                outpos = p-PyUnicode_AS_UNICODE(v);
2923                if (unicode_decode_call_errorhandler(
2924                        errors, &errorHandler,
2925                        "unicodeescape", message,
2926                        starts, size, &startinpos, &endinpos, &exc, &s,
2927                        &v, &outpos, &p))
2928                    goto onError;
2929            }
2930            else {
2931                *p++ = '\\';
2932                *p++ = (unsigned char)s[-1];
2933            }
2934            break;
2935        }
2936      nextByte:
2937        ;
2938    }
2939    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
2940        goto onError;
2941    Py_XDECREF(errorHandler);
2942    Py_XDECREF(exc);
2943    return (PyObject *)v;
2944
2945  ucnhashError:
2946    PyErr_SetString(
2947        PyExc_UnicodeError,
2948        "\\N escapes not supported (can't load unicodedata module)"
2949        );
2950    Py_XDECREF(v);
2951    Py_XDECREF(errorHandler);
2952    Py_XDECREF(exc);
2953    return NULL;
2954
2955  onError:
2956    Py_XDECREF(v);
2957    Py_XDECREF(errorHandler);
2958    Py_XDECREF(exc);
2959    return NULL;
2960}
2961
/* unicodeescape_string() (defined below, after the findchar() helper)
   returns a Unicode-Escape string version of a Py_UNICODE buffer.

   If quotes is true, the string is enclosed in u"" or u'' quotes as
   appropriate.

*/
2968
2969Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
2970                                             Py_ssize_t size,
2971                                             Py_UNICODE ch)
2972{
2973    /* like wcschr, but doesn't stop at NULL characters */
2974
2975    while (size-- > 0) {
2976        if (*s == ch)
2977            return s;
2978        s++;
2979    }
2980
2981    return NULL;
2982}
2983
2984static
2985PyObject *unicodeescape_string(const Py_UNICODE *s,
2986                               Py_ssize_t size,
2987                               int quotes)
2988{
2989    PyObject *repr;
2990    char *p;
2991
2992    static const char *hexdigit = "0123456789abcdef";
2993#ifdef Py_UNICODE_WIDE
2994    const Py_ssize_t expandsize = 10;
2995#else
2996    const Py_ssize_t expandsize = 6;
2997#endif
2998
2999    /* XXX(nnorwitz): rather than over-allocating, it would be
3000       better to choose a different scheme.  Perhaps scan the
3001       first N-chars of the string and allocate based on that size.
3002    */
3003    /* Initial allocation is based on the longest-possible unichr
3004       escape.
3005
3006       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3007       unichr, so in this case it's the longest unichr escape. In
3008       narrow (UTF-16) builds this is five chars per source unichr
3009       since there are two unichrs in the surrogate pair, so in narrow
3010       (UTF-16) builds it's not the longest unichr escape.
3011
3012       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3013       so in the narrow (UTF-16) build case it's the longest unichr
3014       escape.
3015    */
3016
3017    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3018        return PyErr_NoMemory();
3019
3020    repr = PyString_FromStringAndSize(NULL,
3021                                      2
3022                                      + expandsize*size
3023                                      + 1);
3024    if (repr == NULL)
3025        return NULL;
3026
3027    p = PyString_AS_STRING(repr);
3028
3029    if (quotes) {
3030        *p++ = 'u';
3031        *p++ = (findchar(s, size, '\'') &&
3032                !findchar(s, size, '"')) ? '"' : '\'';
3033    }
3034    while (size-- > 0) {
3035        Py_UNICODE ch = *s++;
3036
3037        /* Escape quotes and backslashes */
3038        if ((quotes &&
3039             ch == (Py_UNICODE) PyString_AS_STRING(repr)[1]) || ch == '\\') {
3040            *p++ = '\\';
3041            *p++ = (char) ch;
3042            continue;
3043        }
3044
3045#ifdef Py_UNICODE_WIDE
3046        /* Map 21-bit characters to '\U00xxxxxx' */
3047        else if (ch >= 0x10000) {
3048            *p++ = '\\';
3049            *p++ = 'U';
3050            *p++ = hexdigit[(ch >> 28) & 0x0000000F];
3051            *p++ = hexdigit[(ch >> 24) & 0x0000000F];
3052            *p++ = hexdigit[(ch >> 20) & 0x0000000F];
3053            *p++ = hexdigit[(ch >> 16) & 0x0000000F];
3054            *p++ = hexdigit[(ch >> 12) & 0x0000000F];
3055            *p++ = hexdigit[(ch >> 8) & 0x0000000F];
3056            *p++ = hexdigit[(ch >> 4) & 0x0000000F];
3057            *p++ = hexdigit[ch & 0x0000000F];
3058            continue;
3059        }
3060#else
3061        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3062        else if (ch >= 0xD800 && ch < 0xDC00) {
3063            Py_UNICODE ch2;
3064            Py_UCS4 ucs;
3065
3066            ch2 = *s++;
3067            size--;
3068            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3069                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3070                *p++ = '\\';
3071                *p++ = 'U';
3072                *p++ = hexdigit[(ucs >> 28) & 0x0000000F];
3073                *p++ = hexdigit[(ucs >> 24) & 0x0000000F];
3074                *p++ = hexdigit[(ucs >> 20) & 0x0000000F];
3075                *p++ = hexdigit[(ucs >> 16) & 0x0000000F];
3076                *p++ = hexdigit[(ucs >> 12) & 0x0000000F];
3077                *p++ = hexdigit[(ucs >> 8) & 0x0000000F];
3078                *p++ = hexdigit[(ucs >> 4) & 0x0000000F];
3079                *p++ = hexdigit[ucs & 0x0000000F];
3080                continue;
3081            }
3082            /* Fall through: isolated surrogates are copied as-is */
3083            s--;
3084            size++;
3085        }
3086#endif
3087
3088        /* Map 16-bit characters to '\uxxxx' */
3089        if (ch >= 256) {
3090            *p++ = '\\';
3091            *p++ = 'u';
3092            *p++ = hexdigit[(ch >> 12) & 0x000F];
3093            *p++ = hexdigit[(ch >> 8) & 0x000F];
3094            *p++ = hexdigit[(ch >> 4) & 0x000F];
3095            *p++ = hexdigit[ch & 0x000F];
3096        }
3097
3098        /* Map special whitespace to '\t', \n', '\r' */
3099        else if (ch == '\t') {
3100            *p++ = '\\';
3101            *p++ = 't';
3102        }
3103        else if (ch == '\n') {
3104            *p++ = '\\';
3105            *p++ = 'n';
3106        }
3107        else if (ch == '\r') {
3108            *p++ = '\\';
3109            *p++ = 'r';
3110        }
3111
3112        /* Map non-printable US ASCII to '\xhh' */
3113        else if (ch < ' ' || ch >= 0x7F) {
3114            *p++ = '\\';
3115            *p++ = 'x';
3116            *p++ = hexdigit[(ch >> 4) & 0x000F];
3117            *p++ = hexdigit[ch & 0x000F];
3118        }
3119
3120        /* Copy everything else as-is */
3121        else
3122            *p++ = (char) ch;
3123    }
3124    if (quotes)
3125        *p++ = PyString_AS_STRING(repr)[1];
3126
3127    *p = '\0';
3128    if (_PyString_Resize(&repr, p - PyString_AS_STRING(repr)))
3129        return NULL;
3130    return repr;
3131}
3132
3133PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3134                                        Py_ssize_t size)
3135{
3136    return unicodeescape_string(s, size, 0);
3137}
3138
3139PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3140{
3141    if (!PyUnicode_Check(unicode)) {
3142        PyErr_BadArgument();
3143        return NULL;
3144    }
3145    return PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3146                                         PyUnicode_GET_SIZE(unicode));
3147}
3148
3149/* --- Raw Unicode Escape Codec ------------------------------------------- */
3150
3151PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3152                                           Py_ssize_t size,
3153                                           const char *errors)
3154{
3155    const char *starts = s;
3156    Py_ssize_t startinpos;
3157    Py_ssize_t endinpos;
3158    Py_ssize_t outpos;
3159    PyUnicodeObject *v;
3160    Py_UNICODE *p;
3161    const char *end;
3162    const char *bs;
3163    PyObject *errorHandler = NULL;
3164    PyObject *exc = NULL;
3165
3166    /* Escaped strings will always be longer than the resulting
3167       Unicode string, so we start with size here and then reduce the
3168       length after conversion to the true value. (But decoding error
3169       handler might have to resize the string) */
3170    v = _PyUnicode_New(size);
3171    if (v == NULL)
3172        goto onError;
3173    if (size == 0)
3174        return (PyObject *)v;
3175    p = PyUnicode_AS_UNICODE(v);
3176    end = s + size;
3177    while (s < end) {
3178        unsigned char c;
3179        Py_UCS4 x;
3180        int i;
3181        int count;
3182
3183        /* Non-escape characters are interpreted as Unicode ordinals */
3184        if (*s != '\\') {
3185            *p++ = (unsigned char)*s++;
3186            continue;
3187        }
3188        startinpos = s-starts;
3189
3190        /* \u-escapes are only interpreted iff the number of leading
3191           backslashes if odd */
3192        bs = s;
3193        for (;s < end;) {
3194            if (*s != '\\')
3195                break;
3196            *p++ = (unsigned char)*s++;
3197        }
3198        if (((s - bs) & 1) == 0 ||
3199            s >= end ||
3200            (*s != 'u' && *s != 'U')) {
3201            continue;
3202        }
3203        p--;
3204        count = *s=='u' ? 4 : 8;
3205        s++;
3206
3207        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3208        outpos = p-PyUnicode_AS_UNICODE(v);
3209        for (x = 0, i = 0; i < count; ++i, ++s) {
3210            c = (unsigned char)*s;
3211            if (!isxdigit(c)) {
3212                endinpos = s-starts;
3213                if (unicode_decode_call_errorhandler(
3214                        errors, &errorHandler,
3215                        "rawunicodeescape", "truncated \\uXXXX",
3216                        starts, size, &startinpos, &endinpos, &exc, &s,
3217                        &v, &outpos, &p))
3218                    goto onError;
3219                goto nextByte;
3220            }
3221            x = (x<<4) & ~0xF;
3222            if (c >= '0' && c <= '9')
3223                x += c - '0';
3224            else if (c >= 'a' && c <= 'f')
3225                x += 10 + c - 'a';
3226            else
3227                x += 10 + c - 'A';
3228        }
3229        if (x <= 0xffff)
3230            /* UCS-2 character */
3231            *p++ = (Py_UNICODE) x;
3232        else if (x <= 0x10ffff) {
3233            /* UCS-4 character. Either store directly, or as
3234               surrogate pair. */
3235#ifdef Py_UNICODE_WIDE
3236            *p++ = (Py_UNICODE) x;
3237#else
3238            x -= 0x10000L;
3239            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3240            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3241#endif
3242        } else {
3243            endinpos = s-starts;
3244            outpos = p-PyUnicode_AS_UNICODE(v);
3245            if (unicode_decode_call_errorhandler(
3246                    errors, &errorHandler,
3247                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3248                    starts, size, &startinpos, &endinpos, &exc, &s,
3249                    &v, &outpos, &p))
3250                goto onError;
3251        }
3252      nextByte:
3253        ;
3254    }
3255    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3256        goto onError;
3257    Py_XDECREF(errorHandler);
3258    Py_XDECREF(exc);
3259    return (PyObject *)v;
3260
3261  onError:
3262    Py_XDECREF(v);
3263    Py_XDECREF(errorHandler);
3264    Py_XDECREF(exc);
3265    return NULL;
3266}
3267
3268PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3269                                           Py_ssize_t size)
3270{
3271    PyObject *repr;
3272    char *p;
3273    char *q;
3274
3275    static const char *hexdigit = "0123456789abcdef";
3276#ifdef Py_UNICODE_WIDE
3277    const Py_ssize_t expandsize = 10;
3278#else
3279    const Py_ssize_t expandsize = 6;
3280#endif
3281
3282    if (size > PY_SSIZE_T_MAX / expandsize)
3283        return PyErr_NoMemory();
3284
3285    repr = PyString_FromStringAndSize(NULL, expandsize * size);
3286    if (repr == NULL)
3287        return NULL;
3288    if (size == 0)
3289        return repr;
3290
3291    p = q = PyString_AS_STRING(repr);
3292    while (size-- > 0) {
3293        Py_UNICODE ch = *s++;
3294#ifdef Py_UNICODE_WIDE
3295        /* Map 32-bit characters to '\Uxxxxxxxx' */
3296        if (ch >= 0x10000) {
3297            *p++ = '\\';
3298            *p++ = 'U';
3299            *p++ = hexdigit[(ch >> 28) & 0xf];
3300            *p++ = hexdigit[(ch >> 24) & 0xf];
3301            *p++ = hexdigit[(ch >> 20) & 0xf];
3302            *p++ = hexdigit[(ch >> 16) & 0xf];
3303            *p++ = hexdigit[(ch >> 12) & 0xf];
3304            *p++ = hexdigit[(ch >> 8) & 0xf];
3305            *p++ = hexdigit[(ch >> 4) & 0xf];
3306            *p++ = hexdigit[ch & 15];
3307        }
3308        else
3309#else
3310            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3311            if (ch >= 0xD800 && ch < 0xDC00) {
3312                Py_UNICODE ch2;
3313                Py_UCS4 ucs;
3314
3315                ch2 = *s++;
3316                size--;
3317                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3318                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3319                    *p++ = '\\';
3320                    *p++ = 'U';
3321                    *p++ = hexdigit[(ucs >> 28) & 0xf];
3322                    *p++ = hexdigit[(ucs >> 24) & 0xf];
3323                    *p++ = hexdigit[(ucs >> 20) & 0xf];
3324                    *p++ = hexdigit[(ucs >> 16) & 0xf];
3325                    *p++ = hexdigit[(ucs >> 12) & 0xf];
3326                    *p++ = hexdigit[(ucs >> 8) & 0xf];
3327                    *p++ = hexdigit[(ucs >> 4) & 0xf];
3328                    *p++ = hexdigit[ucs & 0xf];
3329                    continue;
3330                }
3331                /* Fall through: isolated surrogates are copied as-is */
3332                s--;
3333                size++;
3334            }
3335#endif
3336        /* Map 16-bit characters to '\uxxxx' */
3337        if (ch >= 256) {
3338            *p++ = '\\';
3339            *p++ = 'u';
3340            *p++ = hexdigit[(ch >> 12) & 0xf];
3341            *p++ = hexdigit[(ch >> 8) & 0xf];
3342            *p++ = hexdigit[(ch >> 4) & 0xf];
3343            *p++ = hexdigit[ch & 15];
3344        }
3345        /* Copy everything else as-is */
3346        else
3347            *p++ = (char) ch;
3348    }
3349    *p = '\0';
3350    if (_PyString_Resize(&repr, p - q))
3351        return NULL;
3352    return repr;
3353}
3354
3355PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
3356{
3357    if (!PyUnicode_Check(unicode)) {
3358        PyErr_BadArgument();
3359        return NULL;
3360    }
3361    return PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3362                                            PyUnicode_GET_SIZE(unicode));
3363}
3364
3365/* --- Unicode Internal Codec ------------------------------------------- */
3366
3367PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
3368                                           Py_ssize_t size,
3369                                           const char *errors)
3370{
3371    const char *starts = s;
3372    Py_ssize_t startinpos;
3373    Py_ssize_t endinpos;
3374    Py_ssize_t outpos;
3375    PyUnicodeObject *v;
3376    Py_UNICODE *p;
3377    const char *end;
3378    const char *reason;
3379    PyObject *errorHandler = NULL;
3380    PyObject *exc = NULL;
3381
3382#ifdef Py_UNICODE_WIDE
3383    Py_UNICODE unimax = PyUnicode_GetMax();
3384#endif
3385
3386    /* XXX overflow detection missing */
3387    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
3388    if (v == NULL)
3389        goto onError;
3390    if (PyUnicode_GetSize((PyObject *)v) == 0)
3391        return (PyObject *)v;
3392    p = PyUnicode_AS_UNICODE(v);
3393    end = s + size;
3394
3395    while (s < end) {
3396        memcpy(p, s, sizeof(Py_UNICODE));
3397        /* We have to sanity check the raw data, otherwise doom looms for
3398           some malformed UCS-4 data. */
3399        if (
3400#ifdef Py_UNICODE_WIDE
3401            *p > unimax || *p < 0 ||
3402#endif
3403            end-s < Py_UNICODE_SIZE
3404            )
3405        {
3406            startinpos = s - starts;
3407            if (end-s < Py_UNICODE_SIZE) {
3408                endinpos = end-starts;
3409                reason = "truncated input";
3410            }
3411            else {
3412                endinpos = s - starts + Py_UNICODE_SIZE;
3413                reason = "illegal code point (> 0x10FFFF)";
3414            }
3415            outpos = p - PyUnicode_AS_UNICODE(v);
3416            if (unicode_decode_call_errorhandler(
3417                    errors, &errorHandler,
3418                    "unicode_internal", reason,
3419                    starts, size, &startinpos, &endinpos, &exc, &s,
3420                    &v, &outpos, &p)) {
3421                goto onError;
3422            }
3423        }
3424        else {
3425            p++;
3426            s += Py_UNICODE_SIZE;
3427        }
3428    }
3429
3430    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3431        goto onError;
3432    Py_XDECREF(errorHandler);
3433    Py_XDECREF(exc);
3434    return (PyObject *)v;
3435
3436  onError:
3437    Py_XDECREF(v);
3438    Py_XDECREF(errorHandler);
3439    Py_XDECREF(exc);
3440    return NULL;
3441}
3442
3443/* --- Latin-1 Codec ------------------------------------------------------ */
3444
3445PyObject *PyUnicode_DecodeLatin1(const char *s,
3446                                 Py_ssize_t size,
3447                                 const char *errors)
3448{
3449    PyUnicodeObject *v;
3450    Py_UNICODE *p;
3451
3452    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
3453    if (size == 1) {
3454        Py_UNICODE r = *(unsigned char*)s;
3455        return PyUnicode_FromUnicode(&r, 1);
3456    }
3457
3458    v = _PyUnicode_New(size);
3459    if (v == NULL)
3460        goto onError;
3461    if (size == 0)
3462        return (PyObject *)v;
3463    p = PyUnicode_AS_UNICODE(v);
3464    while (size-- > 0)
3465        *p++ = (unsigned char)*s++;
3466    return (PyObject *)v;
3467
3468  onError:
3469    Py_XDECREF(v);
3470    return NULL;
3471}
3472
3473/* create or adjust a UnicodeEncodeError */
3474static void make_encode_exception(PyObject **exceptionObject,
3475                                  const char *encoding,
3476                                  const Py_UNICODE *unicode, Py_ssize_t size,
3477                                  Py_ssize_t startpos, Py_ssize_t endpos,
3478                                  const char *reason)
3479{
3480    if (*exceptionObject == NULL) {
3481        *exceptionObject = PyUnicodeEncodeError_Create(
3482            encoding, unicode, size, startpos, endpos, reason);
3483    }
3484    else {
3485        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
3486            goto onError;
3487        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
3488            goto onError;
3489        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
3490            goto onError;
3491        return;
3492      onError:
3493        Py_DECREF(*exceptionObject);
3494        *exceptionObject = NULL;
3495    }
3496}
3497
3498/* raises a UnicodeEncodeError */
3499static void raise_encode_exception(PyObject **exceptionObject,
3500                                   const char *encoding,
3501                                   const Py_UNICODE *unicode, Py_ssize_t size,
3502                                   Py_ssize_t startpos, Py_ssize_t endpos,
3503                                   const char *reason)
3504{
3505    make_encode_exception(exceptionObject,
3506                          encoding, unicode, size, startpos, endpos, reason);
3507    if (*exceptionObject != NULL)
3508        PyCodec_StrictErrors(*exceptionObject);
3509}
3510
3511/* error handling callback helper:
3512   build arguments, call the callback and check the arguments,
3513   put the result into newpos and return the replacement string, which
3514   has to be freed by the caller */
3515static PyObject *unicode_encode_call_errorhandler(const char *errors,
3516                                                  PyObject **errorHandler,
3517                                                  const char *encoding, const char *reason,
3518                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
3519                                                  Py_ssize_t startpos, Py_ssize_t endpos,
3520                                                  Py_ssize_t *newpos)
3521{
3522    static char *argparse = "O!n;encoding error handler must return (unicode, int) tuple";
3523
3524    PyObject *restuple;
3525    PyObject *resunicode;
3526
3527    if (*errorHandler == NULL) {
3528        *errorHandler = PyCodec_LookupError(errors);
3529        if (*errorHandler == NULL)
3530            return NULL;
3531    }
3532
3533    make_encode_exception(exceptionObject,
3534                          encoding, unicode, size, startpos, endpos, reason);
3535    if (*exceptionObject == NULL)
3536        return NULL;
3537
3538    restuple = PyObject_CallFunctionObjArgs(
3539        *errorHandler, *exceptionObject, NULL);
3540    if (restuple == NULL)
3541        return NULL;
3542    if (!PyTuple_Check(restuple)) {
3543        PyErr_SetString(PyExc_TypeError, &argparse[4]);
3544        Py_DECREF(restuple);
3545        return NULL;
3546    }
3547    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
3548                          &resunicode, newpos)) {
3549        Py_DECREF(restuple);
3550        return NULL;
3551    }
3552    if (*newpos<0)
3553        *newpos = size+*newpos;
3554    if (*newpos<0 || *newpos>size) {
3555        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
3556        Py_DECREF(restuple);
3557        return NULL;
3558    }
3559    Py_INCREF(resunicode);
3560    Py_DECREF(restuple);
3561    return resunicode;
3562}
3563
3564static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
3565                                     Py_ssize_t size,
3566                                     const char *errors,
3567                                     int limit)
3568{
3569    /* output object */
3570    PyObject *res;
3571    /* pointers to the beginning and end+1 of input */
3572    const Py_UNICODE *startp = p;
3573    const Py_UNICODE *endp = p + size;
3574    /* pointer to the beginning of the unencodable characters */
3575    /* const Py_UNICODE *badp = NULL; */
3576    /* pointer into the output */
3577    char *str;
3578    /* current output position */
3579    Py_ssize_t respos = 0;
3580    Py_ssize_t ressize;
3581    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
3582    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
3583    PyObject *errorHandler = NULL;
3584    PyObject *exc = NULL;
3585    /* the following variable is used for caching string comparisons
3586     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
3587    int known_errorHandler = -1;
3588
3589    /* allocate enough for a simple encoding without
3590       replacements, if we need more, we'll resize */
3591    res = PyString_FromStringAndSize(NULL, size);
3592    if (res == NULL)
3593        goto onError;
3594    if (size == 0)
3595        return res;
3596    str = PyString_AS_STRING(res);
3597    ressize = size;
3598
3599    while (p<endp) {
3600        Py_UNICODE c = *p;
3601
3602        /* can we encode this? */
3603        if (c<limit) {
3604            /* no overflow check, because we know that the space is enough */
3605            *str++ = (char)c;
3606            ++p;
3607        }
3608        else {
3609            Py_ssize_t unicodepos = p-startp;
3610            Py_ssize_t requiredsize;
3611            PyObject *repunicode;
3612            Py_ssize_t repsize;
3613            Py_ssize_t newpos;
3614            Py_ssize_t respos;
3615            Py_UNICODE *uni2;
3616            /* startpos for collecting unencodable chars */
3617            const Py_UNICODE *collstart = p;
3618            const Py_UNICODE *collend = p;
3619            /* find all unecodable characters */
3620            while ((collend < endp) && ((*collend)>=limit))
3621                ++collend;
3622            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
3623            if (known_errorHandler==-1) {
3624                if ((errors==NULL) || (!strcmp(errors, "strict")))
3625                    known_errorHandler = 1;
3626                else if (!strcmp(errors, "replace"))
3627                    known_errorHandler = 2;
3628                else if (!strcmp(errors, "ignore"))
3629                    known_errorHandler = 3;
3630                else if (!strcmp(errors, "xmlcharrefreplace"))
3631                    known_errorHandler = 4;
3632                else
3633                    known_errorHandler = 0;
3634            }
3635            switch (known_errorHandler) {
3636            case 1: /* strict */
3637                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
3638                goto onError;
3639            case 2: /* replace */
3640                while (collstart++<collend)
3641                    *str++ = '?'; /* fall through */
3642            case 3: /* ignore */
3643                p = collend;
3644                break;
3645            case 4: /* xmlcharrefreplace */
3646                respos = str-PyString_AS_STRING(res);
3647                /* determine replacement size (temporarily (mis)uses p) */
3648                for (p = collstart, repsize = 0; p < collend; ++p) {
3649                    if (*p<10)
3650                        repsize += 2+1+1;
3651                    else if (*p<100)
3652                        repsize += 2+2+1;
3653                    else if (*p<1000)
3654                        repsize += 2+3+1;
3655                    else if (*p<10000)
3656                        repsize += 2+4+1;
3657#ifndef Py_UNICODE_WIDE
3658                    else
3659                        repsize += 2+5+1;
3660#else
3661                    else if (*p<100000)
3662                        repsize += 2+5+1;
3663                    else if (*p<1000000)
3664                        repsize += 2+6+1;
3665                    else
3666                        repsize += 2+7+1;
3667#endif
3668                }
3669                requiredsize = respos+repsize+(endp-collend);
3670                if (requiredsize > ressize) {
3671                    if (requiredsize<2*ressize)
3672                        requiredsize = 2*ressize;
3673                    if (_PyString_Resize(&res, requiredsize))
3674                        goto onError;
3675                    str = PyString_AS_STRING(res) + respos;
3676                    ressize = requiredsize;
3677                }
3678                /* generate replacement (temporarily (mis)uses p) */
3679                for (p = collstart; p < collend; ++p) {
3680                    str += sprintf(str, "&#%d;", (int)*p);
3681                }
3682                p = collend;
3683                break;
3684            default:
3685                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
3686                                                              encoding, reason, startp, size, &exc,
3687                                                              collstart-startp, collend-startp, &newpos);
3688                if (repunicode == NULL)
3689                    goto onError;
3690                /* need more space? (at least enough for what we have+the
3691                   replacement+the rest of the string, so we won't have to
3692                   check space for encodable characters) */
3693                respos = str-PyString_AS_STRING(res);
3694                repsize = PyUnicode_GET_SIZE(repunicode);
3695                requiredsize = respos+repsize+(endp-collend);
3696                if (requiredsize > ressize) {
3697                    if (requiredsize<2*ressize)
3698                        requiredsize = 2*ressize;
3699                    if (_PyString_Resize(&res, requiredsize)) {
3700                        Py_DECREF(repunicode);
3701                        goto onError;
3702                    }
3703                    str = PyString_AS_STRING(res) + respos;
3704                    ressize = requiredsize;
3705                }
3706                /* check if there is anything unencodable in the replacement
3707                   and copy it to the output */
3708                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
3709                    c = *uni2;
3710                    if (c >= limit) {
3711                        raise_encode_exception(&exc, encoding, startp, size,
3712                                               unicodepos, unicodepos+1, reason);
3713                        Py_DECREF(repunicode);
3714                        goto onError;
3715                    }
3716                    *str = (char)c;
3717                }
3718                p = startp + newpos;
3719                Py_DECREF(repunicode);
3720            }
3721        }
3722    }
3723    /* Resize if we allocated to much */
3724    respos = str-PyString_AS_STRING(res);
3725    if (respos<ressize)
3726        /* If this falls res will be NULL */
3727        _PyString_Resize(&res, respos);
3728    Py_XDECREF(errorHandler);
3729    Py_XDECREF(exc);
3730    return res;
3731
3732  onError:
3733    Py_XDECREF(res);
3734    Py_XDECREF(errorHandler);
3735    Py_XDECREF(exc);
3736    return NULL;
3737}
3738
/* Encode the `size` Py_UNICODE characters starting at `p` as Latin-1.

   This simply forwards to unicode_encode_ucs1() with a limit of 256;
   `errors` is passed through unchanged and the helper's result (a new
   reference, or NULL on error) is returned as is. */
PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
                                 Py_ssize_t size,
                                 const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 256);
}
3745
3746PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
3747{
3748    if (!PyUnicode_Check(unicode)) {
3749        PyErr_BadArgument();
3750        return NULL;
3751    }
3752    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
3753                                  PyUnicode_GET_SIZE(unicode),
3754                                  NULL);
3755}
3756
3757/* --- 7-bit ASCII Codec -------------------------------------------------- */
3758
3759PyObject *PyUnicode_DecodeASCII(const char *s,
3760                                Py_ssize_t size,
3761                                const char *errors)
3762{
3763    const char *starts = s;
3764    PyUnicodeObject *v;
3765    Py_UNICODE *p;
3766    Py_ssize_t startinpos;
3767    Py_ssize_t endinpos;
3768    Py_ssize_t outpos;
3769    const char *e;
3770    PyObject *errorHandler = NULL;
3771    PyObject *exc = NULL;
3772
3773    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
3774    if (size == 1 && *(unsigned char*)s < 128) {
3775        Py_UNICODE r = *(unsigned char*)s;
3776        return PyUnicode_FromUnicode(&r, 1);
3777    }
3778
3779    v = _PyUnicode_New(size);
3780    if (v == NULL)
3781        goto onError;
3782    if (size == 0)
3783        return (PyObject *)v;
3784    p = PyUnicode_AS_UNICODE(v);
3785    e = s + size;
3786    while (s < e) {
3787        register unsigned char c = (unsigned char)*s;
3788        if (c < 128) {
3789            *p++ = c;
3790            ++s;
3791        }
3792        else {
3793            startinpos = s-starts;
3794            endinpos = startinpos + 1;
3795            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
3796            if (unicode_decode_call_errorhandler(
3797                    errors, &errorHandler,
3798                    "ascii", "ordinal not in range(128)",
3799                    starts, size, &startinpos, &endinpos, &exc, &s,
3800                    &v, &outpos, &p))
3801                goto onError;
3802        }
3803    }
3804    if (p - PyUnicode_AS_UNICODE(v) < PyString_GET_SIZE(v))
3805        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3806            goto onError;
3807    Py_XDECREF(errorHandler);
3808    Py_XDECREF(exc);
3809    return (PyObject *)v;
3810
3811  onError:
3812    Py_XDECREF(v);
3813    Py_XDECREF(errorHandler);
3814    Py_XDECREF(exc);
3815    return NULL;
3816}
3817
/* Encode the `size` Py_UNICODE characters starting at `p` as 7-bit ASCII.

   This simply forwards to unicode_encode_ucs1() with a limit of 128;
   `errors` is passed through unchanged and the helper's result (a new
   reference, or NULL on error) is returned as is. */
PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
                                Py_ssize_t size,
                                const char *errors)
{
    return unicode_encode_ucs1(p, size, errors, 128);
}
3824
3825PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
3826{
3827    if (!PyUnicode_Check(unicode)) {
3828        PyErr_BadArgument();
3829        return NULL;
3830    }
3831    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
3832                                 PyUnicode_GET_SIZE(unicode),
3833                                 NULL);
3834}
3835
3836#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
3837
3838/* --- MBCS codecs for Windows -------------------------------------------- */
3839
3840#if SIZEOF_INT < SIZEOF_SIZE_T
3841#define NEED_RETRY
3842#endif
3843
3844/* XXX This code is limited to "true" double-byte encodings, as
3845   a) it assumes an incomplete character consists of a single byte, and
3846   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
3847   encodings, see IsDBCSLeadByteEx documentation. */
3848
/* Decide whether the byte at s[offset] should be treated as a DBCS lead
   byte.

   Returns 0 when IsDBCSLeadByte() rejects the byte.  Otherwise the
   position reported by CharPrev() is inspected and 1 is returned when
   that position is the byte itself, when the byte found there is not a
   lead byte, or when it lies exactly two bytes earlier; in every other
   case 0 is returned. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *here = s + offset;
    const char *before;

    if (!IsDBCSLeadByte(*here))
        return 0;

    before = CharPrev(s, here);
    if (before == here)
        return 1;
    if (!IsDBCSLeadByte(*before))
        return 1;
    return here - before == 2;
}
3859
3860/*
3861 * Decode MBCS string into unicode object. If 'final' is set, converts
3862 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
3863 */
3864static int decode_mbcs(PyUnicodeObject **v,
3865                       const char *s, /* MBCS string */
3866                       int size, /* sizeof MBCS string */
3867                       int final)
3868{
3869    Py_UNICODE *p;
3870    Py_ssize_t n = 0;
3871    int usize = 0;
3872
3873    assert(size >= 0);
3874
3875    /* Skip trailing lead-byte unless 'final' is set */
3876    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
3877        --size;
3878
3879    /* First get the size of the result */
3880    if (size > 0) {
3881        usize = MultiByteToWideChar(CP_ACP, 0, s, size, NULL, 0);
3882        if (usize == 0) {
3883            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3884            return -1;
3885        }
3886    }
3887
3888    if (*v == NULL) {
3889        /* Create unicode object */
3890        *v = _PyUnicode_New(usize);
3891        if (*v == NULL)
3892            return -1;
3893    }
3894    else {
3895        /* Extend unicode object */
3896        n = PyUnicode_GET_SIZE(*v);
3897        if (_PyUnicode_Resize(v, n + usize) < 0)
3898            return -1;
3899    }
3900
3901    /* Do the conversion */
3902    if (size > 0) {
3903        p = PyUnicode_AS_UNICODE(*v) + n;
3904        if (0 == MultiByteToWideChar(CP_ACP, 0, s, size, p, usize)) {
3905            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3906            return -1;
3907        }
3908    }
3909
3910    return size;
3911}
3912
3913PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
3914                                       Py_ssize_t size,
3915                                       const char *errors,
3916                                       Py_ssize_t *consumed)
3917{
3918    PyUnicodeObject *v = NULL;
3919    int done;
3920
3921    if (consumed)
3922        *consumed = 0;
3923
3924#ifdef NEED_RETRY
3925  retry:
3926    if (size > INT_MAX)
3927        done = decode_mbcs(&v, s, INT_MAX, 0);
3928    else
3929#endif
3930        done = decode_mbcs(&v, s, (int)size, !consumed);
3931
3932    if (done < 0) {
3933        Py_XDECREF(v);
3934        return NULL;
3935    }
3936
3937    if (consumed)
3938        *consumed += done;
3939
3940#ifdef NEED_RETRY
3941    if (size > INT_MAX) {
3942        s += done;
3943        size -= done;
3944        goto retry;
3945    }
3946#endif
3947
3948    return (PyObject *)v;
3949}
3950
/* Decode `size` bytes of MBCS data at `s` into a new unicode object.

   Equivalent to PyUnicode_DecodeMBCSStateful() with consumed == NULL;
   `errors` is forwarded unchanged and the result of that call is
   returned as is. */
PyObject *PyUnicode_DecodeMBCS(const char *s,
                               Py_ssize_t size,
                               const char *errors)
{
    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
}
3957
3958/*
3959 * Convert unicode into string object (MBCS).
3960 * Returns 0 if succeed, -1 otherwise.
3961 */
3962static int encode_mbcs(PyObject **repr,
3963                       const Py_UNICODE *p, /* unicode */
3964                       int size) /* size of unicode */
3965{
3966    int mbcssize = 0;
3967    Py_ssize_t n = 0;
3968
3969    assert(size >= 0);
3970
3971    /* First get the size of the result */
3972    if (size > 0) {
3973        mbcssize = WideCharToMultiByte(CP_ACP, 0, p, size, NULL, 0, NULL, NULL);
3974        if (mbcssize == 0) {
3975            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3976            return -1;
3977        }
3978    }
3979
3980    if (*repr == NULL) {
3981        /* Create string object */
3982        *repr = PyString_FromStringAndSize(NULL, mbcssize);
3983        if (*repr == NULL)
3984            return -1;
3985    }
3986    else {
3987        /* Extend string object */
3988        n = PyString_Size(*repr);
3989        if (_PyString_Resize(repr, n + mbcssize) < 0)
3990            return -1;
3991    }
3992
3993    /* Do the conversion */
3994    if (size > 0) {
3995        char *s = PyString_AS_STRING(*repr) + n;
3996        if (0 == WideCharToMultiByte(CP_ACP, 0, p, size, s, mbcssize, NULL, NULL)) {
3997            PyErr_SetFromWindowsErrWithFilename(0, NULL);
3998            return -1;
3999        }
4000    }
4001
4002    return 0;
4003}
4004
4005PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4006                               Py_ssize_t size,
4007                               const char *errors)
4008{
4009    PyObject *repr = NULL;
4010    int ret;
4011
4012#ifdef NEED_RETRY
4013  retry:
4014    if (size > INT_MAX)
4015        ret = encode_mbcs(&repr, p, INT_MAX);
4016    else
4017#endif
4018        ret = encode_mbcs(&repr, p, (int)size);
4019
4020    if (ret < 0) {
4021        Py_XDECREF(repr);
4022        return NULL;
4023    }
4024
4025#ifdef NEED_RETRY
4026    if (size > INT_MAX) {
4027        p += INT_MAX;
4028        size -= INT_MAX;
4029        goto retry;
4030    }
4031#endif
4032
4033    return repr;
4034}
4035
4036PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4037{
4038    if (!PyUnicode_Check(unicode)) {
4039        PyErr_BadArgument();
4040        return NULL;
4041    }
4042    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4043                                PyUnicode_GET_SIZE(unicode),
4044                                NULL);
4045}
4046
4047#undef NEED_RETRY
4048
4049#endif /* MS_WINDOWS */
4050
4051/* --- Character Mapping Codec -------------------------------------------- */
4052
/* Decode `size` bytes at `s` into a new unicode object, translating each
   byte through `mapping`.

   - mapping == NULL: the input is decoded as Latin-1.
   - mapping is exactly a unicode string: byte b maps to mapping[b]; a byte
     beyond the end of the string, or one that maps to 0xFFFE, is treated
     as undefined.
   - any other object: mapping[b] is looked up with PyObject_GetItem().
     The result may be an integer in range(65536), a unicode string of any
     length (an empty string drops the byte), or None; None and a
     LookupError both mean "undefined".  Any other result type raises
     TypeError.

   Undefined bytes are reported to unicode_decode_call_errorhandler()
   under the codec name "charmap"; `errors` selects the error handling.

   Returns a new reference, or NULL with an exception set on failure. */
PyObject *PyUnicode_DecodeCharmap(const char *s,
                                  Py_ssize_t size,
                                  PyObject *mapping,
                                  const char *errors)
{
    const char *starts = s;
    Py_ssize_t startinpos;
    Py_ssize_t endinpos;
    Py_ssize_t outpos;
    const char *e;
    PyUnicodeObject *v;
    Py_UNICODE *p;
    /* Spare output slots obtained so far for 1-n mappings */
    Py_ssize_t extrachars = 0;
    PyObject *errorHandler = NULL;
    PyObject *exc = NULL;
    Py_UNICODE *mapstring = NULL;
    Py_ssize_t maplen = 0;

    /* Default to Latin-1 */
    if (mapping == NULL)
        return PyUnicode_DecodeLatin1(s, size, errors);

    /* Start with one output slot per input byte; 1-n mappings grow the
       buffer on demand below. */
    v = _PyUnicode_New(size);
    if (v == NULL)
        goto onError;
    if (size == 0)
        return (PyObject *)v;
    p = PyUnicode_AS_UNICODE(v);
    e = s + size;
    if (PyUnicode_CheckExact(mapping)) {
        /* Fast path: the mapping is a plain unicode string that is
           indexed directly by the byte value. */
        mapstring = PyUnicode_AS_UNICODE(mapping);
        maplen = PyUnicode_GET_SIZE(mapping);
        while (s < e) {
            unsigned char ch = *s;
            Py_UNICODE x = 0xfffe; /* illegal value */

            /* Bytes beyond the end of the string keep the 0xfffe marker */
            if (ch < maplen)
                x = mapstring[ch];

            if (x == 0xfffe) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    goto onError;
                }
                /* s is not advanced here; it was passed by address to the
                   error handler call above. */
                continue;
            }
            *p++ = x;
            ++s;
        }
    }
    else {
        /* Generic path: look every byte up in an arbitrary mapping object */
        while (s < e) {
            unsigned char ch = *s;
            PyObject *w, *x;

            /* Get mapping (char ordinal -> integer, Unicode char or None) */
            w = PyInt_FromLong((long)ch);
            if (w == NULL)
                goto onError;
            x = PyObject_GetItem(mapping, w);
            Py_DECREF(w);
            if (x == NULL) {
                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
                    /* No mapping found means: mapping is undefined. */
                    PyErr_Clear();
                    x = Py_None;
                    Py_INCREF(x);
                } else
                    goto onError;
            }

            /* Apply mapping; from here on x is an owned reference that
               every exit path below has to release. */
            if (PyInt_Check(x)) {
                long value = PyInt_AS_LONG(x);
                if (value < 0 || value > 65535) {
                    PyErr_SetString(PyExc_TypeError,
                                    "character mapping must be in range(65536)");
                    Py_DECREF(x);
                    goto onError;
                }
                *p++ = (Py_UNICODE)value;
            }
            else if (x == Py_None) {
                /* undefined mapping */
                outpos = p-PyUnicode_AS_UNICODE(v);
                startinpos = s-starts;
                endinpos = startinpos+1;
                if (unicode_decode_call_errorhandler(
                        errors, &errorHandler,
                        "charmap", "character maps to <undefined>",
                        starts, size, &startinpos, &endinpos, &exc, &s,
                        &v, &outpos, &p)) {
                    Py_DECREF(x);
                    goto onError;
                }
                Py_DECREF(x);
                /* Skip the ++s at the bottom of the loop; s was passed by
                   address to the error handler call above. */
                continue;
            }
            else if (PyUnicode_Check(x)) {
                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);

                if (targetsize == 1)
                    /* 1-1 mapping */
                    *p++ = *PyUnicode_AS_UNICODE(x);

                else if (targetsize > 1) {
                    /* 1-n mapping */
                    if (targetsize > extrachars) {
                        /* resize first */
                        /* Remember the write offset: the resize may move
                           the buffer, which would leave p dangling. */
                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
                        /* Ask for the shortfall plus four more copies'
                           worth of room so that further 1-n mappings do
                           not each trigger a resize. */
                        Py_ssize_t needed = (targetsize - extrachars) + \
                            (targetsize << 2);
                        extrachars += needed;
                        /* XXX overflow detection missing */
                        if (_PyUnicode_Resize(&v,
                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
                            Py_DECREF(x);
                            goto onError;
                        }
                        p = PyUnicode_AS_UNICODE(v) + oldpos;
                    }
                    Py_UNICODE_COPY(p,
                                    PyUnicode_AS_UNICODE(x),
                                    targetsize);
                    p += targetsize;
                    extrachars -= targetsize;
                }
                /* 1-0 mapping: skip the character */
            }
            else {
                /* wrong return value */
                PyErr_SetString(PyExc_TypeError,
                                "character mapping must return integer, None or unicode");
                Py_DECREF(x);
                goto onError;
            }
            Py_DECREF(x);
            ++s;
        }
    }
    /* Trim the result to the number of characters actually written */
    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
            goto onError;
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    return (PyObject *)v;

  onError:
    Py_XDECREF(errorHandler);
    Py_XDECREF(exc);
    Py_XDECREF(v);
    return NULL;
}
4213
4214/* Charmap encoding: the lookup table */
4215
/* Three-level trie mapping a 16-bit code point to a byte value.  The code
   point is split into a 5-bit, a 4-bit and a 7-bit index (c >> 11,
   (c >> 7) & 0xF and c & 0x7F), one per level. */
struct encoding_map{
    PyObject_HEAD
    /* Indexed by the top 5 bits: number of the level-2 block to use, or
       0xFF if nothing in that range is mapped. */
    unsigned char level1[32];
    /* Number of 16-entry level-2 blocks and of 128-entry level-3 blocks
       stored in level23. */
    int count2, count3;
    /* Variable-size tail: 16*count2 level-2 bytes (0xFF = unused) followed
       by 128*count3 level-3 bytes (0 = unmapped).  Declared with a single
       element; the extra space is added when the object is allocated in
       PyUnicode_BuildEncodingMap(). */
    unsigned char level23[1];
};
4222
4223static PyObject*
4224encoding_map_size(PyObject *obj, PyObject* args)
4225{
4226    struct encoding_map *map = (struct encoding_map*)obj;
4227    return PyInt_FromLong(sizeof(*map) - 1 + 16*map->count2 +
4228                          128*map->count3);
4229}
4230
/* Method table for EncodingMap objects: a single no-argument method,
   size(), followed by the zeroed sentinel entry that ends the table. */
static PyMethodDef encoding_map_methods[] = {
    {"size", encoding_map_size, METH_NOARGS,
     PyDoc_STR("Return the size (in bytes) of this object") },
    { 0 }
};
4236
/* tp_dealloc for EncodingMap objects.  The object is a single memory
   block obtained with PyObject_MALLOC() in PyUnicode_BuildEncodingMap()
   and holds no references to other objects, so freeing that block is all
   that is needed. */
static void
encoding_map_dealloc(PyObject* o)
{
    PyObject_FREE(o);
}
4242
/* Type object for the trie returned by PyUnicode_BuildEncodingMap().
   Only the name, basic size, deallocator, default flags and the method
   table are filled in; every other slot is left as 0. */
static PyTypeObject EncodingMapType = {
    PyVarObject_HEAD_INIT(NULL, 0)
    "EncodingMap",          /*tp_name*/
    sizeof(struct encoding_map),   /*tp_basicsize*/
    0,                      /*tp_itemsize*/
    /* methods */
    encoding_map_dealloc,   /*tp_dealloc*/
    0,                      /*tp_print*/
    0,                      /*tp_getattr*/
    0,                      /*tp_setattr*/
    0,                      /*tp_compare*/
    0,                      /*tp_repr*/
    0,                      /*tp_as_number*/
    0,                      /*tp_as_sequence*/
    0,                      /*tp_as_mapping*/
    0,                      /*tp_hash*/
    0,                      /*tp_call*/
    0,                      /*tp_str*/
    0,                      /*tp_getattro*/
    0,                      /*tp_setattro*/
    0,                      /*tp_as_buffer*/
    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
    0,                      /*tp_doc*/
    0,                      /*tp_traverse*/
    0,                      /*tp_clear*/
    0,                      /*tp_richcompare*/
    0,                      /*tp_weaklistoffset*/
    0,                      /*tp_iter*/
    0,                      /*tp_iternext*/
    encoding_map_methods,   /*tp_methods*/
    0,                      /*tp_members*/
    0,                      /*tp_getset*/
    0,                      /*tp_base*/
    0,                      /*tp_dict*/
    0,                      /*tp_descr_get*/
    0,                      /*tp_descr_set*/
    0,                      /*tp_dictoffset*/
    0,                      /*tp_init*/
    0,                      /*tp_alloc*/
    0,                      /*tp_new*/
    0,                      /*tp_free*/
    0,                      /*tp_is_gc*/
};
4286
4287PyObject*
4288PyUnicode_BuildEncodingMap(PyObject* string)
4289{
4290    Py_UNICODE *decode;
4291    PyObject *result;
4292    struct encoding_map *mresult;
4293    int i;
4294    int need_dict = 0;
4295    unsigned char level1[32];
4296    unsigned char level2[512];
4297    unsigned char *mlevel1, *mlevel2, *mlevel3;
4298    int count2 = 0, count3 = 0;
4299
4300    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
4301        PyErr_BadArgument();
4302        return NULL;
4303    }
4304    decode = PyUnicode_AS_UNICODE(string);
4305    memset(level1, 0xFF, sizeof level1);
4306    memset(level2, 0xFF, sizeof level2);
4307
4308    /* If there isn't a one-to-one mapping of NULL to \0,
4309       or if there are non-BMP characters, we need to use
4310       a mapping dictionary. */
4311    if (decode[0] != 0)
4312        need_dict = 1;
4313    for (i = 1; i < 256; i++) {
4314        int l1, l2;
4315        if (decode[i] == 0
4316#ifdef Py_UNICODE_WIDE
4317            || decode[i] > 0xFFFF
4318#endif
4319            ) {
4320            need_dict = 1;
4321            break;
4322        }
4323        if (decode[i] == 0xFFFE)
4324            /* unmapped character */
4325            continue;
4326        l1 = decode[i] >> 11;
4327        l2 = decode[i] >> 7;
4328        if (level1[l1] == 0xFF)
4329            level1[l1] = count2++;
4330        if (level2[l2] == 0xFF)
4331            level2[l2] = count3++;
4332    }
4333
4334    if (count2 >= 0xFF || count3 >= 0xFF)
4335        need_dict = 1;
4336
4337    if (need_dict) {
4338        PyObject *result = PyDict_New();
4339        PyObject *key, *value;
4340        if (!result)
4341            return NULL;
4342        for (i = 0; i < 256; i++) {
4343            value = NULL;
4344            key = PyInt_FromLong(decode[i]);
4345            value = PyInt_FromLong(i);
4346            if (!key || !value)
4347                goto failed1;
4348            if (PyDict_SetItem(result, key, value) == -1)
4349                goto failed1;
4350            Py_DECREF(key);
4351            Py_DECREF(value);
4352        }
4353        return result;
4354      failed1:
4355        Py_XDECREF(key);
4356        Py_XDECREF(value);
4357        Py_DECREF(result);
4358        return NULL;
4359    }
4360
4361    /* Create a three-level trie */
4362    result = PyObject_MALLOC(sizeof(struct encoding_map) +
4363                             16*count2 + 128*count3 - 1);
4364    if (!result)
4365        return PyErr_NoMemory();
4366    PyObject_Init(result, &EncodingMapType);
4367    mresult = (struct encoding_map*)result;
4368    mresult->count2 = count2;
4369    mresult->count3 = count3;
4370    mlevel1 = mresult->level1;
4371    mlevel2 = mresult->level23;
4372    mlevel3 = mresult->level23 + 16*count2;
4373    memcpy(mlevel1, level1, 32);
4374    memset(mlevel2, 0xFF, 16*count2);
4375    memset(mlevel3, 0, 128*count3);
4376    count3 = 0;
4377    for (i = 1; i < 256; i++) {
4378        int o1, o2, o3, i2, i3;
4379        if (decode[i] == 0xFFFE)
4380            /* unmapped character */
4381            continue;
4382        o1 = decode[i]>>11;
4383        o2 = (decode[i]>>7) & 0xF;
4384        i2 = 16*mlevel1[o1] + o2;
4385        if (mlevel2[i2] == 0xFF)
4386            mlevel2[i2] = count3++;
4387        o3 = decode[i] & 0x7F;
4388        i3 = 128*mlevel2[i2] + o3;
4389        mlevel3[i3] = i;
4390    }
4391    return result;
4392}
4393
4394static int
4395encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
4396{
4397    struct encoding_map *map = (struct encoding_map*)mapping;
4398    int l1 = c>>11;
4399    int l2 = (c>>7) & 0xF;
4400    int l3 = c & 0x7F;
4401    int i;
4402
4403#ifdef Py_UNICODE_WIDE
4404    if (c > 0xFFFF) {
4405        return -1;
4406    }
4407#endif
4408    if (c == 0)
4409        return 0;
4410    /* level 1*/
4411    i = map->level1[l1];
4412    if (i == 0xFF) {
4413        return -1;
4414    }
4415    /* level 2*/
4416    i = map->level23[16*i+l2];
4417    if (i == 0xFF) {
4418        return -1;
4419    }
4420    /* level 3 */
4421    i = map->level23[16*map->count2 + 128*i + l3];
4422    if (i == 0) {
4423        return -1;
4424    }
4425    return i;
4426}
4427
4428/* Lookup the character ch in the mapping. If the character
4429   can't be found, Py_None is returned (or NULL, if another
4430   error occurred). */
4431static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
4432{
4433    PyObject *w = PyInt_FromLong((long)c);
4434    PyObject *x;
4435
4436    if (w == NULL)
4437        return NULL;
4438    x = PyObject_GetItem(mapping, w);
4439    Py_DECREF(w);
4440    if (x == NULL) {
4441        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4442            /* No mapping found means: mapping is undefined. */
4443            PyErr_Clear();
4444            x = Py_None;
4445            Py_INCREF(x);
4446            return x;
4447        } else
4448            return NULL;
4449    }
4450    else if (x == Py_None)
4451        return x;
4452    else if (PyInt_Check(x)) {
4453        long value = PyInt_AS_LONG(x);
4454        if (value < 0 || value > 255) {
4455            PyErr_SetString(PyExc_TypeError,
4456                            "character mapping must be in range(256)");
4457            Py_DECREF(x);
4458            return NULL;
4459        }
4460        return x;
4461    }
4462    else if (PyString_Check(x))
4463        return x;
4464    else {
4465        /* wrong return value */
4466        PyErr_SetString(PyExc_TypeError,
4467                        "character mapping must return integer, None or str");
4468        Py_DECREF(x);
4469        return NULL;
4470    }
4471}
4472
4473static int
4474charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
4475{
4476    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4477    /* exponentially overallocate to minimize reallocations */
4478    if (requiredsize < 2*outsize)
4479        requiredsize = 2*outsize;
4480    if (_PyString_Resize(outobj, requiredsize)) {
4481        return 0;
4482    }
4483    return 1;
4484}
4485
/* Outcome of charmapencode_output():
     enc_SUCCESS   - the replacement byte(s) were written to the output;
     enc_FAILED    - the mapping has no entry for the character, nothing
                     was written;
     enc_EXCEPTION - the lookup or the output resize failed. */
typedef enum charmapencode_result {
    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
}charmapencode_result;
4489/* lookup the character, put the result in the output string and adjust
4490   various state variables. Reallocate the output string if not enough
4491   space is available. Return a new reference to the object that
4492   was put in the output buffer, or Py_None, if the mapping was undefined
4493   (in which case no character was written) or NULL, if a
4494   reallocation error occurred. The caller must decref the result */
4495static
4496charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
4497                                          PyObject **outobj, Py_ssize_t *outpos)
4498{
4499    PyObject *rep;
4500    char *outstart;
4501    Py_ssize_t outsize = PyString_GET_SIZE(*outobj);
4502
4503    if (Py_TYPE(mapping) == &EncodingMapType) {
4504        int res = encoding_map_lookup(c, mapping);
4505        Py_ssize_t requiredsize = *outpos+1;
4506        if (res == -1)
4507            return enc_FAILED;
4508        if (outsize<requiredsize)
4509            if (!charmapencode_resize(outobj, outpos, requiredsize))
4510                return enc_EXCEPTION;
4511        outstart = PyString_AS_STRING(*outobj);
4512        outstart[(*outpos)++] = (char)res;
4513        return enc_SUCCESS;
4514    }
4515
4516    rep = charmapencode_lookup(c, mapping);
4517    if (rep==NULL)
4518        return enc_EXCEPTION;
4519    else if (rep==Py_None) {
4520        Py_DECREF(rep);
4521        return enc_FAILED;
4522    } else {
4523        if (PyInt_Check(rep)) {
4524            Py_ssize_t requiredsize = *outpos+1;
4525            if (outsize<requiredsize)
4526                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4527                    Py_DECREF(rep);
4528                    return enc_EXCEPTION;
4529                }
4530            outstart = PyString_AS_STRING(*outobj);
4531            outstart[(*outpos)++] = (char)PyInt_AS_LONG(rep);
4532        }
4533        else {
4534            const char *repchars = PyString_AS_STRING(rep);
4535            Py_ssize_t repsize = PyString_GET_SIZE(rep);
4536            Py_ssize_t requiredsize = *outpos+repsize;
4537            if (outsize<requiredsize)
4538                if (!charmapencode_resize(outobj, outpos, requiredsize)) {
4539                    Py_DECREF(rep);
4540                    return enc_EXCEPTION;
4541                }
4542            outstart = PyString_AS_STRING(*outobj);
4543            memcpy(outstart + *outpos, repchars, repsize);
4544            *outpos += repsize;
4545        }
4546    }
4547    Py_DECREF(rep);
4548    return enc_SUCCESS;
4549}
4550
4551/* handle an error in PyUnicode_EncodeCharmap
4552   Return 0 on success, -1 on error */
4553static
4554int charmap_encoding_error(
4555    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
4556    PyObject **exceptionObject,
4557    int *known_errorHandler, PyObject **errorHandler, const char *errors,
4558    PyObject **res, Py_ssize_t *respos)
4559{
4560    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
4561    Py_ssize_t repsize;
4562    Py_ssize_t newpos;
4563    Py_UNICODE *uni2;
4564    /* startpos for collecting unencodable chars */
4565    Py_ssize_t collstartpos = *inpos;
4566    Py_ssize_t collendpos = *inpos+1;
4567    Py_ssize_t collpos;
4568    char *encoding = "charmap";
4569    char *reason = "character maps to <undefined>";
4570    charmapencode_result x;
4571
4572    /* find all unencodable characters */
4573    while (collendpos < size) {
4574        PyObject *rep;
4575        if (Py_TYPE(mapping) == &EncodingMapType) {
4576            int res = encoding_map_lookup(p[collendpos], mapping);
4577            if (res != -1)
4578                break;
4579            ++collendpos;
4580            continue;
4581        }
4582
4583        rep = charmapencode_lookup(p[collendpos], mapping);
4584        if (rep==NULL)
4585            return -1;
4586        else if (rep!=Py_None) {
4587            Py_DECREF(rep);
4588            break;
4589        }
4590        Py_DECREF(rep);
4591        ++collendpos;
4592    }
4593    /* cache callback name lookup
4594     * (if not done yet, i.e. it's the first error) */
4595    if (*known_errorHandler==-1) {
4596        if ((errors==NULL) || (!strcmp(errors, "strict")))
4597            *known_errorHandler = 1;
4598        else if (!strcmp(errors, "replace"))
4599            *known_errorHandler = 2;
4600        else if (!strcmp(errors, "ignore"))
4601            *known_errorHandler = 3;
4602        else if (!strcmp(errors, "xmlcharrefreplace"))
4603            *known_errorHandler = 4;
4604        else
4605            *known_errorHandler = 0;
4606    }
4607    switch (*known_errorHandler) {
4608    case 1: /* strict */
4609        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4610        return -1;
4611    case 2: /* replace */
4612        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
4613            x = charmapencode_output('?', mapping, res, respos);
4614            if (x==enc_EXCEPTION) {
4615                return -1;
4616            }
4617            else if (x==enc_FAILED) {
4618                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4619                return -1;
4620            }
4621        }
4622        /* fall through */
4623    case 3: /* ignore */
4624        *inpos = collendpos;
4625        break;
4626    case 4: /* xmlcharrefreplace */
4627        /* generate replacement (temporarily (mis)uses p) */
4628        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
4629            char buffer[2+29+1+1];
4630            char *cp;
4631            sprintf(buffer, "&#%d;", (int)p[collpos]);
4632            for (cp = buffer; *cp; ++cp) {
4633                x = charmapencode_output(*cp, mapping, res, respos);
4634                if (x==enc_EXCEPTION)
4635                    return -1;
4636                else if (x==enc_FAILED) {
4637                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4638                    return -1;
4639                }
4640            }
4641        }
4642        *inpos = collendpos;
4643        break;
4644    default:
4645        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
4646                                                      encoding, reason, p, size, exceptionObject,
4647                                                      collstartpos, collendpos, &newpos);
4648        if (repunicode == NULL)
4649            return -1;
4650        /* generate replacement  */
4651        repsize = PyUnicode_GET_SIZE(repunicode);
4652        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
4653            x = charmapencode_output(*uni2, mapping, res, respos);
4654            if (x==enc_EXCEPTION) {
4655                return -1;
4656            }
4657            else if (x==enc_FAILED) {
4658                Py_DECREF(repunicode);
4659                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
4660                return -1;
4661            }
4662        }
4663        *inpos = newpos;
4664        Py_DECREF(repunicode);
4665    }
4666    return 0;
4667}
4668
4669PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
4670                                  Py_ssize_t size,
4671                                  PyObject *mapping,
4672                                  const char *errors)
4673{
4674    /* output object */
4675    PyObject *res = NULL;
4676    /* current input position */
4677    Py_ssize_t inpos = 0;
4678    /* current output position */
4679    Py_ssize_t respos = 0;
4680    PyObject *errorHandler = NULL;
4681    PyObject *exc = NULL;
4682    /* the following variable is used for caching string comparisons
4683     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4684     * 3=ignore, 4=xmlcharrefreplace */
4685    int known_errorHandler = -1;
4686
4687    /* Default to Latin-1 */
4688    if (mapping == NULL)
4689        return PyUnicode_EncodeLatin1(p, size, errors);
4690
4691    /* allocate enough for a simple encoding without
4692       replacements, if we need more, we'll resize */
4693    res = PyString_FromStringAndSize(NULL, size);
4694    if (res == NULL)
4695        goto onError;
4696    if (size == 0)
4697        return res;
4698
4699    while (inpos<size) {
4700        /* try to encode it */
4701        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
4702        if (x==enc_EXCEPTION) /* error */
4703            goto onError;
4704        if (x==enc_FAILED) { /* unencodable character */
4705            if (charmap_encoding_error(p, size, &inpos, mapping,
4706                                       &exc,
4707                                       &known_errorHandler, &errorHandler, errors,
4708                                       &res, &respos)) {
4709                goto onError;
4710            }
4711        }
4712        else
4713            /* done with this character => adjust input position */
4714            ++inpos;
4715    }
4716
4717    /* Resize if we allocated to much */
4718    if (respos<PyString_GET_SIZE(res)) {
4719        if (_PyString_Resize(&res, respos))
4720            goto onError;
4721    }
4722    Py_XDECREF(exc);
4723    Py_XDECREF(errorHandler);
4724    return res;
4725
4726  onError:
4727    Py_XDECREF(res);
4728    Py_XDECREF(exc);
4729    Py_XDECREF(errorHandler);
4730    return NULL;
4731}
4732
4733PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
4734                                    PyObject *mapping)
4735{
4736    if (!PyUnicode_Check(unicode) || mapping == NULL) {
4737        PyErr_BadArgument();
4738        return NULL;
4739    }
4740    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
4741                                   PyUnicode_GET_SIZE(unicode),
4742                                   mapping,
4743                                   NULL);
4744}
4745
4746/* create or adjust a UnicodeTranslateError */
4747static void make_translate_exception(PyObject **exceptionObject,
4748                                     const Py_UNICODE *unicode, Py_ssize_t size,
4749                                     Py_ssize_t startpos, Py_ssize_t endpos,
4750                                     const char *reason)
4751{
4752    if (*exceptionObject == NULL) {
4753        *exceptionObject = PyUnicodeTranslateError_Create(
4754            unicode, size, startpos, endpos, reason);
4755    }
4756    else {
4757        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
4758            goto onError;
4759        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
4760            goto onError;
4761        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
4762            goto onError;
4763        return;
4764      onError:
4765        Py_DECREF(*exceptionObject);
4766        *exceptionObject = NULL;
4767    }
4768}
4769
4770/* raises a UnicodeTranslateError */
4771static void raise_translate_exception(PyObject **exceptionObject,
4772                                      const Py_UNICODE *unicode, Py_ssize_t size,
4773                                      Py_ssize_t startpos, Py_ssize_t endpos,
4774                                      const char *reason)
4775{
4776    make_translate_exception(exceptionObject,
4777                             unicode, size, startpos, endpos, reason);
4778    if (*exceptionObject != NULL)
4779        PyCodec_StrictErrors(*exceptionObject);
4780}
4781
4782/* error handling callback helper:
4783   build arguments, call the callback and check the arguments,
4784   put the result into newpos and return the replacement string, which
4785   has to be freed by the caller */
4786static PyObject *unicode_translate_call_errorhandler(const char *errors,
4787                                                     PyObject **errorHandler,
4788                                                     const char *reason,
4789                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4790                                                     Py_ssize_t startpos, Py_ssize_t endpos,
4791                                                     Py_ssize_t *newpos)
4792{
4793    static char *argparse = "O!n;translating error handler must return (unicode, int) tuple";
4794
4795    Py_ssize_t i_newpos;
4796    PyObject *restuple;
4797    PyObject *resunicode;
4798
4799    if (*errorHandler == NULL) {
4800        *errorHandler = PyCodec_LookupError(errors);
4801        if (*errorHandler == NULL)
4802            return NULL;
4803    }
4804
4805    make_translate_exception(exceptionObject,
4806                             unicode, size, startpos, endpos, reason);
4807    if (*exceptionObject == NULL)
4808        return NULL;
4809
4810    restuple = PyObject_CallFunctionObjArgs(
4811        *errorHandler, *exceptionObject, NULL);
4812    if (restuple == NULL)
4813        return NULL;
4814    if (!PyTuple_Check(restuple)) {
4815        PyErr_SetString(PyExc_TypeError, &argparse[4]);
4816        Py_DECREF(restuple);
4817        return NULL;
4818    }
4819    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
4820                          &resunicode, &i_newpos)) {
4821        Py_DECREF(restuple);
4822        return NULL;
4823    }
4824    if (i_newpos<0)
4825        *newpos = size+i_newpos;
4826    else
4827        *newpos = i_newpos;
4828    if (*newpos<0 || *newpos>size) {
4829        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4830        Py_DECREF(restuple);
4831        return NULL;
4832    }
4833    Py_INCREF(resunicode);
4834    Py_DECREF(restuple);
4835    return resunicode;
4836}
4837
4838/* Lookup the character ch in the mapping and put the result in result,
4839   which must be decrefed by the caller.
4840   Return 0 on success, -1 on error */
4841static
4842int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
4843{
4844    PyObject *w = PyInt_FromLong((long)c);
4845    PyObject *x;
4846
4847    if (w == NULL)
4848        return -1;
4849    x = PyObject_GetItem(mapping, w);
4850    Py_DECREF(w);
4851    if (x == NULL) {
4852        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4853            /* No mapping found means: use 1:1 mapping. */
4854            PyErr_Clear();
4855            *result = NULL;
4856            return 0;
4857        } else
4858            return -1;
4859    }
4860    else if (x == Py_None) {
4861        *result = x;
4862        return 0;
4863    }
4864    else if (PyInt_Check(x)) {
4865        long value = PyInt_AS_LONG(x);
4866        long max = PyUnicode_GetMax();
4867        if (value < 0 || value > max) {
4868            PyErr_Format(PyExc_TypeError,
4869                         "character mapping must be in range(0x%lx)", max+1);
4870            Py_DECREF(x);
4871            return -1;
4872        }
4873        *result = x;
4874        return 0;
4875    }
4876    else if (PyUnicode_Check(x)) {
4877        *result = x;
4878        return 0;
4879    }
4880    else {
4881        /* wrong return value */
4882        PyErr_SetString(PyExc_TypeError,
4883                        "character mapping must return integer, None or unicode");
4884        Py_DECREF(x);
4885        return -1;
4886    }
4887}
4888/* ensure that *outobj is at least requiredsize characters long,
4889   if not reallocate and adjust various state variables.
4890   Return 0 on success, -1 on error */
4891static
4892int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
4893                               Py_ssize_t requiredsize)
4894{
4895    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
4896    if (requiredsize > oldsize) {
4897        /* remember old output position */
4898        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
4899        /* exponentially overallocate to minimize reallocations */
4900        if (requiredsize < 2 * oldsize)
4901            requiredsize = 2 * oldsize;
4902        if (PyUnicode_Resize(outobj, requiredsize) < 0)
4903            return -1;
4904        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
4905    }
4906    return 0;
4907}
4908/* lookup the character, put the result in the output string and adjust
4909   various state variables. Return a new reference to the object that
4910   was put in the output buffer in *result, or Py_None, if the mapping was
4911   undefined (in which case no character was written).
4912   The called must decref result.
4913   Return 0 on success, -1 on error. */
4914static
4915int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
4916                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
4917                            PyObject **res)
4918{
4919    if (charmaptranslate_lookup(*curinp, mapping, res))
4920        return -1;
4921    if (*res==NULL) {
4922        /* not found => default to 1:1 mapping */
4923        *(*outp)++ = *curinp;
4924    }
4925    else if (*res==Py_None)
4926        ;
4927    else if (PyInt_Check(*res)) {
4928        /* no overflow check, because we know that the space is enough */
4929        *(*outp)++ = (Py_UNICODE)PyInt_AS_LONG(*res);
4930    }
4931    else if (PyUnicode_Check(*res)) {
4932        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
4933        if (repsize==1) {
4934            /* no overflow check, because we know that the space is enough */
4935            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
4936        }
4937        else if (repsize!=0) {
4938            /* more than one character */
4939            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
4940                (insize - (curinp-startinp)) +
4941                repsize - 1;
4942            if (charmaptranslate_makespace(outobj, outp, requiredsize))
4943                return -1;
4944            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
4945            *outp += repsize;
4946        }
4947    }
4948    else
4949        return -1;
4950    return 0;
4951}
4952
4953PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
4954                                     Py_ssize_t size,
4955                                     PyObject *mapping,
4956                                     const char *errors)
4957{
4958    /* output object */
4959    PyObject *res = NULL;
4960    /* pointers to the beginning and end+1 of input */
4961    const Py_UNICODE *startp = p;
4962    const Py_UNICODE *endp = p + size;
4963    /* pointer into the output */
4964    Py_UNICODE *str;
4965    /* current output position */
4966    Py_ssize_t respos = 0;
4967    char *reason = "character maps to <undefined>";
4968    PyObject *errorHandler = NULL;
4969    PyObject *exc = NULL;
4970    /* the following variable is used for caching string comparisons
4971     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
4972     * 3=ignore, 4=xmlcharrefreplace */
4973    int known_errorHandler = -1;
4974
4975    if (mapping == NULL) {
4976        PyErr_BadArgument();
4977        return NULL;
4978    }
4979
4980    /* allocate enough for a simple 1:1 translation without
4981       replacements, if we need more, we'll resize */
4982    res = PyUnicode_FromUnicode(NULL, size);
4983    if (res == NULL)
4984        goto onError;
4985    if (size == 0)
4986        return res;
4987    str = PyUnicode_AS_UNICODE(res);
4988
4989    while (p<endp) {
4990        /* try to encode it */
4991        PyObject *x = NULL;
4992        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
4993            Py_XDECREF(x);
4994            goto onError;
4995        }
4996        Py_XDECREF(x);
4997        if (x!=Py_None) /* it worked => adjust input pointer */
4998            ++p;
4999        else { /* untranslatable character */
5000            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5001            Py_ssize_t repsize;
5002            Py_ssize_t newpos;
5003            Py_UNICODE *uni2;
5004            /* startpos for collecting untranslatable chars */
5005            const Py_UNICODE *collstart = p;
5006            const Py_UNICODE *collend = p+1;
5007            const Py_UNICODE *coll;
5008
5009            /* find all untranslatable characters */
5010            while (collend < endp) {
5011                if (charmaptranslate_lookup(*collend, mapping, &x))
5012                    goto onError;
5013                Py_XDECREF(x);
5014                if (x!=Py_None)
5015                    break;
5016                ++collend;
5017            }
5018            /* cache callback name lookup
5019             * (if not done yet, i.e. it's the first error) */
5020            if (known_errorHandler==-1) {
5021                if ((errors==NULL) || (!strcmp(errors, "strict")))
5022                    known_errorHandler = 1;
5023                else if (!strcmp(errors, "replace"))
5024                    known_errorHandler = 2;
5025                else if (!strcmp(errors, "ignore"))
5026                    known_errorHandler = 3;
5027                else if (!strcmp(errors, "xmlcharrefreplace"))
5028                    known_errorHandler = 4;
5029                else
5030                    known_errorHandler = 0;
5031            }
5032            switch (known_errorHandler) {
5033            case 1: /* strict */
5034                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5035                goto onError;
5036            case 2: /* replace */
5037                /* No need to check for space, this is a 1:1 replacement */
5038                for (coll = collstart; coll<collend; ++coll)
5039                    *str++ = '?';
5040                /* fall through */
5041            case 3: /* ignore */
5042                p = collend;
5043                break;
5044            case 4: /* xmlcharrefreplace */
5045                /* generate replacement (temporarily (mis)uses p) */
5046                for (p = collstart; p < collend; ++p) {
5047                    char buffer[2+29+1+1];
5048                    char *cp;
5049                    sprintf(buffer, "&#%d;", (int)*p);
5050                    if (charmaptranslate_makespace(&res, &str,
5051                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5052                        goto onError;
5053                    for (cp = buffer; *cp; ++cp)
5054                        *str++ = *cp;
5055                }
5056                p = collend;
5057                break;
5058            default:
5059                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5060                                                                 reason, startp, size, &exc,
5061                                                                 collstart-startp, collend-startp, &newpos);
5062                if (repunicode == NULL)
5063                    goto onError;
5064                /* generate replacement  */
5065                repsize = PyUnicode_GET_SIZE(repunicode);
5066                if (charmaptranslate_makespace(&res, &str,
5067                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5068                    Py_DECREF(repunicode);
5069                    goto onError;
5070                }
5071                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5072                    *str++ = *uni2;
5073                p = startp + newpos;
5074                Py_DECREF(repunicode);
5075            }
5076        }
5077    }
5078    /* Resize if we allocated to much */
5079    respos = str-PyUnicode_AS_UNICODE(res);
5080    if (respos<PyUnicode_GET_SIZE(res)) {
5081        if (PyUnicode_Resize(&res, respos) < 0)
5082            goto onError;
5083    }
5084    Py_XDECREF(exc);
5085    Py_XDECREF(errorHandler);
5086    return res;
5087
5088  onError:
5089    Py_XDECREF(res);
5090    Py_XDECREF(exc);
5091    Py_XDECREF(errorHandler);
5092    return NULL;
5093}
5094
5095PyObject *PyUnicode_Translate(PyObject *str,
5096                              PyObject *mapping,
5097                              const char *errors)
5098{
5099    PyObject *result;
5100
5101    str = PyUnicode_FromObject(str);
5102    if (str == NULL)
5103        goto onError;
5104    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5105                                        PyUnicode_GET_SIZE(str),
5106                                        mapping,
5107                                        errors);
5108    Py_DECREF(str);
5109    return result;
5110
5111  onError:
5112    Py_XDECREF(str);
5113    return NULL;
5114}
5115
5116/* --- Decimal Encoder ---------------------------------------------------- */
5117
5118int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5119                            Py_ssize_t length,
5120                            char *output,
5121                            const char *errors)
5122{
5123    Py_UNICODE *p, *end;
5124    PyObject *errorHandler = NULL;
5125    PyObject *exc = NULL;
5126    const char *encoding = "decimal";
5127    const char *reason = "invalid decimal Unicode string";
5128    /* the following variable is used for caching string comparisons
5129     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5130    int known_errorHandler = -1;
5131
5132    if (output == NULL) {
5133        PyErr_BadArgument();
5134        return -1;
5135    }
5136
5137    p = s;
5138    end = s + length;
5139    while (p < end) {
5140        register Py_UNICODE ch = *p;
5141        int decimal;
5142        PyObject *repunicode;
5143        Py_ssize_t repsize;
5144        Py_ssize_t newpos;
5145        Py_UNICODE *uni2;
5146        Py_UNICODE *collstart;
5147        Py_UNICODE *collend;
5148
5149        if (Py_UNICODE_ISSPACE(ch)) {
5150            *output++ = ' ';
5151            ++p;
5152            continue;
5153        }
5154        decimal = Py_UNICODE_TODECIMAL(ch);
5155        if (decimal >= 0) {
5156            *output++ = '0' + decimal;
5157            ++p;
5158            continue;
5159        }
5160        if (0 < ch && ch < 256) {
5161            *output++ = (char)ch;
5162            ++p;
5163            continue;
5164        }
5165        /* All other characters are considered unencodable */
5166        collstart = p;
5167        collend = p+1;
5168        while (collend < end) {
5169            if ((0 < *collend && *collend < 256) ||
5170                !Py_UNICODE_ISSPACE(*collend) ||
5171                Py_UNICODE_TODECIMAL(*collend))
5172                break;
5173        }
5174        /* cache callback name lookup
5175         * (if not done yet, i.e. it's the first error) */
5176        if (known_errorHandler==-1) {
5177            if ((errors==NULL) || (!strcmp(errors, "strict")))
5178                known_errorHandler = 1;
5179            else if (!strcmp(errors, "replace"))
5180                known_errorHandler = 2;
5181            else if (!strcmp(errors, "ignore"))
5182                known_errorHandler = 3;
5183            else if (!strcmp(errors, "xmlcharrefreplace"))
5184                known_errorHandler = 4;
5185            else
5186                known_errorHandler = 0;
5187        }
5188        switch (known_errorHandler) {
5189        case 1: /* strict */
5190            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5191            goto onError;
5192        case 2: /* replace */
5193            for (p = collstart; p < collend; ++p)
5194                *output++ = '?';
5195            /* fall through */
5196        case 3: /* ignore */
5197            p = collend;
5198            break;
5199        case 4: /* xmlcharrefreplace */
5200            /* generate replacement (temporarily (mis)uses p) */
5201            for (p = collstart; p < collend; ++p)
5202                output += sprintf(output, "&#%d;", (int)*p);
5203            p = collend;
5204            break;
5205        default:
5206            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
5207                                                          encoding, reason, s, length, &exc,
5208                                                          collstart-s, collend-s, &newpos);
5209            if (repunicode == NULL)
5210                goto onError;
5211            /* generate replacement  */
5212            repsize = PyUnicode_GET_SIZE(repunicode);
5213            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5214                Py_UNICODE ch = *uni2;
5215                if (Py_UNICODE_ISSPACE(ch))
5216                    *output++ = ' ';
5217                else {
5218                    decimal = Py_UNICODE_TODECIMAL(ch);
5219                    if (decimal >= 0)
5220                        *output++ = '0' + decimal;
5221                    else if (0 < ch && ch < 256)
5222                        *output++ = (char)ch;
5223                    else {
5224                        Py_DECREF(repunicode);
5225                        raise_encode_exception(&exc, encoding,
5226                                               s, length, collstart-s, collend-s, reason);
5227                        goto onError;
5228                    }
5229                }
5230            }
5231            p = s + newpos;
5232            Py_DECREF(repunicode);
5233        }
5234    }
5235    /* 0-terminate the output string */
5236    *output++ = '\0';
5237    Py_XDECREF(exc);
5238    Py_XDECREF(errorHandler);
5239    return 0;
5240
5241  onError:
5242    Py_XDECREF(exc);
5243    Py_XDECREF(errorHandler);
5244    return -1;
5245}
5246
5247/* --- Helpers ------------------------------------------------------------ */
5248
5249#include "stringlib/unicodedefs.h"
5250#include "stringlib/fastsearch.h"
5251
5252#include "stringlib/count.h"
5253#include "stringlib/find.h"
5254#include "stringlib/partition.h"
5255#include "stringlib/split.h"
5256
/* Helper macro to fix up start/end slice values in place.
 *
 *   - end greater than len is clamped to len;
 *   - a negative end or start has len added to it and, if it is still
 *     negative afterwards, becomes 0.
 *
 * start and end must be modifiable lvalues; each argument is evaluated
 * more than once, so do not pass expressions with side effects.
 *
 * The body is wrapped in do { ... } while (0) so the macro behaves as a
 * single statement (e.g. as the unbraced body of an if/else).  The #undef
 * makes sure this definition is the one in effect from here on, even if a
 * header included earlier happens to define a macro of the same name.
 */
#undef ADJUST_INDICES
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
5271
5272Py_ssize_t PyUnicode_Count(PyObject *str,
5273                           PyObject *substr,
5274                           Py_ssize_t start,
5275                           Py_ssize_t end)
5276{
5277    Py_ssize_t result;
5278    PyUnicodeObject* str_obj;
5279    PyUnicodeObject* sub_obj;
5280
5281    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
5282    if (!str_obj)
5283        return -1;
5284    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
5285    if (!sub_obj) {
5286        Py_DECREF(str_obj);
5287        return -1;
5288    }
5289
5290    ADJUST_INDICES(start, end, str_obj->length);
5291    result = stringlib_count(
5292        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
5293        PY_SSIZE_T_MAX
5294        );
5295
5296    Py_DECREF(sub_obj);
5297    Py_DECREF(str_obj);
5298
5299    return result;
5300}
5301
5302Py_ssize_t PyUnicode_Find(PyObject *str,
5303                          PyObject *sub,
5304                          Py_ssize_t start,
5305                          Py_ssize_t end,
5306                          int direction)
5307{
5308    Py_ssize_t result;
5309
5310    str = PyUnicode_FromObject(str);
5311    if (!str)
5312        return -2;
5313    sub = PyUnicode_FromObject(sub);
5314    if (!sub) {
5315        Py_DECREF(str);
5316        return -2;
5317    }
5318
5319    if (direction > 0)
5320        result = stringlib_find_slice(
5321            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5322            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5323            start, end
5324            );
5325    else
5326        result = stringlib_rfind_slice(
5327            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
5328            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
5329            start, end
5330            );
5331
5332    Py_DECREF(str);
5333    Py_DECREF(sub);
5334
5335    return result;
5336}
5337
5338static
5339int tailmatch(PyUnicodeObject *self,
5340              PyUnicodeObject *substring,
5341              Py_ssize_t start,
5342              Py_ssize_t end,
5343              int direction)
5344{
5345    if (substring->length == 0)
5346        return 1;
5347
5348    ADJUST_INDICES(start, end, self->length);
5349    end -= substring->length;
5350    if (end < start)
5351        return 0;
5352
5353    if (direction > 0) {
5354        if (Py_UNICODE_MATCH(self, end, substring))
5355            return 1;
5356    } else {
5357        if (Py_UNICODE_MATCH(self, start, substring))
5358            return 1;
5359    }
5360
5361    return 0;
5362}
5363
5364Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
5365                               PyObject *substr,
5366                               Py_ssize_t start,
5367                               Py_ssize_t end,
5368                               int direction)
5369{
5370    Py_ssize_t result;
5371
5372    str = PyUnicode_FromObject(str);
5373    if (str == NULL)
5374        return -1;
5375    substr = PyUnicode_FromObject(substr);
5376    if (substr == NULL) {
5377        Py_DECREF(str);
5378        return -1;
5379    }
5380
5381    result = tailmatch((PyUnicodeObject *)str,
5382                       (PyUnicodeObject *)substr,
5383                       start, end, direction);
5384    Py_DECREF(str);
5385    Py_DECREF(substr);
5386    return result;
5387}
5388
5389/* Apply fixfct filter to the Unicode object self and return a
5390   reference to the modified object */
5391
5392static
5393PyObject *fixup(PyUnicodeObject *self,
5394                int (*fixfct)(PyUnicodeObject *s))
5395{
5396
5397    PyUnicodeObject *u;
5398
5399    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5400    if (u == NULL)
5401        return NULL;
5402
5403    Py_UNICODE_COPY(u->str, self->str, self->length);
5404
5405    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
5406        /* fixfct should return TRUE if it modified the buffer. If
5407           FALSE, return a reference to the original buffer instead
5408           (to save space, not time) */
5409        Py_INCREF(self);
5410        Py_DECREF(u);
5411        return (PyObject*) self;
5412    }
5413    return (PyObject*) u;
5414}
5415
5416static
5417int fixupper(PyUnicodeObject *self)
5418{
5419    Py_ssize_t len = self->length;
5420    Py_UNICODE *s = self->str;
5421    int status = 0;
5422
5423    while (len-- > 0) {
5424        register Py_UNICODE ch;
5425
5426        ch = Py_UNICODE_TOUPPER(*s);
5427        if (ch != *s) {
5428            status = 1;
5429            *s = ch;
5430        }
5431        s++;
5432    }
5433
5434    return status;
5435}
5436
5437static
5438int fixlower(PyUnicodeObject *self)
5439{
5440    Py_ssize_t len = self->length;
5441    Py_UNICODE *s = self->str;
5442    int status = 0;
5443
5444    while (len-- > 0) {
5445        register Py_UNICODE ch;
5446
5447        ch = Py_UNICODE_TOLOWER(*s);
5448        if (ch != *s) {
5449            status = 1;
5450            *s = ch;
5451        }
5452        s++;
5453    }
5454
5455    return status;
5456}
5457
5458static
5459int fixswapcase(PyUnicodeObject *self)
5460{
5461    Py_ssize_t len = self->length;
5462    Py_UNICODE *s = self->str;
5463    int status = 0;
5464
5465    while (len-- > 0) {
5466        if (Py_UNICODE_ISUPPER(*s)) {
5467            *s = Py_UNICODE_TOLOWER(*s);
5468            status = 1;
5469        } else if (Py_UNICODE_ISLOWER(*s)) {
5470            *s = Py_UNICODE_TOUPPER(*s);
5471            status = 1;
5472        }
5473        s++;
5474    }
5475
5476    return status;
5477}
5478
5479static
5480int fixcapitalize(PyUnicodeObject *self)
5481{
5482    Py_ssize_t len = self->length;
5483    Py_UNICODE *s = self->str;
5484    int status = 0;
5485
5486    if (len == 0)
5487        return 0;
5488    if (Py_UNICODE_ISLOWER(*s)) {
5489        *s = Py_UNICODE_TOUPPER(*s);
5490        status = 1;
5491    }
5492    s++;
5493    while (--len > 0) {
5494        if (Py_UNICODE_ISUPPER(*s)) {
5495            *s = Py_UNICODE_TOLOWER(*s);
5496            status = 1;
5497        }
5498        s++;
5499    }
5500    return status;
5501}
5502
5503static
5504int fixtitle(PyUnicodeObject *self)
5505{
5506    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
5507    register Py_UNICODE *e;
5508    int previous_is_cased;
5509
5510    /* Shortcut for single character strings */
5511    if (PyUnicode_GET_SIZE(self) == 1) {
5512        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
5513        if (*p != ch) {
5514            *p = ch;
5515            return 1;
5516        }
5517        else
5518            return 0;
5519    }
5520
5521    e = p + PyUnicode_GET_SIZE(self);
5522    previous_is_cased = 0;
5523    for (; p < e; p++) {
5524        register const Py_UNICODE ch = *p;
5525
5526        if (previous_is_cased)
5527            *p = Py_UNICODE_TOLOWER(ch);
5528        else
5529            *p = Py_UNICODE_TOTITLE(ch);
5530
5531        if (Py_UNICODE_ISLOWER(ch) ||
5532            Py_UNICODE_ISUPPER(ch) ||
5533            Py_UNICODE_ISTITLE(ch))
5534            previous_is_cased = 1;
5535        else
5536            previous_is_cased = 0;
5537    }
5538    return 1;
5539}
5540
5541PyObject *
5542PyUnicode_Join(PyObject *separator, PyObject *seq)
5543{
5544    PyObject *internal_separator = NULL;
5545    const Py_UNICODE blank = ' ';
5546    const Py_UNICODE *sep = &blank;
5547    Py_ssize_t seplen = 1;
5548    PyUnicodeObject *res = NULL; /* the result */
5549    Py_ssize_t res_alloc = 100;  /* # allocated bytes for string in res */
5550    Py_ssize_t res_used;         /* # used bytes */
5551    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
5552    PyObject *fseq;          /* PySequence_Fast(seq) */
5553    Py_ssize_t seqlen;              /* len(fseq) -- number of items in sequence */
5554    PyObject *item;
5555    Py_ssize_t i;
5556
5557    fseq = PySequence_Fast(seq, "");
5558    if (fseq == NULL) {
5559        return NULL;
5560    }
5561
5562    /* Grrrr.  A codec may be invoked to convert str objects to
5563     * Unicode, and so it's possible to call back into Python code
5564     * during PyUnicode_FromObject(), and so it's possible for a sick
5565     * codec to change the size of fseq (if seq is a list).  Therefore
5566     * we have to keep refetching the size -- can't assume seqlen
5567     * is invariant.
5568     */
5569    seqlen = PySequence_Fast_GET_SIZE(fseq);
5570    /* If empty sequence, return u"". */
5571    if (seqlen == 0) {
5572        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
5573        goto Done;
5574    }
5575    /* If singleton sequence with an exact Unicode, return that. */
5576    if (seqlen == 1) {
5577        item = PySequence_Fast_GET_ITEM(fseq, 0);
5578        if (PyUnicode_CheckExact(item)) {
5579            Py_INCREF(item);
5580            res = (PyUnicodeObject *)item;
5581            goto Done;
5582        }
5583    }
5584
5585    /* At least two items to join, or one that isn't exact Unicode. */
5586    if (seqlen > 1) {
5587        /* Set up sep and seplen -- they're needed. */
5588        if (separator == NULL) {
5589            sep = &blank;
5590            seplen = 1;
5591        }
5592        else {
5593            internal_separator = PyUnicode_FromObject(separator);
5594            if (internal_separator == NULL)
5595                goto onError;
5596            sep = PyUnicode_AS_UNICODE(internal_separator);
5597            seplen = PyUnicode_GET_SIZE(internal_separator);
5598            /* In case PyUnicode_FromObject() mutated seq. */
5599            seqlen = PySequence_Fast_GET_SIZE(fseq);
5600        }
5601    }
5602
5603    /* Get space. */
5604    res = _PyUnicode_New(res_alloc);
5605    if (res == NULL)
5606        goto onError;
5607    res_p = PyUnicode_AS_UNICODE(res);
5608    res_used = 0;
5609
5610    for (i = 0; i < seqlen; ++i) {
5611        Py_ssize_t itemlen;
5612        Py_ssize_t new_res_used;
5613
5614        item = PySequence_Fast_GET_ITEM(fseq, i);
5615        /* Convert item to Unicode. */
5616        if (! PyUnicode_Check(item) && ! PyString_Check(item)) {
5617            PyErr_Format(PyExc_TypeError,
5618                         "sequence item %zd: expected string or Unicode,"
5619                         " %.80s found",
5620                         i, Py_TYPE(item)->tp_name);
5621            goto onError;
5622        }
5623        item = PyUnicode_FromObject(item);
5624        if (item == NULL)
5625            goto onError;
5626        /* We own a reference to item from here on. */
5627
5628        /* In case PyUnicode_FromObject() mutated seq. */
5629        seqlen = PySequence_Fast_GET_SIZE(fseq);
5630
5631        /* Make sure we have enough space for the separator and the item. */
5632        itemlen = PyUnicode_GET_SIZE(item);
5633        new_res_used = res_used + itemlen;
5634        if (new_res_used < 0)
5635            goto Overflow;
5636        if (i < seqlen - 1) {
5637            new_res_used += seplen;
5638            if (new_res_used < 0)
5639                goto Overflow;
5640        }
5641        if (new_res_used > res_alloc) {
5642            /* double allocated size until it's big enough */
5643            do {
5644                res_alloc += res_alloc;
5645                if (res_alloc <= 0)
5646                    goto Overflow;
5647            } while (new_res_used > res_alloc);
5648            if (_PyUnicode_Resize(&res, res_alloc) < 0) {
5649                Py_DECREF(item);
5650                goto onError;
5651            }
5652            res_p = PyUnicode_AS_UNICODE(res) + res_used;
5653        }
5654
5655        /* Copy item, and maybe the separator. */
5656        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
5657        res_p += itemlen;
5658        if (i < seqlen - 1) {
5659            Py_UNICODE_COPY(res_p, sep, seplen);
5660            res_p += seplen;
5661        }
5662        Py_DECREF(item);
5663        res_used = new_res_used;
5664    }
5665
5666    /* Shrink res to match the used area; this probably can't fail,
5667     * but it's cheap to check.
5668     */
5669    if (_PyUnicode_Resize(&res, res_used) < 0)
5670        goto onError;
5671
5672  Done:
5673    Py_XDECREF(internal_separator);
5674    Py_DECREF(fseq);
5675    return (PyObject *)res;
5676
5677  Overflow:
5678    PyErr_SetString(PyExc_OverflowError,
5679                    "join() result is too long for a Python string");
5680    Py_DECREF(item);
5681    /* fall through */
5682
5683  onError:
5684    Py_XDECREF(internal_separator);
5685    Py_DECREF(fseq);
5686    Py_XDECREF(res);
5687    return NULL;
5688}
5689
5690static
5691PyUnicodeObject *pad(PyUnicodeObject *self,
5692                     Py_ssize_t left,
5693                     Py_ssize_t right,
5694                     Py_UNICODE fill)
5695{
5696    PyUnicodeObject *u;
5697
5698    if (left < 0)
5699        left = 0;
5700    if (right < 0)
5701        right = 0;
5702
5703    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
5704        Py_INCREF(self);
5705        return self;
5706    }
5707
5708    if (left > PY_SSIZE_T_MAX - self->length ||
5709        right > PY_SSIZE_T_MAX - (left + self->length)) {
5710        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
5711        return NULL;
5712    }
5713    u = _PyUnicode_New(left + self->length + right);
5714    if (u) {
5715        if (left)
5716            Py_UNICODE_FILL(u->str, fill, left);
5717        Py_UNICODE_COPY(u->str + left, self->str, self->length);
5718        if (right)
5719            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
5720    }
5721
5722    return u;
5723}
5724
5725PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
5726{
5727    PyObject *list;
5728
5729    string = PyUnicode_FromObject(string);
5730    if (string == NULL)
5731        return NULL;
5732
5733    list = stringlib_splitlines(
5734        (PyObject*) string, PyUnicode_AS_UNICODE(string),
5735        PyUnicode_GET_SIZE(string), keepends);
5736
5737    Py_DECREF(string);
5738    return list;
5739}
5740
5741static
5742PyObject *split(PyUnicodeObject *self,
5743                PyUnicodeObject *substring,
5744                Py_ssize_t maxcount)
5745{
5746    if (maxcount < 0)
5747        maxcount = PY_SSIZE_T_MAX;
5748
5749    if (substring == NULL)
5750        return stringlib_split_whitespace(
5751            (PyObject*) self,  self->str, self->length, maxcount
5752            );
5753
5754    return stringlib_split(
5755        (PyObject*) self,  self->str, self->length,
5756        substring->str, substring->length,
5757        maxcount
5758        );
5759}
5760
5761static
5762PyObject *rsplit(PyUnicodeObject *self,
5763                 PyUnicodeObject *substring,
5764                 Py_ssize_t maxcount)
5765{
5766    if (maxcount < 0)
5767        maxcount = PY_SSIZE_T_MAX;
5768
5769    if (substring == NULL)
5770        return stringlib_rsplit_whitespace(
5771            (PyObject*) self,  self->str, self->length, maxcount
5772            );
5773
5774    return stringlib_rsplit(
5775        (PyObject*) self,  self->str, self->length,
5776        substring->str, substring->length,
5777        maxcount
5778        );
5779}
5780
5781static
5782PyObject *replace(PyUnicodeObject *self,
5783                  PyUnicodeObject *str1,
5784                  PyUnicodeObject *str2,
5785                  Py_ssize_t maxcount)
5786{
5787    PyUnicodeObject *u;
5788
5789    if (maxcount < 0)
5790        maxcount = PY_SSIZE_T_MAX;
5791    else if (maxcount == 0 || self->length == 0)
5792        goto nothing;
5793
5794    if (str1->length == str2->length) {
5795        Py_ssize_t i;
5796        /* same length */
5797        if (str1->length == 0)
5798            goto nothing;
5799        if (str1->length == 1) {
5800            /* replace characters */
5801            Py_UNICODE u1, u2;
5802            if (!findchar(self->str, self->length, str1->str[0]))
5803                goto nothing;
5804            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5805            if (!u)
5806                return NULL;
5807            Py_UNICODE_COPY(u->str, self->str, self->length);
5808            u1 = str1->str[0];
5809            u2 = str2->str[0];
5810            for (i = 0; i < u->length; i++)
5811                if (u->str[i] == u1) {
5812                    if (--maxcount < 0)
5813                        break;
5814                    u->str[i] = u2;
5815                }
5816        } else {
5817            i = stringlib_find(
5818                self->str, self->length, str1->str, str1->length, 0
5819                );
5820            if (i < 0)
5821                goto nothing;
5822            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
5823            if (!u)
5824                return NULL;
5825            Py_UNICODE_COPY(u->str, self->str, self->length);
5826
5827            /* change everything in-place, starting with this one */
5828            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5829            i += str1->length;
5830
5831            while ( --maxcount > 0) {
5832                i = stringlib_find(self->str+i, self->length-i,
5833                                   str1->str, str1->length,
5834                                   i);
5835                if (i == -1)
5836                    break;
5837                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
5838                i += str1->length;
5839            }
5840        }
5841    } else {
5842
5843        Py_ssize_t n, i, j;
5844        Py_ssize_t product, new_size, delta;
5845        Py_UNICODE *p;
5846
5847        /* replace strings */
5848        n = stringlib_count(self->str, self->length, str1->str, str1->length,
5849                            maxcount);
5850        if (n == 0)
5851            goto nothing;
5852        /* new_size = self->length + n * (str2->length - str1->length)); */
5853        delta = (str2->length - str1->length);
5854        if (delta == 0) {
5855            new_size = self->length;
5856        } else {
5857            product = n * (str2->length - str1->length);
5858            if ((product / (str2->length - str1->length)) != n) {
5859                PyErr_SetString(PyExc_OverflowError,
5860                                "replace string is too long");
5861                return NULL;
5862            }
5863            new_size = self->length + product;
5864            if (new_size < 0) {
5865                PyErr_SetString(PyExc_OverflowError,
5866                                "replace string is too long");
5867                return NULL;
5868            }
5869        }
5870        u = _PyUnicode_New(new_size);
5871        if (!u)
5872            return NULL;
5873        i = 0;
5874        p = u->str;
5875        if (str1->length > 0) {
5876            while (n-- > 0) {
5877                /* look for next match */
5878                j = stringlib_find(self->str+i, self->length-i,
5879                                   str1->str, str1->length,
5880                                   i);
5881                if (j == -1)
5882                    break;
5883                else if (j > i) {
5884                    /* copy unchanged part [i:j] */
5885                    Py_UNICODE_COPY(p, self->str+i, j-i);
5886                    p += j - i;
5887                }
5888                /* copy substitution string */
5889                if (str2->length > 0) {
5890                    Py_UNICODE_COPY(p, str2->str, str2->length);
5891                    p += str2->length;
5892                }
5893                i = j + str1->length;
5894            }
5895            if (i < self->length)
5896                /* copy tail [i:] */
5897                Py_UNICODE_COPY(p, self->str+i, self->length-i);
5898        } else {
5899            /* interleave */
5900            while (n > 0) {
5901                Py_UNICODE_COPY(p, str2->str, str2->length);
5902                p += str2->length;
5903                if (--n <= 0)
5904                    break;
5905                *p++ = self->str[i++];
5906            }
5907            Py_UNICODE_COPY(p, self->str+i, self->length-i);
5908        }
5909    }
5910    return (PyObject *) u;
5911
5912  nothing:
5913    /* nothing to replace; return original string (when possible) */
5914    if (PyUnicode_CheckExact(self)) {
5915        Py_INCREF(self);
5916        return (PyObject *) self;
5917    }
5918    return PyUnicode_FromUnicode(self->str, self->length);
5919}
5920
5921/* --- Unicode Object Methods --------------------------------------------- */
5922
5923PyDoc_STRVAR(title__doc__,
5924             "S.title() -> unicode\n\
5925\n\
5926Return a titlecased version of S, i.e. words start with title case\n\
5927characters, all remaining cased characters have lower case.");
5928
5929static PyObject*
5930unicode_title(PyUnicodeObject *self)
5931{
5932    return fixup(self, fixtitle);
5933}
5934
5935PyDoc_STRVAR(capitalize__doc__,
5936             "S.capitalize() -> unicode\n\
5937\n\
5938Return a capitalized version of S, i.e. make the first character\n\
5939have upper case and the rest lower case.");
5940
5941static PyObject*
5942unicode_capitalize(PyUnicodeObject *self)
5943{
5944    return fixup(self, fixcapitalize);
5945}
5946
#if 0
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> unicode\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Currently compiled out.  Splits self on whitespace, capitalizes each
   word via fixup()/fixcapitalize and joins the words with the default
   separator of PyUnicode_Join(). */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *word = PyList_GET_ITEM(words, idx);
        PyObject *capped = fixup((PyUnicodeObject *)word, fixcapitalize);

        if (capped == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* Drop the list's old entry before installing the new one. */
        Py_DECREF(word);
        PyList_SET_ITEM(words, idx, capped);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
5984
5985/* Argument converter.  Coerces to a single unicode character */
5986
5987static int
5988convert_uc(PyObject *obj, void *addr)
5989{
5990    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
5991    PyObject *uniobj;
5992    Py_UNICODE *unistr;
5993
5994    uniobj = PyUnicode_FromObject(obj);
5995    if (uniobj == NULL) {
5996        PyErr_SetString(PyExc_TypeError,
5997                        "The fill character cannot be converted to Unicode");
5998        return 0;
5999    }
6000    if (PyUnicode_GET_SIZE(uniobj) != 1) {
6001        PyErr_SetString(PyExc_TypeError,
6002                        "The fill character must be exactly one character long");
6003        Py_DECREF(uniobj);
6004        return 0;
6005    }
6006    unistr = PyUnicode_AS_UNICODE(uniobj);
6007    *fillcharloc = unistr[0];
6008    Py_DECREF(uniobj);
6009    return 1;
6010}
6011
6012PyDoc_STRVAR(center__doc__,
6013             "S.center(width[, fillchar]) -> unicode\n\
6014\n\
6015Return S centered in a Unicode string of length width. Padding is\n\
6016done using the specified fill character (default is a space)");
6017
6018static PyObject *
6019unicode_center(PyUnicodeObject *self, PyObject *args)
6020{
6021    Py_ssize_t marg, left;
6022    Py_ssize_t width;
6023    Py_UNICODE fillchar = ' ';
6024
6025    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6026        return NULL;
6027
6028    if (self->length >= width && PyUnicode_CheckExact(self)) {
6029        Py_INCREF(self);
6030        return (PyObject*) self;
6031    }
6032
6033    marg = width - self->length;
6034    left = marg / 2 + (marg & width & 1);
6035
6036    return (PyObject*) pad(self, left, marg - left, fillchar);
6037}
6038
6039#if 0
6040
6041/* This code should go into some future Unicode collation support
6042   module. The basic comparison should compare ordinals on a naive
6043   basis (this is what Java does and thus Jython too). */
6044
6045/* speedy UTF-16 code point order comparison */
6046/* gleaned from: */
6047/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6048
6049static short utf16Fixup[32] =
6050{
6051    0, 0, 0, 0, 0, 0, 0, 0,
6052    0, 0, 0, 0, 0, 0, 0, 0,
6053    0, 0, 0, 0, 0, 0, 0, 0,
6054    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6055};
6056
6057static int
6058unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6059{
6060    Py_ssize_t len1, len2;
6061
6062    Py_UNICODE *s1 = str1->str;
6063    Py_UNICODE *s2 = str2->str;
6064
6065    len1 = str1->length;
6066    len2 = str2->length;
6067
6068    while (len1 > 0 && len2 > 0) {
6069        Py_UNICODE c1, c2;
6070
6071        c1 = *s1++;
6072        c2 = *s2++;
6073
6074        if (c1 > (1<<11) * 26)
6075            c1 += utf16Fixup[c1>>11];
6076        if (c2 > (1<<11) * 26)
6077            c2 += utf16Fixup[c2>>11];
6078        /* now c1 and c2 are in UTF-32-compatible order */
6079
6080        if (c1 != c2)
6081            return (c1 < c2) ? -1 : 1;
6082
6083        len1--; len2--;
6084    }
6085
6086    return (len1 < len2) ? -1 : (len1 != len2);
6087}
6088
6089#else
6090
6091static int
6092unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6093{
6094    register Py_ssize_t len1, len2;
6095
6096    Py_UNICODE *s1 = str1->str;
6097    Py_UNICODE *s2 = str2->str;
6098
6099    len1 = str1->length;
6100    len2 = str2->length;
6101
6102    while (len1 > 0 && len2 > 0) {
6103        Py_UNICODE c1, c2;
6104
6105        c1 = *s1++;
6106        c2 = *s2++;
6107
6108        if (c1 != c2)
6109            return (c1 < c2) ? -1 : 1;
6110
6111        len1--; len2--;
6112    }
6113
6114    return (len1 < len2) ? -1 : (len1 != len2);
6115}
6116
6117#endif
6118
6119int PyUnicode_Compare(PyObject *left,
6120                      PyObject *right)
6121{
6122    PyUnicodeObject *u = NULL, *v = NULL;
6123    int result;
6124
6125    /* Coerce the two arguments */
6126    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6127    if (u == NULL)
6128        goto onError;
6129    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6130    if (v == NULL)
6131        goto onError;
6132
6133    /* Shortcut for empty or interned objects */
6134    if (v == u) {
6135        Py_DECREF(u);
6136        Py_DECREF(v);
6137        return 0;
6138    }
6139
6140    result = unicode_compare(u, v);
6141
6142    Py_DECREF(u);
6143    Py_DECREF(v);
6144    return result;
6145
6146  onError:
6147    Py_XDECREF(u);
6148    Py_XDECREF(v);
6149    return -1;
6150}
6151
6152PyObject *PyUnicode_RichCompare(PyObject *left,
6153                                PyObject *right,
6154                                int op)
6155{
6156    int result;
6157
6158    result = PyUnicode_Compare(left, right);
6159    if (result == -1 && PyErr_Occurred())
6160        goto onError;
6161
6162    /* Convert the return value to a Boolean */
6163    switch (op) {
6164    case Py_EQ:
6165        result = (result == 0);
6166        break;
6167    case Py_NE:
6168        result = (result != 0);
6169        break;
6170    case Py_LE:
6171        result = (result <= 0);
6172        break;
6173    case Py_GE:
6174        result = (result >= 0);
6175        break;
6176    case Py_LT:
6177        result = (result == -1);
6178        break;
6179    case Py_GT:
6180        result = (result == 1);
6181        break;
6182    }
6183    return PyBool_FromLong(result);
6184
6185  onError:
6186
6187    /* Standard case
6188
6189       Type errors mean that PyUnicode_FromObject() could not convert
6190       one of the arguments (usually the right hand side) to Unicode,
6191       ie. we can't handle the comparison request. However, it is
6192       possible that the other object knows a comparison method, which
6193       is why we return Py_NotImplemented to give the other object a
6194       chance.
6195
6196    */
6197    if (PyErr_ExceptionMatches(PyExc_TypeError)) {
6198        PyErr_Clear();
6199        Py_INCREF(Py_NotImplemented);
6200        return Py_NotImplemented;
6201    }
6202    if (op != Py_EQ && op != Py_NE)
6203        return NULL;
6204
6205    /* Equality comparison.
6206
6207       This is a special case: we silence any PyExc_UnicodeDecodeError
6208       and instead turn it into a PyErr_UnicodeWarning.
6209
6210    */
6211    if (!PyErr_ExceptionMatches(PyExc_UnicodeDecodeError))
6212        return NULL;
6213    PyErr_Clear();
6214    if (PyErr_Warn(PyExc_UnicodeWarning,
6215                   (op == Py_EQ) ?
6216                   "Unicode equal comparison "
6217                   "failed to convert both arguments to Unicode - "
6218                   "interpreting them as being unequal" :
6219                   "Unicode unequal comparison "
6220                   "failed to convert both arguments to Unicode - "
6221                   "interpreting them as being unequal"
6222            ) < 0)
6223        return NULL;
6224    result = (op == Py_NE);
6225    return PyBool_FromLong(result);
6226}
6227
6228int PyUnicode_Contains(PyObject *container,
6229                       PyObject *element)
6230{
6231    PyObject *str, *sub;
6232    int result;
6233
6234    /* Coerce the two arguments */
6235    sub = PyUnicode_FromObject(element);
6236    if (!sub) {
6237        return -1;
6238    }
6239
6240    str = PyUnicode_FromObject(container);
6241    if (!str) {
6242        Py_DECREF(sub);
6243        return -1;
6244    }
6245
6246    result = stringlib_contains_obj(str, sub);
6247
6248    Py_DECREF(str);
6249    Py_DECREF(sub);
6250
6251    return result;
6252}
6253
6254/* Concat to string or Unicode object giving a new Unicode object. */
6255
6256PyObject *PyUnicode_Concat(PyObject *left,
6257                           PyObject *right)
6258{
6259    PyUnicodeObject *u = NULL, *v = NULL, *w;
6260
6261    /* Coerce the two arguments */
6262    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
6263    if (u == NULL)
6264        goto onError;
6265    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
6266    if (v == NULL)
6267        goto onError;
6268
6269    /* Shortcuts */
6270    if (v == unicode_empty) {
6271        Py_DECREF(v);
6272        return (PyObject *)u;
6273    }
6274    if (u == unicode_empty) {
6275        Py_DECREF(u);
6276        return (PyObject *)v;
6277    }
6278
6279    /* Concat the two Unicode strings */
6280    w = _PyUnicode_New(u->length + v->length);
6281    if (w == NULL)
6282        goto onError;
6283    Py_UNICODE_COPY(w->str, u->str, u->length);
6284    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
6285
6286    Py_DECREF(u);
6287    Py_DECREF(v);
6288    return (PyObject *)w;
6289
6290  onError:
6291    Py_XDECREF(u);
6292    Py_XDECREF(v);
6293    return NULL;
6294}
6295
6296PyDoc_STRVAR(count__doc__,
6297             "S.count(sub[, start[, end]]) -> int\n\
6298\n\
6299Return the number of non-overlapping occurrences of substring sub in\n\
6300Unicode string S[start:end].  Optional arguments start and end are\n\
6301interpreted as in slice notation.");
6302
6303static PyObject *
6304unicode_count(PyUnicodeObject *self, PyObject *args)
6305{
6306    PyUnicodeObject *substring;
6307    Py_ssize_t start = 0;
6308    Py_ssize_t end = PY_SSIZE_T_MAX;
6309    PyObject *result;
6310
6311    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
6312                                            &start, &end))
6313        return NULL;
6314
6315    ADJUST_INDICES(start, end, self->length);
6316    result = PyInt_FromSsize_t(
6317        stringlib_count(self->str + start, end - start,
6318                        substring->str, substring->length,
6319                        PY_SSIZE_T_MAX)
6320        );
6321
6322    Py_DECREF(substring);
6323
6324    return result;
6325}
6326
6327PyDoc_STRVAR(encode__doc__,
6328             "S.encode([encoding[,errors]]) -> string or unicode\n\
6329\n\
6330Encodes S using the codec registered for encoding. encoding defaults\n\
6331to the default encoding. errors may be given to set a different error\n\
6332handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6333a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
6334'xmlcharrefreplace' as well as any other name registered with\n\
6335codecs.register_error that can handle UnicodeEncodeErrors.");
6336
6337static PyObject *
6338unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6339{
6340    static char *kwlist[] = {"encoding", "errors", 0};
6341    char *encoding = NULL;
6342    char *errors = NULL;
6343    PyObject *v;
6344
6345    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
6346                                     kwlist, &encoding, &errors))
6347        return NULL;
6348    v = PyUnicode_AsEncodedObject((PyObject *)self, encoding, errors);
6349    if (v == NULL)
6350        goto onError;
6351    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6352        PyErr_Format(PyExc_TypeError,
6353                     "encoder did not return a string/unicode object "
6354                     "(type=%.400s)",
6355                     Py_TYPE(v)->tp_name);
6356        Py_DECREF(v);
6357        return NULL;
6358    }
6359    return v;
6360
6361  onError:
6362    return NULL;
6363}
6364
6365PyDoc_STRVAR(decode__doc__,
6366             "S.decode([encoding[,errors]]) -> string or unicode\n\
6367\n\
6368Decodes S using the codec registered for encoding. encoding defaults\n\
6369to the default encoding. errors may be given to set a different error\n\
6370handling scheme. Default is 'strict' meaning that encoding errors raise\n\
6371a UnicodeDecodeError. Other possible values are 'ignore' and 'replace'\n\
6372as well as any other name registerd with codecs.register_error that is\n\
6373able to handle UnicodeDecodeErrors.");
6374
6375static PyObject *
6376unicode_decode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
6377{
6378    static char *kwlist[] = {"encoding", "errors", 0};
6379    char *encoding = NULL;
6380    char *errors = NULL;
6381    PyObject *v;
6382
6383    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:decode",
6384                                     kwlist, &encoding, &errors))
6385        return NULL;
6386    v = PyUnicode_AsDecodedObject((PyObject *)self, encoding, errors);
6387    if (v == NULL)
6388        goto onError;
6389    if (!PyString_Check(v) && !PyUnicode_Check(v)) {
6390        PyErr_Format(PyExc_TypeError,
6391                     "decoder did not return a string/unicode object "
6392                     "(type=%.400s)",
6393                     Py_TYPE(v)->tp_name);
6394        Py_DECREF(v);
6395        return NULL;
6396    }
6397    return v;
6398
6399  onError:
6400    return NULL;
6401}
6402
6403PyDoc_STRVAR(expandtabs__doc__,
6404             "S.expandtabs([tabsize]) -> unicode\n\
6405\n\
6406Return a copy of S where all tab characters are expanded using spaces.\n\
6407If tabsize is not given, a tab size of 8 characters is assumed.");
6408
6409static PyObject*
6410unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
6411{
6412    Py_UNICODE *e;
6413    Py_UNICODE *p;
6414    Py_UNICODE *q;
6415    Py_UNICODE *qe;
6416    Py_ssize_t i, j, incr;
6417    PyUnicodeObject *u;
6418    int tabsize = 8;
6419
6420    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
6421        return NULL;
6422
6423    /* First pass: determine size of output string */
6424    i = 0; /* chars up to and including most recent \n or \r */
6425    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
6426    e = self->str + self->length; /* end of input */
6427    for (p = self->str; p < e; p++)
6428        if (*p == '\t') {
6429            if (tabsize > 0) {
6430                incr = tabsize - (j % tabsize); /* cannot overflow */
6431                if (j > PY_SSIZE_T_MAX - incr)
6432                    goto overflow1;
6433                j += incr;
6434            }
6435        }
6436        else {
6437            if (j > PY_SSIZE_T_MAX - 1)
6438                goto overflow1;
6439            j++;
6440            if (*p == '\n' || *p == '\r') {
6441                if (i > PY_SSIZE_T_MAX - j)
6442                    goto overflow1;
6443                i += j;
6444                j = 0;
6445            }
6446        }
6447
6448    if (i > PY_SSIZE_T_MAX - j)
6449        goto overflow1;
6450
6451    /* Second pass: create output string and fill it */
6452    u = _PyUnicode_New(i + j);
6453    if (!u)
6454        return NULL;
6455
6456    j = 0; /* same as in first pass */
6457    q = u->str; /* next output char */
6458    qe = u->str + u->length; /* end of output */
6459
6460    for (p = self->str; p < e; p++)
6461        if (*p == '\t') {
6462            if (tabsize > 0) {
6463                i = tabsize - (j % tabsize);
6464                j += i;
6465                while (i--) {
6466                    if (q >= qe)
6467                        goto overflow2;
6468                    *q++ = ' ';
6469                }
6470            }
6471        }
6472        else {
6473            if (q >= qe)
6474                goto overflow2;
6475            *q++ = *p;
6476            j++;
6477            if (*p == '\n' || *p == '\r')
6478                j = 0;
6479        }
6480
6481    return (PyObject*) u;
6482
6483  overflow2:
6484    Py_DECREF(u);
6485  overflow1:
6486    PyErr_SetString(PyExc_OverflowError, "new string is too long");
6487    return NULL;
6488}
6489
6490PyDoc_STRVAR(find__doc__,
6491             "S.find(sub [,start [,end]]) -> int\n\
6492\n\
6493Return the lowest index in S where substring sub is found,\n\
6494such that sub is contained within s[start:end].  Optional\n\
6495arguments start and end are interpreted as in slice notation.\n\
6496\n\
6497Return -1 on failure.");
6498
6499static PyObject *
6500unicode_find(PyUnicodeObject *self, PyObject *args)
6501{
6502    PyUnicodeObject *substring;
6503    Py_ssize_t start;
6504    Py_ssize_t end;
6505    Py_ssize_t result;
6506
6507    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
6508                                            &start, &end))
6509        return NULL;
6510
6511    result = stringlib_find_slice(
6512        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6513        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6514        start, end
6515        );
6516
6517    Py_DECREF(substring);
6518
6519    return PyInt_FromSsize_t(result);
6520}
6521
6522static PyObject *
6523unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
6524{
6525    if (index < 0 || index >= self->length) {
6526        PyErr_SetString(PyExc_IndexError, "string index out of range");
6527        return NULL;
6528    }
6529
6530    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
6531}
6532
6533static long
6534unicode_hash(PyUnicodeObject *self)
6535{
6536    /* Since Unicode objects compare equal to their ASCII string
6537       counterparts, they should use the individual character values
6538       as basis for their hash value.  This is needed to assure that
6539       strings and Unicode objects behave in the same way as
6540       dictionary keys. */
6541
6542    register Py_ssize_t len;
6543    register Py_UNICODE *p;
6544    register long x;
6545
6546    if (self->hash != -1)
6547        return self->hash;
6548    len = PyUnicode_GET_SIZE(self);
6549    p = PyUnicode_AS_UNICODE(self);
6550    x = *p << 7;
6551    while (--len >= 0)
6552        x = (1000003*x) ^ *p++;
6553    x ^= PyUnicode_GET_SIZE(self);
6554    if (x == -1)
6555        x = -2;
6556    self->hash = x;
6557    return x;
6558}
6559
6560PyDoc_STRVAR(index__doc__,
6561             "S.index(sub [,start [,end]]) -> int\n\
6562\n\
6563Like S.find() but raise ValueError when the substring is not found.");
6564
6565static PyObject *
6566unicode_index(PyUnicodeObject *self, PyObject *args)
6567{
6568    Py_ssize_t result;
6569    PyUnicodeObject *substring;
6570    Py_ssize_t start;
6571    Py_ssize_t end;
6572
6573    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
6574                                            &start, &end))
6575        return NULL;
6576
6577    result = stringlib_find_slice(
6578        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
6579        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
6580        start, end
6581        );
6582
6583    Py_DECREF(substring);
6584
6585    if (result < 0) {
6586        PyErr_SetString(PyExc_ValueError, "substring not found");
6587        return NULL;
6588    }
6589
6590    return PyInt_FromSsize_t(result);
6591}
6592
6593PyDoc_STRVAR(islower__doc__,
6594             "S.islower() -> bool\n\
6595\n\
6596Return True if all cased characters in S are lowercase and there is\n\
6597at least one cased character in S, False otherwise.");
6598
6599static PyObject*
6600unicode_islower(PyUnicodeObject *self)
6601{
6602    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6603    register const Py_UNICODE *e;
6604    int cased;
6605
6606    /* Shortcut for single character strings */
6607    if (PyUnicode_GET_SIZE(self) == 1)
6608        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
6609
6610    /* Special case for empty strings */
6611    if (PyUnicode_GET_SIZE(self) == 0)
6612        return PyBool_FromLong(0);
6613
6614    e = p + PyUnicode_GET_SIZE(self);
6615    cased = 0;
6616    for (; p < e; p++) {
6617        register const Py_UNICODE ch = *p;
6618
6619        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
6620            return PyBool_FromLong(0);
6621        else if (!cased && Py_UNICODE_ISLOWER(ch))
6622            cased = 1;
6623    }
6624    return PyBool_FromLong(cased);
6625}
6626
6627PyDoc_STRVAR(isupper__doc__,
6628             "S.isupper() -> bool\n\
6629\n\
6630Return True if all cased characters in S are uppercase and there is\n\
6631at least one cased character in S, False otherwise.");
6632
6633static PyObject*
6634unicode_isupper(PyUnicodeObject *self)
6635{
6636    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6637    register const Py_UNICODE *e;
6638    int cased;
6639
6640    /* Shortcut for single character strings */
6641    if (PyUnicode_GET_SIZE(self) == 1)
6642        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
6643
6644    /* Special case for empty strings */
6645    if (PyUnicode_GET_SIZE(self) == 0)
6646        return PyBool_FromLong(0);
6647
6648    e = p + PyUnicode_GET_SIZE(self);
6649    cased = 0;
6650    for (; p < e; p++) {
6651        register const Py_UNICODE ch = *p;
6652
6653        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
6654            return PyBool_FromLong(0);
6655        else if (!cased && Py_UNICODE_ISUPPER(ch))
6656            cased = 1;
6657    }
6658    return PyBool_FromLong(cased);
6659}
6660
6661PyDoc_STRVAR(istitle__doc__,
6662             "S.istitle() -> bool\n\
6663\n\
6664Return True if S is a titlecased string and there is at least one\n\
6665character in S, i.e. upper- and titlecase characters may only\n\
6666follow uncased characters and lowercase characters only cased ones.\n\
6667Return False otherwise.");
6668
6669static PyObject*
6670unicode_istitle(PyUnicodeObject *self)
6671{
6672    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6673    register const Py_UNICODE *e;
6674    int cased, previous_is_cased;
6675
6676    /* Shortcut for single character strings */
6677    if (PyUnicode_GET_SIZE(self) == 1)
6678        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
6679                               (Py_UNICODE_ISUPPER(*p) != 0));
6680
6681    /* Special case for empty strings */
6682    if (PyUnicode_GET_SIZE(self) == 0)
6683        return PyBool_FromLong(0);
6684
6685    e = p + PyUnicode_GET_SIZE(self);
6686    cased = 0;
6687    previous_is_cased = 0;
6688    for (; p < e; p++) {
6689        register const Py_UNICODE ch = *p;
6690
6691        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
6692            if (previous_is_cased)
6693                return PyBool_FromLong(0);
6694            previous_is_cased = 1;
6695            cased = 1;
6696        }
6697        else if (Py_UNICODE_ISLOWER(ch)) {
6698            if (!previous_is_cased)
6699                return PyBool_FromLong(0);
6700            previous_is_cased = 1;
6701            cased = 1;
6702        }
6703        else
6704            previous_is_cased = 0;
6705    }
6706    return PyBool_FromLong(cased);
6707}
6708
6709PyDoc_STRVAR(isspace__doc__,
6710             "S.isspace() -> bool\n\
6711\n\
6712Return True if all characters in S are whitespace\n\
6713and there is at least one character in S, False otherwise.");
6714
6715static PyObject*
6716unicode_isspace(PyUnicodeObject *self)
6717{
6718    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6719    register const Py_UNICODE *e;
6720
6721    /* Shortcut for single character strings */
6722    if (PyUnicode_GET_SIZE(self) == 1 &&
6723        Py_UNICODE_ISSPACE(*p))
6724        return PyBool_FromLong(1);
6725
6726    /* Special case for empty strings */
6727    if (PyUnicode_GET_SIZE(self) == 0)
6728        return PyBool_FromLong(0);
6729
6730    e = p + PyUnicode_GET_SIZE(self);
6731    for (; p < e; p++) {
6732        if (!Py_UNICODE_ISSPACE(*p))
6733            return PyBool_FromLong(0);
6734    }
6735    return PyBool_FromLong(1);
6736}
6737
6738PyDoc_STRVAR(isalpha__doc__,
6739             "S.isalpha() -> bool\n\
6740\n\
6741Return True if all characters in S are alphabetic\n\
6742and there is at least one character in S, False otherwise.");
6743
6744static PyObject*
6745unicode_isalpha(PyUnicodeObject *self)
6746{
6747    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6748    register const Py_UNICODE *e;
6749
6750    /* Shortcut for single character strings */
6751    if (PyUnicode_GET_SIZE(self) == 1 &&
6752        Py_UNICODE_ISALPHA(*p))
6753        return PyBool_FromLong(1);
6754
6755    /* Special case for empty strings */
6756    if (PyUnicode_GET_SIZE(self) == 0)
6757        return PyBool_FromLong(0);
6758
6759    e = p + PyUnicode_GET_SIZE(self);
6760    for (; p < e; p++) {
6761        if (!Py_UNICODE_ISALPHA(*p))
6762            return PyBool_FromLong(0);
6763    }
6764    return PyBool_FromLong(1);
6765}
6766
6767PyDoc_STRVAR(isalnum__doc__,
6768             "S.isalnum() -> bool\n\
6769\n\
6770Return True if all characters in S are alphanumeric\n\
6771and there is at least one character in S, False otherwise.");
6772
6773static PyObject*
6774unicode_isalnum(PyUnicodeObject *self)
6775{
6776    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6777    register const Py_UNICODE *e;
6778
6779    /* Shortcut for single character strings */
6780    if (PyUnicode_GET_SIZE(self) == 1 &&
6781        Py_UNICODE_ISALNUM(*p))
6782        return PyBool_FromLong(1);
6783
6784    /* Special case for empty strings */
6785    if (PyUnicode_GET_SIZE(self) == 0)
6786        return PyBool_FromLong(0);
6787
6788    e = p + PyUnicode_GET_SIZE(self);
6789    for (; p < e; p++) {
6790        if (!Py_UNICODE_ISALNUM(*p))
6791            return PyBool_FromLong(0);
6792    }
6793    return PyBool_FromLong(1);
6794}
6795
6796PyDoc_STRVAR(isdecimal__doc__,
6797             "S.isdecimal() -> bool\n\
6798\n\
6799Return True if there are only decimal characters in S,\n\
6800False otherwise.");
6801
6802static PyObject*
6803unicode_isdecimal(PyUnicodeObject *self)
6804{
6805    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6806    register const Py_UNICODE *e;
6807
6808    /* Shortcut for single character strings */
6809    if (PyUnicode_GET_SIZE(self) == 1 &&
6810        Py_UNICODE_ISDECIMAL(*p))
6811        return PyBool_FromLong(1);
6812
6813    /* Special case for empty strings */
6814    if (PyUnicode_GET_SIZE(self) == 0)
6815        return PyBool_FromLong(0);
6816
6817    e = p + PyUnicode_GET_SIZE(self);
6818    for (; p < e; p++) {
6819        if (!Py_UNICODE_ISDECIMAL(*p))
6820            return PyBool_FromLong(0);
6821    }
6822    return PyBool_FromLong(1);
6823}
6824
6825PyDoc_STRVAR(isdigit__doc__,
6826             "S.isdigit() -> bool\n\
6827\n\
6828Return True if all characters in S are digits\n\
6829and there is at least one character in S, False otherwise.");
6830
6831static PyObject*
6832unicode_isdigit(PyUnicodeObject *self)
6833{
6834    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6835    register const Py_UNICODE *e;
6836
6837    /* Shortcut for single character strings */
6838    if (PyUnicode_GET_SIZE(self) == 1 &&
6839        Py_UNICODE_ISDIGIT(*p))
6840        return PyBool_FromLong(1);
6841
6842    /* Special case for empty strings */
6843    if (PyUnicode_GET_SIZE(self) == 0)
6844        return PyBool_FromLong(0);
6845
6846    e = p + PyUnicode_GET_SIZE(self);
6847    for (; p < e; p++) {
6848        if (!Py_UNICODE_ISDIGIT(*p))
6849            return PyBool_FromLong(0);
6850    }
6851    return PyBool_FromLong(1);
6852}
6853
6854PyDoc_STRVAR(isnumeric__doc__,
6855             "S.isnumeric() -> bool\n\
6856\n\
6857Return True if there are only numeric characters in S,\n\
6858False otherwise.");
6859
6860static PyObject*
6861unicode_isnumeric(PyUnicodeObject *self)
6862{
6863    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6864    register const Py_UNICODE *e;
6865
6866    /* Shortcut for single character strings */
6867    if (PyUnicode_GET_SIZE(self) == 1 &&
6868        Py_UNICODE_ISNUMERIC(*p))
6869        return PyBool_FromLong(1);
6870
6871    /* Special case for empty strings */
6872    if (PyUnicode_GET_SIZE(self) == 0)
6873        return PyBool_FromLong(0);
6874
6875    e = p + PyUnicode_GET_SIZE(self);
6876    for (; p < e; p++) {
6877        if (!Py_UNICODE_ISNUMERIC(*p))
6878            return PyBool_FromLong(0);
6879    }
6880    return PyBool_FromLong(1);
6881}
6882
6883PyDoc_STRVAR(join__doc__,
6884             "S.join(iterable) -> unicode\n\
6885\n\
6886Return a string which is the concatenation of the strings in the\n\
6887iterable.  The separator between elements is S.");
6888
6889static PyObject*
6890unicode_join(PyObject *self, PyObject *data)
6891{
6892    return PyUnicode_Join(self, data);
6893}
6894
6895static Py_ssize_t
6896unicode_length(PyUnicodeObject *self)
6897{
6898    return self->length;
6899}
6900
6901PyDoc_STRVAR(ljust__doc__,
6902             "S.ljust(width[, fillchar]) -> int\n\
6903\n\
6904Return S left-justified in a Unicode string of length width. Padding is\n\
6905done using the specified fill character (default is a space).");
6906
6907static PyObject *
6908unicode_ljust(PyUnicodeObject *self, PyObject *args)
6909{
6910    Py_ssize_t width;
6911    Py_UNICODE fillchar = ' ';
6912
6913    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
6914        return NULL;
6915
6916    if (self->length >= width && PyUnicode_CheckExact(self)) {
6917        Py_INCREF(self);
6918        return (PyObject*) self;
6919    }
6920
6921    return (PyObject*) pad(self, 0, width - self->length, fillchar);
6922}
6923
6924PyDoc_STRVAR(lower__doc__,
6925             "S.lower() -> unicode\n\
6926\n\
6927Return a copy of the string S converted to lowercase.");
6928
6929static PyObject*
6930unicode_lower(PyUnicodeObject *self)
6931{
6932    return fixup(self, fixlower);
6933}
6934
6935#define LEFTSTRIP 0
6936#define RIGHTSTRIP 1
6937#define BOTHSTRIP 2
6938
6939/* Arrays indexed by above */
6940static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
6941
6942#define STRIPNAME(i) (stripformat[i]+3)
6943
6944/* externally visible for str.strip(unicode) */
6945PyObject *
6946_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
6947{
6948    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6949    Py_ssize_t len = PyUnicode_GET_SIZE(self);
6950    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
6951    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
6952    Py_ssize_t i, j;
6953
6954    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
6955
6956    i = 0;
6957    if (striptype != RIGHTSTRIP) {
6958        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
6959            i++;
6960        }
6961    }
6962
6963    j = len;
6964    if (striptype != LEFTSTRIP) {
6965        do {
6966            j--;
6967        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
6968        j++;
6969    }
6970
6971    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
6972        Py_INCREF(self);
6973        return (PyObject*)self;
6974    }
6975    else
6976        return PyUnicode_FromUnicode(s+i, j-i);
6977}
6978
6979
6980static PyObject *
6981do_strip(PyUnicodeObject *self, int striptype)
6982{
6983    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
6984    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
6985
6986    i = 0;
6987    if (striptype != RIGHTSTRIP) {
6988        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
6989            i++;
6990        }
6991    }
6992
6993    j = len;
6994    if (striptype != LEFTSTRIP) {
6995        do {
6996            j--;
6997        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
6998        j++;
6999    }
7000
7001    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7002        Py_INCREF(self);
7003        return (PyObject*)self;
7004    }
7005    else
7006        return PyUnicode_FromUnicode(s+i, j-i);
7007}
7008
7009
7010static PyObject *
7011do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7012{
7013    PyObject *sep = NULL;
7014
7015    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7016        return NULL;
7017
7018    if (sep != NULL && sep != Py_None) {
7019        if (PyUnicode_Check(sep))
7020            return _PyUnicode_XStrip(self, striptype, sep);
7021        else if (PyString_Check(sep)) {
7022            PyObject *res;
7023            sep = PyUnicode_FromObject(sep);
7024            if (sep==NULL)
7025                return NULL;
7026            res = _PyUnicode_XStrip(self, striptype, sep);
7027            Py_DECREF(sep);
7028            return res;
7029        }
7030        else {
7031            PyErr_Format(PyExc_TypeError,
7032                         "%s arg must be None, unicode or str",
7033                         STRIPNAME(striptype));
7034            return NULL;
7035        }
7036    }
7037
7038    return do_strip(self, striptype);
7039}
7040
7041
7042PyDoc_STRVAR(strip__doc__,
7043             "S.strip([chars]) -> unicode\n\
7044\n\
7045Return a copy of the string S with leading and trailing\n\
7046whitespace removed.\n\
7047If chars is given and not None, remove characters in chars instead.\n\
7048If chars is a str, it will be converted to unicode before stripping");
7049
7050static PyObject *
7051unicode_strip(PyUnicodeObject *self, PyObject *args)
7052{
7053    if (PyTuple_GET_SIZE(args) == 0)
7054        return do_strip(self, BOTHSTRIP); /* Common case */
7055    else
7056        return do_argstrip(self, BOTHSTRIP, args);
7057}
7058
7059
7060PyDoc_STRVAR(lstrip__doc__,
7061             "S.lstrip([chars]) -> unicode\n\
7062\n\
7063Return a copy of the string S with leading whitespace removed.\n\
7064If chars is given and not None, remove characters in chars instead.\n\
7065If chars is a str, it will be converted to unicode before stripping");
7066
7067static PyObject *
7068unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7069{
7070    if (PyTuple_GET_SIZE(args) == 0)
7071        return do_strip(self, LEFTSTRIP); /* Common case */
7072    else
7073        return do_argstrip(self, LEFTSTRIP, args);
7074}
7075
7076
7077PyDoc_STRVAR(rstrip__doc__,
7078             "S.rstrip([chars]) -> unicode\n\
7079\n\
7080Return a copy of the string S with trailing whitespace removed.\n\
7081If chars is given and not None, remove characters in chars instead.\n\
7082If chars is a str, it will be converted to unicode before stripping");
7083
7084static PyObject *
7085unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7086{
7087    if (PyTuple_GET_SIZE(args) == 0)
7088        return do_strip(self, RIGHTSTRIP); /* Common case */
7089    else
7090        return do_argstrip(self, RIGHTSTRIP, args);
7091}
7092
7093
7094static PyObject*
7095unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7096{
7097    PyUnicodeObject *u;
7098    Py_UNICODE *p;
7099    Py_ssize_t nchars;
7100    size_t nbytes;
7101
7102    if (len < 0)
7103        len = 0;
7104
7105    if (len == 1 && PyUnicode_CheckExact(str)) {
7106        /* no repeat, return original string */
7107        Py_INCREF(str);
7108        return (PyObject*) str;
7109    }
7110
7111    /* ensure # of chars needed doesn't overflow int and # of bytes
7112     * needed doesn't overflow size_t
7113     */
7114    nchars = len * str->length;
7115    if (len && nchars / len != str->length) {
7116        PyErr_SetString(PyExc_OverflowError,
7117                        "repeated string is too long");
7118        return NULL;
7119    }
7120    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7121    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7122        PyErr_SetString(PyExc_OverflowError,
7123                        "repeated string is too long");
7124        return NULL;
7125    }
7126    u = _PyUnicode_New(nchars);
7127    if (!u)
7128        return NULL;
7129
7130    p = u->str;
7131
7132    if (str->length == 1 && len > 0) {
7133        Py_UNICODE_FILL(p, str->str[0], len);
7134    } else {
7135        Py_ssize_t done = 0; /* number of characters copied this far */
7136        if (done < nchars) {
7137            Py_UNICODE_COPY(p, str->str, str->length);
7138            done = str->length;
7139        }
7140        while (done < nchars) {
7141            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7142            Py_UNICODE_COPY(p+done, p, n);
7143            done += n;
7144        }
7145    }
7146
7147    return (PyObject*) u;
7148}
7149
7150PyObject *PyUnicode_Replace(PyObject *obj,
7151                            PyObject *subobj,
7152                            PyObject *replobj,
7153                            Py_ssize_t maxcount)
7154{
7155    PyObject *self;
7156    PyObject *str1;
7157    PyObject *str2;
7158    PyObject *result;
7159
7160    self = PyUnicode_FromObject(obj);
7161    if (self == NULL)
7162        return NULL;
7163    str1 = PyUnicode_FromObject(subobj);
7164    if (str1 == NULL) {
7165        Py_DECREF(self);
7166        return NULL;
7167    }
7168    str2 = PyUnicode_FromObject(replobj);
7169    if (str2 == NULL) {
7170        Py_DECREF(self);
7171        Py_DECREF(str1);
7172        return NULL;
7173    }
7174    result = replace((PyUnicodeObject *)self,
7175                     (PyUnicodeObject *)str1,
7176                     (PyUnicodeObject *)str2,
7177                     maxcount);
7178    Py_DECREF(self);
7179    Py_DECREF(str1);
7180    Py_DECREF(str2);
7181    return result;
7182}
7183
7184PyDoc_STRVAR(replace__doc__,
7185             "S.replace(old, new[, count]) -> unicode\n\
7186\n\
7187Return a copy of S with all occurrences of substring\n\
7188old replaced by new.  If the optional argument count is\n\
7189given, only the first count occurrences are replaced.");
7190
7191static PyObject*
7192unicode_replace(PyUnicodeObject *self, PyObject *args)
7193{
7194    PyUnicodeObject *str1;
7195    PyUnicodeObject *str2;
7196    Py_ssize_t maxcount = -1;
7197    PyObject *result;
7198
7199    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
7200        return NULL;
7201    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
7202    if (str1 == NULL)
7203        return NULL;
7204    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
7205    if (str2 == NULL) {
7206        Py_DECREF(str1);
7207        return NULL;
7208    }
7209
7210    result = replace(self, str1, str2, maxcount);
7211
7212    Py_DECREF(str1);
7213    Py_DECREF(str2);
7214    return result;
7215}
7216
7217static
7218PyObject *unicode_repr(PyObject *unicode)
7219{
7220    return unicodeescape_string(PyUnicode_AS_UNICODE(unicode),
7221                                PyUnicode_GET_SIZE(unicode),
7222                                1);
7223}
7224
7225PyDoc_STRVAR(rfind__doc__,
7226             "S.rfind(sub [,start [,end]]) -> int\n\
7227\n\
7228Return the highest index in S where substring sub is found,\n\
7229such that sub is contained within s[start:end].  Optional\n\
7230arguments start and end are interpreted as in slice notation.\n\
7231\n\
7232Return -1 on failure.");
7233
7234static PyObject *
7235unicode_rfind(PyUnicodeObject *self, PyObject *args)
7236{
7237    PyUnicodeObject *substring;
7238    Py_ssize_t start;
7239    Py_ssize_t end;
7240    Py_ssize_t result;
7241
7242    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
7243                                            &start, &end))
7244        return NULL;
7245
7246    result = stringlib_rfind_slice(
7247        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7248        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7249        start, end
7250        );
7251
7252    Py_DECREF(substring);
7253
7254    return PyInt_FromSsize_t(result);
7255}
7256
7257PyDoc_STRVAR(rindex__doc__,
7258             "S.rindex(sub [,start [,end]]) -> int\n\
7259\n\
7260Like S.rfind() but raise ValueError when the substring is not found.");
7261
7262static PyObject *
7263unicode_rindex(PyUnicodeObject *self, PyObject *args)
7264{
7265    PyUnicodeObject *substring;
7266    Py_ssize_t start;
7267    Py_ssize_t end;
7268    Py_ssize_t result;
7269
7270    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
7271                                            &start, &end))
7272        return NULL;
7273
7274    result = stringlib_rfind_slice(
7275        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7276        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7277        start, end
7278        );
7279
7280    Py_DECREF(substring);
7281
7282    if (result < 0) {
7283        PyErr_SetString(PyExc_ValueError, "substring not found");
7284        return NULL;
7285    }
7286    return PyInt_FromSsize_t(result);
7287}
7288
7289PyDoc_STRVAR(rjust__doc__,
7290             "S.rjust(width[, fillchar]) -> unicode\n\
7291\n\
7292Return S right-justified in a Unicode string of length width. Padding is\n\
7293done using the specified fill character (default is a space).");
7294
7295static PyObject *
7296unicode_rjust(PyUnicodeObject *self, PyObject *args)
7297{
7298    Py_ssize_t width;
7299    Py_UNICODE fillchar = ' ';
7300
7301    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
7302        return NULL;
7303
7304    if (self->length >= width && PyUnicode_CheckExact(self)) {
7305        Py_INCREF(self);
7306        return (PyObject*) self;
7307    }
7308
7309    return (PyObject*) pad(self, width - self->length, 0, fillchar);
7310}
7311
7312static PyObject*
7313unicode_slice(PyUnicodeObject *self, Py_ssize_t start, Py_ssize_t end)
7314{
7315    /* standard clamping */
7316    if (start < 0)
7317        start = 0;
7318    if (end < 0)
7319        end = 0;
7320    if (end > self->length)
7321        end = self->length;
7322    if (start == 0 && end == self->length && PyUnicode_CheckExact(self)) {
7323        /* full slice, return original string */
7324        Py_INCREF(self);
7325        return (PyObject*) self;
7326    }
7327    if (start > end)
7328        start = end;
7329    /* copy slice */
7330    return (PyObject*) PyUnicode_FromUnicode(self->str + start,
7331                                             end - start);
7332}
7333
7334PyObject *PyUnicode_Split(PyObject *s,
7335                          PyObject *sep,
7336                          Py_ssize_t maxsplit)
7337{
7338    PyObject *result;
7339
7340    s = PyUnicode_FromObject(s);
7341    if (s == NULL)
7342        return NULL;
7343    if (sep != NULL) {
7344        sep = PyUnicode_FromObject(sep);
7345        if (sep == NULL) {
7346            Py_DECREF(s);
7347            return NULL;
7348        }
7349    }
7350
7351    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7352
7353    Py_DECREF(s);
7354    Py_XDECREF(sep);
7355    return result;
7356}
7357
7358PyDoc_STRVAR(split__doc__,
7359             "S.split([sep [,maxsplit]]) -> list of strings\n\
7360\n\
7361Return a list of the words in S, using sep as the\n\
7362delimiter string.  If maxsplit is given, at most maxsplit\n\
7363splits are done. If sep is not specified or is None, any\n\
7364whitespace string is a separator and empty strings are\n\
7365removed from the result.");
7366
7367static PyObject*
7368unicode_split(PyUnicodeObject *self, PyObject *args)
7369{
7370    PyObject *substring = Py_None;
7371    Py_ssize_t maxcount = -1;
7372
7373    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
7374        return NULL;
7375
7376    if (substring == Py_None)
7377        return split(self, NULL, maxcount);
7378    else if (PyUnicode_Check(substring))
7379        return split(self, (PyUnicodeObject *)substring, maxcount);
7380    else
7381        return PyUnicode_Split((PyObject *)self, substring, maxcount);
7382}
7383
7384PyObject *
7385PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
7386{
7387    PyObject* str_obj;
7388    PyObject* sep_obj;
7389    PyObject* out;
7390
7391    str_obj = PyUnicode_FromObject(str_in);
7392    if (!str_obj)
7393        return NULL;
7394    sep_obj = PyUnicode_FromObject(sep_in);
7395    if (!sep_obj) {
7396        Py_DECREF(str_obj);
7397        return NULL;
7398    }
7399
7400    out = stringlib_partition(
7401        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7402        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7403        );
7404
7405    Py_DECREF(sep_obj);
7406    Py_DECREF(str_obj);
7407
7408    return out;
7409}
7410
7411
7412PyObject *
7413PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
7414{
7415    PyObject* str_obj;
7416    PyObject* sep_obj;
7417    PyObject* out;
7418
7419    str_obj = PyUnicode_FromObject(str_in);
7420    if (!str_obj)
7421        return NULL;
7422    sep_obj = PyUnicode_FromObject(sep_in);
7423    if (!sep_obj) {
7424        Py_DECREF(str_obj);
7425        return NULL;
7426    }
7427
7428    out = stringlib_rpartition(
7429        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
7430        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
7431        );
7432
7433    Py_DECREF(sep_obj);
7434    Py_DECREF(str_obj);
7435
7436    return out;
7437}
7438
7439PyDoc_STRVAR(partition__doc__,
7440             "S.partition(sep) -> (head, sep, tail)\n\
7441\n\
7442Search for the separator sep in S, and return the part before it,\n\
7443the separator itself, and the part after it.  If the separator is not\n\
7444found, return S and two empty strings.");
7445
7446static PyObject*
7447unicode_partition(PyUnicodeObject *self, PyObject *separator)
7448{
7449    return PyUnicode_Partition((PyObject *)self, separator);
7450}
7451
7452PyDoc_STRVAR(rpartition__doc__,
7453             "S.rpartition(sep) -> (head, sep, tail)\n\
7454\n\
7455Search for the separator sep in S, starting at the end of S, and return\n\
7456the part before it, the separator itself, and the part after it.  If the\n\
7457separator is not found, return two empty strings and S.");
7458
7459static PyObject*
7460unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
7461{
7462    return PyUnicode_RPartition((PyObject *)self, separator);
7463}
7464
7465PyObject *PyUnicode_RSplit(PyObject *s,
7466                           PyObject *sep,
7467                           Py_ssize_t maxsplit)
7468{
7469    PyObject *result;
7470
7471    s = PyUnicode_FromObject(s);
7472    if (s == NULL)
7473        return NULL;
7474    if (sep != NULL) {
7475        sep = PyUnicode_FromObject(sep);
7476        if (sep == NULL) {
7477            Py_DECREF(s);
7478            return NULL;
7479        }
7480    }
7481
7482    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
7483
7484    Py_DECREF(s);
7485    Py_XDECREF(sep);
7486    return result;
7487}
7488
7489PyDoc_STRVAR(rsplit__doc__,
7490             "S.rsplit([sep [,maxsplit]]) -> list of strings\n\
7491\n\
7492Return a list of the words in S, using sep as the\n\
7493delimiter string, starting at the end of the string and\n\
7494working to the front.  If maxsplit is given, at most maxsplit\n\
7495splits are done. If sep is not specified, any whitespace string\n\
7496is a separator.");
7497
7498static PyObject*
7499unicode_rsplit(PyUnicodeObject *self, PyObject *args)
7500{
7501    PyObject *substring = Py_None;
7502    Py_ssize_t maxcount = -1;
7503
7504    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
7505        return NULL;
7506
7507    if (substring == Py_None)
7508        return rsplit(self, NULL, maxcount);
7509    else if (PyUnicode_Check(substring))
7510        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
7511    else
7512        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
7513}
7514
7515PyDoc_STRVAR(splitlines__doc__,
7516             "S.splitlines([keepends]) -> list of strings\n\
7517\n\
7518Return a list of the lines in S, breaking at line boundaries.\n\
7519Line breaks are not included in the resulting list unless keepends\n\
7520is given and true.");
7521
7522static PyObject*
7523unicode_splitlines(PyUnicodeObject *self, PyObject *args)
7524{
7525    int keepends = 0;
7526
7527    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
7528        return NULL;
7529
7530    return PyUnicode_Splitlines((PyObject *)self, keepends);
7531}
7532
7533static
7534PyObject *unicode_str(PyUnicodeObject *self)
7535{
7536    return PyUnicode_AsEncodedString((PyObject *)self, NULL, NULL);
7537}
7538
7539PyDoc_STRVAR(swapcase__doc__,
7540             "S.swapcase() -> unicode\n\
7541\n\
7542Return a copy of S with uppercase characters converted to lowercase\n\
7543and vice versa.");
7544
7545static PyObject*
7546unicode_swapcase(PyUnicodeObject *self)
7547{
7548    return fixup(self, fixswapcase);
7549}
7550
7551PyDoc_STRVAR(translate__doc__,
7552             "S.translate(table) -> unicode\n\
7553\n\
7554Return a copy of the string S, where all characters have been mapped\n\
7555through the given translation table, which must be a mapping of\n\
7556Unicode ordinals to Unicode ordinals, Unicode strings or None.\n\
7557Unmapped characters are left untouched. Characters mapped to None\n\
7558are deleted.");
7559
7560static PyObject*
7561unicode_translate(PyUnicodeObject *self, PyObject *table)
7562{
7563    return PyUnicode_TranslateCharmap(self->str,
7564                                      self->length,
7565                                      table,
7566                                      "ignore");
7567}
7568
7569PyDoc_STRVAR(upper__doc__,
7570             "S.upper() -> unicode\n\
7571\n\
7572Return a copy of S converted to uppercase.");
7573
7574static PyObject*
7575unicode_upper(PyUnicodeObject *self)
7576{
7577    return fixup(self, fixupper);
7578}
7579
7580PyDoc_STRVAR(zfill__doc__,
7581             "S.zfill(width) -> unicode\n\
7582\n\
7583Pad a numeric string S with zeros on the left, to fill a field\n\
7584of the specified width. The string S is never truncated.");
7585
7586static PyObject *
7587unicode_zfill(PyUnicodeObject *self, PyObject *args)
7588{
7589    Py_ssize_t fill;
7590    PyUnicodeObject *u;
7591
7592    Py_ssize_t width;
7593    if (!PyArg_ParseTuple(args, "n:zfill", &width))
7594        return NULL;
7595
7596    if (self->length >= width) {
7597        if (PyUnicode_CheckExact(self)) {
7598            Py_INCREF(self);
7599            return (PyObject*) self;
7600        }
7601        else
7602            return PyUnicode_FromUnicode(
7603                PyUnicode_AS_UNICODE(self),
7604                PyUnicode_GET_SIZE(self)
7605                );
7606    }
7607
7608    fill = width - self->length;
7609
7610    u = pad(self, fill, 0, '0');
7611
7612    if (u == NULL)
7613        return NULL;
7614
7615    if (u->str[fill] == '+' || u->str[fill] == '-') {
7616        /* move sign to beginning of string */
7617        u->str[0] = u->str[fill];
7618        u->str[fill] = '0';
7619    }
7620
7621    return (PyObject*) u;
7622}
7623
#if 0
/* Compiled out by the enclosing "#if 0".  When enabled it returns the
   value of numfree as a Python int. */
static PyObject*
free_listsize(PyUnicodeObject *self)
{
    return PyInt_FromLong(numfree);
}
#endif
7631
7632PyDoc_STRVAR(startswith__doc__,
7633             "S.startswith(prefix[, start[, end]]) -> bool\n\
7634\n\
7635Return True if S starts with the specified prefix, False otherwise.\n\
7636With optional start, test S beginning at that position.\n\
7637With optional end, stop comparing S at that position.\n\
7638prefix can also be a tuple of strings to try.");
7639
7640static PyObject *
7641unicode_startswith(PyUnicodeObject *self,
7642                   PyObject *args)
7643{
7644    PyObject *subobj;
7645    PyUnicodeObject *substring;
7646    Py_ssize_t start = 0;
7647    Py_ssize_t end = PY_SSIZE_T_MAX;
7648    int result;
7649
7650    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
7651        return NULL;
7652    if (PyTuple_Check(subobj)) {
7653        Py_ssize_t i;
7654        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7655            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7656                PyTuple_GET_ITEM(subobj, i));
7657            if (substring == NULL)
7658                return NULL;
7659            result = tailmatch(self, substring, start, end, -1);
7660            Py_DECREF(substring);
7661            if (result) {
7662                Py_RETURN_TRUE;
7663            }
7664        }
7665        /* nothing matched */
7666        Py_RETURN_FALSE;
7667    }
7668    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7669    if (substring == NULL) {
7670        if (PyErr_ExceptionMatches(PyExc_TypeError))
7671            PyErr_Format(PyExc_TypeError, "startswith first arg must be str, "
7672                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7673        return NULL;
7674    }
7675    result = tailmatch(self, substring, start, end, -1);
7676    Py_DECREF(substring);
7677    return PyBool_FromLong(result);
7678}
7679
7680
7681PyDoc_STRVAR(endswith__doc__,
7682             "S.endswith(suffix[, start[, end]]) -> bool\n\
7683\n\
7684Return True if S ends with the specified suffix, False otherwise.\n\
7685With optional start, test S beginning at that position.\n\
7686With optional end, stop comparing S at that position.\n\
7687suffix can also be a tuple of strings to try.");
7688
7689static PyObject *
7690unicode_endswith(PyUnicodeObject *self,
7691                 PyObject *args)
7692{
7693    PyObject *subobj;
7694    PyUnicodeObject *substring;
7695    Py_ssize_t start = 0;
7696    Py_ssize_t end = PY_SSIZE_T_MAX;
7697    int result;
7698
7699    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
7700        return NULL;
7701    if (PyTuple_Check(subobj)) {
7702        Py_ssize_t i;
7703        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
7704            substring = (PyUnicodeObject *)PyUnicode_FromObject(
7705                PyTuple_GET_ITEM(subobj, i));
7706            if (substring == NULL)
7707                return NULL;
7708            result = tailmatch(self, substring, start, end, +1);
7709            Py_DECREF(substring);
7710            if (result) {
7711                Py_RETURN_TRUE;
7712            }
7713        }
7714        Py_RETURN_FALSE;
7715    }
7716    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
7717    if (substring == NULL) {
7718        if (PyErr_ExceptionMatches(PyExc_TypeError))
7719            PyErr_Format(PyExc_TypeError, "endswith first arg must be str, "
7720                         "unicode, or tuple, not %s", Py_TYPE(subobj)->tp_name);
7721        return NULL;
7722    }
7723    result = tailmatch(self, substring, start, end, +1);
7724    Py_DECREF(substring);
7725    return PyBool_FromLong(result);
7726}
7727
7728
7729/* Implements do_string_format, which is unicode because of stringlib */
7730#include "stringlib/string_format.h"
7731
/* Docstring attached to the "format" entry of unicode_methods, whose
   implementation is do_string_format. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> unicode\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");
7737
7738static PyObject *
7739unicode__format__(PyObject *self, PyObject *args)
7740{
7741    PyObject *format_spec;
7742    PyObject *result = NULL;
7743    PyObject *tmp = NULL;
7744
7745    /* If 2.x, convert format_spec to the same type as value */
7746    /* This is to allow things like u''.format('') */
7747    if (!PyArg_ParseTuple(args, "O:__format__", &format_spec))
7748        goto done;
7749    if (!(PyBytes_Check(format_spec) || PyUnicode_Check(format_spec))) {
7750        PyErr_Format(PyExc_TypeError, "__format__ arg must be str "
7751                     "or unicode, not %s", Py_TYPE(format_spec)->tp_name);
7752        goto done;
7753    }
7754    tmp = PyObject_Unicode(format_spec);
7755    if (tmp == NULL)
7756        goto done;
7757    format_spec = tmp;
7758
7759    result = _PyUnicode_FormatAdvanced(self,
7760                                       PyUnicode_AS_UNICODE(format_spec),
7761                                       PyUnicode_GET_SIZE(format_spec));
7762  done:
7763    Py_XDECREF(tmp);
7764    return result;
7765}
7766
7767PyDoc_STRVAR(p_format__doc__,
7768             "S.__format__(format_spec) -> unicode\n\
7769\n\
7770Return a formatted version of S as described by format_spec.");
7771
7772static PyObject *
7773unicode__sizeof__(PyUnicodeObject *v)
7774{
7775    return PyInt_FromSsize_t(sizeof(PyUnicodeObject) +
7776                             sizeof(Py_UNICODE) * (v->length + 1));
7777}
7778
7779PyDoc_STRVAR(sizeof__doc__,
7780             "S.__sizeof__() -> size of S in memory, in bytes\n\
7781\n\
7782");
7783
7784static PyObject *
7785unicode_getnewargs(PyUnicodeObject *v)
7786{
7787    return Py_BuildValue("(u#)", v->str, v->length);
7788}
7789
7790
/* Method table: each entry is {name, C implementation, calling-convention
   flags, docstring}.  Entries that give only three fields leave the
   docstring member zero-initialized.  The table ends with a {NULL, NULL}
   sentinel. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"decode", (PyCFunction) unicode_decode, METH_VARARGS | METH_KEYWORDS, decode__doc__},
/*  {"maketrans", (PyCFunction) unicode_maketrans, METH_VARARGS, maketrans__doc__}, */
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* This one is just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) free_listsize, METH_NOARGS},
#endif

    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
7853
7854static PyObject *
7855unicode_mod(PyObject *v, PyObject *w)
7856{
7857    if (!PyUnicode_Check(v)) {
7858        Py_INCREF(Py_NotImplemented);
7859        return Py_NotImplemented;
7860    }
7861    return PyUnicode_Format(v, w);
7862}
7863
/* Number protocol slots.  Only nb_remainder is set (to unicode_mod); the
   four slots before it are explicitly 0 and every slot after it is left
   to default zero-initialization. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    0,              /*nb_divide*/
    unicode_mod,            /*nb_remainder*/
};
7871
/* Sequence protocol slots.  Item and slice assignment are 0 (not
   provided); the initializers are positional, so their order must follow
   the member order of PySequenceMethods. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    (ssizessizeargfunc) unicode_slice,  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
7882
7883static PyObject*
7884unicode_subscript(PyUnicodeObject* self, PyObject* item)
7885{
7886    if (PyIndex_Check(item)) {
7887        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
7888        if (i == -1 && PyErr_Occurred())
7889            return NULL;
7890        if (i < 0)
7891            i += PyUnicode_GET_SIZE(self);
7892        return unicode_getitem(self, i);
7893    } else if (PySlice_Check(item)) {
7894        Py_ssize_t start, stop, step, slicelength, cur, i;
7895        Py_UNICODE* source_buf;
7896        Py_UNICODE* result_buf;
7897        PyObject* result;
7898
7899        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
7900                                 &start, &stop, &step, &slicelength) < 0) {
7901            return NULL;
7902        }
7903
7904        if (slicelength <= 0) {
7905            return PyUnicode_FromUnicode(NULL, 0);
7906        } else if (start == 0 && step == 1 && slicelength == self->length &&
7907                   PyUnicode_CheckExact(self)) {
7908            Py_INCREF(self);
7909            return (PyObject *)self;
7910        } else if (step == 1) {
7911            return PyUnicode_FromUnicode(self->str + start, slicelength);
7912        } else {
7913            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
7914            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
7915                                                       sizeof(Py_UNICODE));
7916
7917            if (result_buf == NULL)
7918                return PyErr_NoMemory();
7919
7920            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
7921                result_buf[i] = source_buf[cur];
7922            }
7923
7924            result = PyUnicode_FromUnicode(result_buf, slicelength);
7925            PyObject_FREE(result_buf);
7926            return result;
7927        }
7928    } else {
7929        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
7930        return NULL;
7931    }
7932}
7933
/* Mapping protocol slots: length and subscript are provided, subscript
   assignment is 0 (not provided). */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
7939
7940static Py_ssize_t
7941unicode_buffer_getreadbuf(PyUnicodeObject *self,
7942                          Py_ssize_t index,
7943                          const void **ptr)
7944{
7945    if (index != 0) {
7946        PyErr_SetString(PyExc_SystemError,
7947                        "accessing non-existent unicode segment");
7948        return -1;
7949    }
7950    *ptr = (void *) self->str;
7951    return PyUnicode_GET_DATA_SIZE(self);
7952}
7953
7954static Py_ssize_t
7955unicode_buffer_getwritebuf(PyUnicodeObject *self, Py_ssize_t index,
7956                           const void **ptr)
7957{
7958    PyErr_SetString(PyExc_TypeError,
7959                    "cannot use unicode as modifiable buffer");
7960    return -1;
7961}
7962
7963static int
7964unicode_buffer_getsegcount(PyUnicodeObject *self,
7965                           Py_ssize_t *lenp)
7966{
7967    if (lenp)
7968        *lenp = PyUnicode_GET_DATA_SIZE(self);
7969    return 1;
7970}
7971
7972static Py_ssize_t
7973unicode_buffer_getcharbuf(PyUnicodeObject *self,
7974                          Py_ssize_t index,
7975                          const void **ptr)
7976{
7977    PyObject *str;
7978
7979    if (index != 0) {
7980        PyErr_SetString(PyExc_SystemError,
7981                        "accessing non-existent unicode segment");
7982        return -1;
7983    }
7984    str = _PyUnicode_AsDefaultEncodedString((PyObject *)self, NULL);
7985    if (str == NULL)
7986        return -1;
7987    *ptr = (void *) PyString_AS_STRING(str);
7988    return PyString_GET_SIZE(str);
7989}
7990
7991/* Helpers for PyUnicode_Format() */
7992
7993static PyObject *
7994getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
7995{
7996    Py_ssize_t argidx = *p_argidx;
7997    if (argidx < arglen) {
7998        (*p_argidx)++;
7999        if (arglen < 0)
8000            return args;
8001        else
8002            return PyTuple_GetItem(args, argidx);
8003    }
8004    PyErr_SetString(PyExc_TypeError,
8005                    "not enough arguments for format string");
8006    return NULL;
8007}
8008
/* Bit flags accumulated in `flags` while PyUnicode_Format() scans the flag
   characters of a format specifier. */
#define F_LJUST (1<<0)  /* '-' flag; also set for a negative '*' width */
#define F_SIGN  (1<<1)  /* '+' flag */
#define F_BLANK (1<<2)  /* ' ' flag */
#define F_ALT   (1<<3)  /* '#' flag */
#define F_ZERO  (1<<4)  /* '0' flag */
8014
8015static Py_ssize_t
8016strtounicode(Py_UNICODE *buffer, const char *charbuffer)
8017{
8018    register Py_ssize_t i;
8019    Py_ssize_t len = strlen(charbuffer);
8020    for (i = len - 1; i >= 0; i--)
8021        buffer[i] = (Py_UNICODE) charbuffer[i];
8022
8023    return len;
8024}
8025
8026static int
8027longtounicode(Py_UNICODE *buffer, size_t len, const char *format, long x)
8028{
8029    Py_ssize_t result;
8030
8031    PyOS_snprintf((char *)buffer, len, format, x);
8032    result = strtounicode(buffer, (char *)buffer);
8033    return Py_SAFE_DOWNCAST(result, Py_ssize_t, int);
8034}
8035
8036/* XXX To save some code duplication, formatfloat/long/int could have been
8037   shared with stringobject.c, converting from 8-bit to Unicode after the
8038   formatting is done. */
8039
8040/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8041
8042static PyObject *
8043formatfloat(PyObject *v, int flags, int prec, int type)
8044{
8045    char *p;
8046    PyObject *result;
8047    double x;
8048
8049    x = PyFloat_AsDouble(v);
8050    if (x == -1.0 && PyErr_Occurred())
8051        return NULL;
8052
8053    if (prec < 0)
8054        prec = 6;
8055
8056    p = PyOS_double_to_string(x, type, prec,
8057                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8058    if (p == NULL)
8059        return NULL;
8060    result = PyUnicode_FromStringAndSize(p, strlen(p));
8061    PyMem_Free(p);
8062    return result;
8063}
8064
8065static PyObject*
8066formatlong(PyObject *val, int flags, int prec, int type)
8067{
8068    char *buf;
8069    int i, len;
8070    PyObject *str; /* temporary string object. */
8071    PyUnicodeObject *result;
8072
8073    str = _PyString_FormatLong(val, flags, prec, type, &buf, &len);
8074    if (!str)
8075        return NULL;
8076    result = _PyUnicode_New(len);
8077    if (!result) {
8078        Py_DECREF(str);
8079        return NULL;
8080    }
8081    for (i = 0; i < len; i++)
8082        result->str[i] = buf[i];
8083    result->str[len] = 0;
8084    Py_DECREF(str);
8085    return (PyObject*)result;
8086}
8087
8088static int
8089formatint(Py_UNICODE *buf,
8090          size_t buflen,
8091          int flags,
8092          int prec,
8093          int type,
8094          PyObject *v)
8095{
8096    /* fmt = '%#.' + `prec` + 'l' + `type`
8097     * worst case length = 3 + 19 (worst len of INT_MAX on 64-bit machine)
8098     *                     + 1 + 1
8099     *                   = 24
8100     */
8101    char fmt[64]; /* plenty big enough! */
8102    char *sign;
8103    long x;
8104
8105    x = PyInt_AsLong(v);
8106    if (x == -1 && PyErr_Occurred())
8107        return -1;
8108    if (x < 0 && type == 'u') {
8109        type = 'd';
8110    }
8111    if (x < 0 && (type == 'x' || type == 'X' || type == 'o'))
8112        sign = "-";
8113    else
8114        sign = "";
8115    if (prec < 0)
8116        prec = 1;
8117
8118    /* buf = '+'/'-'/'' + '0'/'0x'/'' + '[0-9]'*max(prec, len(x in octal))
8119     * worst case buf = '-0x' + [0-9]*prec, where prec >= 11
8120     */
8121    if (buflen <= 14 || buflen <= (size_t)3 + (size_t)prec) {
8122        PyErr_SetString(PyExc_OverflowError,
8123                        "formatted integer is too long (precision too large?)");
8124        return -1;
8125    }
8126
8127    if ((flags & F_ALT) &&
8128        (type == 'x' || type == 'X')) {
8129        /* When converting under %#x or %#X, there are a number
8130         * of issues that cause pain:
8131         * - when 0 is being converted, the C standard leaves off
8132         *   the '0x' or '0X', which is inconsistent with other
8133         *   %#x/%#X conversions and inconsistent with Python's
8134         *   hex() function
8135         * - there are platforms that violate the standard and
8136         *   convert 0 with the '0x' or '0X'
8137         *   (Metrowerks, Compaq Tru64)
8138         * - there are platforms that give '0x' when converting
8139         *   under %#X, but convert 0 in accordance with the
8140         *   standard (OS/2 EMX)
8141         *
8142         * We can achieve the desired consistency by inserting our
8143         * own '0x' or '0X' prefix, and substituting %x/%X in place
8144         * of %#x/%#X.
8145         *
8146         * Note that this is the same approach as used in
8147         * formatint() in stringobject.c
8148         */
8149        PyOS_snprintf(fmt, sizeof(fmt), "%s0%c%%.%dl%c",
8150                      sign, type, prec, type);
8151    }
8152    else {
8153        PyOS_snprintf(fmt, sizeof(fmt), "%s%%%s.%dl%c",
8154                      sign, (flags&F_ALT) ? "#" : "",
8155                      prec, type);
8156    }
8157    if (sign[0])
8158        return longtounicode(buf, buflen, fmt, -x);
8159    else
8160        return longtounicode(buf, buflen, fmt, x);
8161}
8162
8163static int
8164formatchar(Py_UNICODE *buf,
8165           size_t buflen,
8166           PyObject *v)
8167{
8168    PyObject *unistr;
8169    char *str;
8170    /* presume that the buffer is at least 2 characters long */
8171    if (PyUnicode_Check(v)) {
8172        if (PyUnicode_GET_SIZE(v) != 1)
8173            goto onError;
8174        buf[0] = PyUnicode_AS_UNICODE(v)[0];
8175    }
8176
8177    else if (PyString_Check(v)) {
8178        if (PyString_GET_SIZE(v) != 1)
8179            goto onError;
8180        /* #7649: "u'%c' % char" should behave like "u'%s' % char" and fail
8181           with a UnicodeDecodeError if 'char' is not decodable with the
8182           default encoding (usually ASCII, but it might be something else) */
8183        str = PyString_AS_STRING(v);
8184        if ((unsigned char)str[0] > 0x7F) {
8185            /* the char is not ASCII; try to decode the string using the
8186               default encoding and return -1 to let the UnicodeDecodeError
8187               be raised if the string can't be decoded */
8188            unistr = PyUnicode_Decode(str, 1, NULL, "strict");
8189            if (unistr == NULL)
8190                return -1;
8191            buf[0] = PyUnicode_AS_UNICODE(unistr)[0];
8192            Py_DECREF(unistr);
8193        }
8194        else
8195            buf[0] = (Py_UNICODE)str[0];
8196    }
8197
8198    else {
8199        /* Integer input truncated to a character */
8200        long x;
8201        x = PyInt_AsLong(v);
8202        if (x == -1 && PyErr_Occurred())
8203            goto onError;
8204#ifdef Py_UNICODE_WIDE
8205        if (x < 0 || x > 0x10ffff) {
8206            PyErr_SetString(PyExc_OverflowError,
8207                            "%c arg not in range(0x110000) "
8208                            "(wide Python build)");
8209            return -1;
8210        }
8211#else
8212        if (x < 0 || x > 0xffff) {
8213            PyErr_SetString(PyExc_OverflowError,
8214                            "%c arg not in range(0x10000) "
8215                            "(narrow Python build)");
8216            return -1;
8217        }
8218#endif
8219        buf[0] = (Py_UNICODE) x;
8220    }
8221    buf[1] = '\0';
8222    return 1;
8223
8224  onError:
8225    PyErr_SetString(PyExc_TypeError,
8226                    "%c requires int or char");
8227    return -1;
8228}
8229
/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)

   FORMATBUFLEN is the length, in Py_UNICODE elements, of the scratch
   buffer (formatbuf in PyUnicode_Format) in which the ints & chars are
   formatted. XXX This is a magic number. Each formatting routine does
   bounds checking to ensure no overflow, but a better solution may be
   to malloc a buffer of appropriate size for each format. For now, the
   current solution is sufficient.
*/
#define FORMATBUFLEN (size_t)120
8239
8240PyObject *PyUnicode_Format(PyObject *format,
8241                           PyObject *args)
8242{
8243    Py_UNICODE *fmt, *res;
8244    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
8245    int args_owned = 0;
8246    PyUnicodeObject *result = NULL;
8247    PyObject *dict = NULL;
8248    PyObject *uformat;
8249
8250    if (format == NULL || args == NULL) {
8251        PyErr_BadInternalCall();
8252        return NULL;
8253    }
8254    uformat = PyUnicode_FromObject(format);
8255    if (uformat == NULL)
8256        return NULL;
8257    fmt = PyUnicode_AS_UNICODE(uformat);
8258    fmtcnt = PyUnicode_GET_SIZE(uformat);
8259
8260    reslen = rescnt = fmtcnt + 100;
8261    result = _PyUnicode_New(reslen);
8262    if (result == NULL)
8263        goto onError;
8264    res = PyUnicode_AS_UNICODE(result);
8265
8266    if (PyTuple_Check(args)) {
8267        arglen = PyTuple_Size(args);
8268        argidx = 0;
8269    }
8270    else {
8271        arglen = -1;
8272        argidx = -2;
8273    }
8274    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
8275        !PyObject_TypeCheck(args, &PyBaseString_Type))
8276        dict = args;
8277
8278    while (--fmtcnt >= 0) {
8279        if (*fmt != '%') {
8280            if (--rescnt < 0) {
8281                rescnt = fmtcnt + 100;
8282                reslen += rescnt;
8283                if (_PyUnicode_Resize(&result, reslen) < 0)
8284                    goto onError;
8285                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
8286                --rescnt;
8287            }
8288            *res++ = *fmt++;
8289        }
8290        else {
8291            /* Got a format specifier */
8292            int flags = 0;
8293            Py_ssize_t width = -1;
8294            int prec = -1;
8295            Py_UNICODE c = '\0';
8296            Py_UNICODE fill;
8297            int isnumok;
8298            PyObject *v       = NULL;
8299            PyObject *temp    = NULL;
8300            Py_UNICODE *pbuf  = NULL;
8301            Py_UNICODE sign;
8302            Py_ssize_t len;
8303            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For format{int,char}() */
8304
8305            fmt++;
8306            if (*fmt == '(') {
8307                Py_UNICODE *keystart;
8308                Py_ssize_t keylen;
8309                PyObject *key;
8310                int pcount = 1;
8311
8312                if (dict == NULL) {
8313                    PyErr_SetString(PyExc_TypeError,
8314                                    "format requires a mapping");
8315                    goto onError;
8316                }
8317                ++fmt;
8318                --fmtcnt;
8319                keystart = fmt;
8320                /* Skip over balanced parentheses */
8321                while (pcount > 0 && --fmtcnt >= 0) {
8322                    if (*fmt == ')')
8323                        --pcount;
8324                    else if (*fmt == '(')
8325                        ++pcount;
8326                    fmt++;
8327                }
8328                keylen = fmt - keystart - 1;
8329                if (fmtcnt < 0 || pcount > 0) {
8330                    PyErr_SetString(PyExc_ValueError,
8331                                    "incomplete format key");
8332                    goto onError;
8333                }
8334#if 0
8335                /* keys are converted to strings using UTF-8 and
8336                   then looked up since Python uses strings to hold
8337                   variables names etc. in its namespaces and we
8338                   wouldn't want to break common idioms. */
8339                key = PyUnicode_EncodeUTF8(keystart,
8340                                           keylen,
8341                                           NULL);
8342#else
8343                key = PyUnicode_FromUnicode(keystart, keylen);
8344#endif
8345                if (key == NULL)
8346                    goto onError;
8347                if (args_owned) {
8348                    Py_DECREF(args);
8349                    args_owned = 0;
8350                }
8351                args = PyObject_GetItem(dict, key);
8352                Py_DECREF(key);
8353                if (args == NULL) {
8354                    goto onError;
8355                }
8356                args_owned = 1;
8357                arglen = -1;
8358                argidx = -2;
8359            }
8360            while (--fmtcnt >= 0) {
8361                switch (c = *fmt++) {
8362                case '-': flags |= F_LJUST; continue;
8363                case '+': flags |= F_SIGN; continue;
8364                case ' ': flags |= F_BLANK; continue;
8365                case '#': flags |= F_ALT; continue;
8366                case '0': flags |= F_ZERO; continue;
8367                }
8368                break;
8369            }
8370            if (c == '*') {
8371                v = getnextarg(args, arglen, &argidx);
8372                if (v == NULL)
8373                    goto onError;
8374                if (!PyInt_Check(v)) {
8375                    PyErr_SetString(PyExc_TypeError,
8376                                    "* wants int");
8377                    goto onError;
8378                }
8379                width = PyInt_AsLong(v);
8380                if (width < 0) {
8381                    flags |= F_LJUST;
8382                    width = -width;
8383                }
8384                if (--fmtcnt >= 0)
8385                    c = *fmt++;
8386            }
8387            else if (c >= '0' && c <= '9') {
8388                width = c - '0';
8389                while (--fmtcnt >= 0) {
8390                    c = *fmt++;
8391                    if (c < '0' || c > '9')
8392                        break;
8393                    if ((width*10) / 10 != width) {
8394                        PyErr_SetString(PyExc_ValueError,
8395                                        "width too big");
8396                        goto onError;
8397                    }
8398                    width = width*10 + (c - '0');
8399                }
8400            }
8401            if (c == '.') {
8402                prec = 0;
8403                if (--fmtcnt >= 0)
8404                    c = *fmt++;
8405                if (c == '*') {
8406                    v = getnextarg(args, arglen, &argidx);
8407                    if (v == NULL)
8408                        goto onError;
8409                    if (!PyInt_Check(v)) {
8410                        PyErr_SetString(PyExc_TypeError,
8411                                        "* wants int");
8412                        goto onError;
8413                    }
8414                    prec = PyInt_AsLong(v);
8415                    if (prec < 0)
8416                        prec = 0;
8417                    if (--fmtcnt >= 0)
8418                        c = *fmt++;
8419                }
8420                else if (c >= '0' && c <= '9') {
8421                    prec = c - '0';
8422                    while (--fmtcnt >= 0) {
8423                        c = *fmt++;
8424                        if (c < '0' || c > '9')
8425                            break;
8426                        if ((prec*10) / 10 != prec) {
8427                            PyErr_SetString(PyExc_ValueError,
8428                                            "prec too big");
8429                            goto onError;
8430                        }
8431                        prec = prec*10 + (c - '0');
8432                    }
8433                }
8434            } /* prec */
8435            if (fmtcnt >= 0) {
8436                if (c == 'h' || c == 'l' || c == 'L') {
8437                    if (--fmtcnt >= 0)
8438                        c = *fmt++;
8439                }
8440            }
8441            if (fmtcnt < 0) {
8442                PyErr_SetString(PyExc_ValueError,
8443                                "incomplete format");
8444                goto onError;
8445            }
8446            if (c != '%') {
8447                v = getnextarg(args, arglen, &argidx);
8448                if (v == NULL)
8449                    goto onError;
8450            }
8451            sign = 0;
8452            fill = ' ';
8453            switch (c) {
8454
8455            case '%':
8456                pbuf = formatbuf;
8457                /* presume that buffer length is at least 1 */
8458                pbuf[0] = '%';
8459                len = 1;
8460                break;
8461
8462            case 's':
8463            case 'r':
8464                if (PyUnicode_CheckExact(v) && c == 's') {
8465                    temp = v;
8466                    Py_INCREF(temp);
8467                }
8468                else {
8469                    PyObject *unicode;
8470                    if (c == 's')
8471                        temp = PyObject_Unicode(v);
8472                    else
8473                        temp = PyObject_Repr(v);
8474                    if (temp == NULL)
8475                        goto onError;
8476                    if (PyUnicode_Check(temp))
8477                        /* nothing to do */;
8478                    else if (PyString_Check(temp)) {
8479                        /* convert to string to Unicode */
8480                        unicode = PyUnicode_Decode(PyString_AS_STRING(temp),
8481                                                   PyString_GET_SIZE(temp),
8482                                                   NULL,
8483                                                   "strict");
8484                        Py_DECREF(temp);
8485                        temp = unicode;
8486                        if (temp == NULL)
8487                            goto onError;
8488                    }
8489                    else {
8490                        Py_DECREF(temp);
8491                        PyErr_SetString(PyExc_TypeError,
8492                                        "%s argument has non-string str()");
8493                        goto onError;
8494                    }
8495                }
8496                pbuf = PyUnicode_AS_UNICODE(temp);
8497                len = PyUnicode_GET_SIZE(temp);
8498                if (prec >= 0 && len > prec)
8499                    len = prec;
8500                break;
8501
8502            case 'i':
8503            case 'd':
8504            case 'u':
8505            case 'o':
8506            case 'x':
8507            case 'X':
8508                if (c == 'i')
8509                    c = 'd';
8510                isnumok = 0;
8511                if (PyNumber_Check(v)) {
8512                    PyObject *iobj=NULL;
8513
8514                    if (PyInt_Check(v) || (PyLong_Check(v))) {
8515                        iobj = v;
8516                        Py_INCREF(iobj);
8517                    }
8518                    else {
8519                        iobj = PyNumber_Int(v);
8520                        if (iobj==NULL) iobj = PyNumber_Long(v);
8521                    }
8522                    if (iobj!=NULL) {
8523                        if (PyInt_Check(iobj)) {
8524                            isnumok = 1;
8525                            pbuf = formatbuf;
8526                            len = formatint(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE),
8527                                            flags, prec, c, iobj);
8528                            Py_DECREF(iobj);
8529                            if (len < 0)
8530                                goto onError;
8531                            sign = 1;
8532                        }
8533                        else if (PyLong_Check(iobj)) {
8534                            isnumok = 1;
8535                            temp = formatlong(iobj, flags, prec, c);
8536                            Py_DECREF(iobj);
8537                            if (!temp)
8538                                goto onError;
8539                            pbuf = PyUnicode_AS_UNICODE(temp);
8540                            len = PyUnicode_GET_SIZE(temp);
8541                            sign = 1;
8542                        }
8543                        else {
8544                            Py_DECREF(iobj);
8545                        }
8546                    }
8547                }
8548                if (!isnumok) {
8549                    PyErr_Format(PyExc_TypeError,
8550                                 "%%%c format: a number is required, "
8551                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
8552                    goto onError;
8553                }
8554                if (flags & F_ZERO)
8555                    fill = '0';
8556                break;
8557
8558            case 'e':
8559            case 'E':
8560            case 'f':
8561            case 'F':
8562            case 'g':
8563            case 'G':
8564                temp = formatfloat(v, flags, prec, c);
8565                if (temp == NULL)
8566                    goto onError;
8567                pbuf = PyUnicode_AS_UNICODE(temp);
8568                len = PyUnicode_GET_SIZE(temp);
8569                sign = 1;
8570                if (flags & F_ZERO)
8571                    fill = '0';
8572                break;
8573
8574            case 'c':
8575                pbuf = formatbuf;
8576                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
8577                if (len < 0)
8578                    goto onError;
8579                break;
8580
8581            default:
8582                PyErr_Format(PyExc_ValueError,
8583                             "unsupported format character '%c' (0x%x) "
8584                             "at index %zd",
8585                             (31<=c && c<=126) ? (char)c : '?',
8586                             (int)c,
8587                             (Py_ssize_t)(fmt - 1 -
8588                                          PyUnicode_AS_UNICODE(uformat)));
8589                goto onError;
8590            }
8591            if (sign) {
8592                if (*pbuf == '-' || *pbuf == '+') {
8593                    sign = *pbuf++;
8594                    len--;
8595                }
8596                else if (flags & F_SIGN)
8597                    sign = '+';
8598                else if (flags & F_BLANK)
8599                    sign = ' ';
8600                else
8601                    sign = 0;
8602            }
8603            if (width < len)
8604                width = len;
8605            if (rescnt - (sign != 0) < width) {
8606                reslen -= rescnt;
8607                rescnt = width + fmtcnt + 100;
8608                reslen += rescnt;
8609                if (reslen < 0) {
8610                    Py_XDECREF(temp);
8611                    PyErr_NoMemory();
8612                    goto onError;
8613                }
8614                if (_PyUnicode_Resize(&result, reslen) < 0) {
8615                    Py_XDECREF(temp);
8616                    goto onError;
8617                }
8618                res = PyUnicode_AS_UNICODE(result)
8619                    + reslen - rescnt;
8620            }
8621            if (sign) {
8622                if (fill != ' ')
8623                    *res++ = sign;
8624                rescnt--;
8625                if (width > len)
8626                    width--;
8627            }
8628            if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8629                assert(pbuf[0] == '0');
8630                assert(pbuf[1] == c);
8631                if (fill != ' ') {
8632                    *res++ = *pbuf++;
8633                    *res++ = *pbuf++;
8634                }
8635                rescnt -= 2;
8636                width -= 2;
8637                if (width < 0)
8638                    width = 0;
8639                len -= 2;
8640            }
8641            if (width > len && !(flags & F_LJUST)) {
8642                do {
8643                    --rescnt;
8644                    *res++ = fill;
8645                } while (--width > len);
8646            }
8647            if (fill == ' ') {
8648                if (sign)
8649                    *res++ = sign;
8650                if ((flags & F_ALT) && (c == 'x' || c == 'X')) {
8651                    assert(pbuf[0] == '0');
8652                    assert(pbuf[1] == c);
8653                    *res++ = *pbuf++;
8654                    *res++ = *pbuf++;
8655                }
8656            }
8657            Py_UNICODE_COPY(res, pbuf, len);
8658            res += len;
8659            rescnt -= len;
8660            while (--width >= len) {
8661                --rescnt;
8662                *res++ = ' ';
8663            }
8664            if (dict && (argidx < arglen) && c != '%') {
8665                PyErr_SetString(PyExc_TypeError,
8666                                "not all arguments converted during string formatting");
8667                Py_XDECREF(temp);
8668                goto onError;
8669            }
8670            Py_XDECREF(temp);
8671        } /* '%' */
8672    } /* until end */
8673    if (argidx < arglen && !dict) {
8674        PyErr_SetString(PyExc_TypeError,
8675                        "not all arguments converted during string formatting");
8676        goto onError;
8677    }
8678
8679    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
8680        goto onError;
8681    if (args_owned) {
8682        Py_DECREF(args);
8683    }
8684    Py_DECREF(uformat);
8685    return (PyObject *)result;
8686
8687  onError:
8688    Py_XDECREF(result);
8689    Py_DECREF(uformat);
8690    if (args_owned) {
8691        Py_DECREF(args);
8692    }
8693    return NULL;
8694}
8695
/* Buffer-procedure table for the unicode type (referenced from the
   tp_as_buffer slot of PyUnicode_Type).  Each entry is one of the
   unicode_buffer_* functions cast to the slot's function-pointer type. */
static PyBufferProcs unicode_as_buffer = {
    (readbufferproc) unicode_buffer_getreadbuf,     /* read-buffer slot */
    (writebufferproc) unicode_buffer_getwritebuf,   /* write-buffer slot */
    (segcountproc) unicode_buffer_getsegcount,      /* segment-count slot */
    (charbufferproc) unicode_buffer_getcharbuf,     /* char-buffer slot */
};
8702
8703static PyObject *
8704unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
8705
8706static PyObject *
8707unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8708{
8709    PyObject *x = NULL;
8710    static char *kwlist[] = {"string", "encoding", "errors", 0};
8711    char *encoding = NULL;
8712    char *errors = NULL;
8713
8714    if (type != &PyUnicode_Type)
8715        return unicode_subtype_new(type, args, kwds);
8716    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:unicode",
8717                                     kwlist, &x, &encoding, &errors))
8718        return NULL;
8719    if (x == NULL)
8720        return (PyObject *)_PyUnicode_New(0);
8721    if (encoding == NULL && errors == NULL)
8722        return PyObject_Unicode(x);
8723    else
8724        return PyUnicode_FromEncodedObject(x, encoding, errors);
8725}
8726
8727static PyObject *
8728unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
8729{
8730    PyUnicodeObject *tmp, *pnew;
8731    Py_ssize_t n;
8732
8733    assert(PyType_IsSubtype(type, &PyUnicode_Type));
8734    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
8735    if (tmp == NULL)
8736        return NULL;
8737    assert(PyUnicode_Check(tmp));
8738    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
8739    if (pnew == NULL) {
8740        Py_DECREF(tmp);
8741        return NULL;
8742    }
8743    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
8744    if (pnew->str == NULL) {
8745        _Py_ForgetReference((PyObject *)pnew);
8746        PyObject_Del(pnew);
8747        Py_DECREF(tmp);
8748        return PyErr_NoMemory();
8749    }
8750    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
8751    pnew->length = n;
8752    pnew->hash = tmp->hash;
8753    Py_DECREF(tmp);
8754    return (PyObject *)pnew;
8755}
8756
/* Docstring for the unicode type; installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "unicode(string [, encoding[, errors]]) -> object\n\
\n\
Create a new Unicode object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
8763
/* Type object for the unicode type.

   The type is subclassable (Py_TPFLAGS_BASETYPE), derives from
   PyBaseString_Type, creates instances through unicode_new and frees
   them with PyObject_Del.  The initializer is positional: entries must
   stay in PyTypeObject field order. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "unicode",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_compare */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    &unicode_as_buffer,         /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_CHECKTYPES |
    Py_TPFLAGS_BASETYPE | Py_TPFLAGS_UNICODE_SUBCLASS,  /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    0,                  /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseString_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
8807
8808/* Initialize the Unicode implementation */
8809
8810void _PyUnicode_Init(void)
8811{
8812    int i;
8813
8814    /* XXX - move this array to unicodectype.c ? */
8815    Py_UNICODE linebreak[] = {
8816        0x000A, /* LINE FEED */
8817        0x000D, /* CARRIAGE RETURN */
8818        0x001C, /* FILE SEPARATOR */
8819        0x001D, /* GROUP SEPARATOR */
8820        0x001E, /* RECORD SEPARATOR */
8821        0x0085, /* NEXT LINE */
8822        0x2028, /* LINE SEPARATOR */
8823        0x2029, /* PARAGRAPH SEPARATOR */
8824    };
8825
8826    /* Init the implementation */
8827    free_list = NULL;
8828    numfree = 0;
8829    unicode_empty = _PyUnicode_New(0);
8830    if (!unicode_empty)
8831        return;
8832
8833    strcpy(unicode_default_encoding, "ascii");
8834    for (i = 0; i < 256; i++)
8835        unicode_latin1[i] = NULL;
8836    if (PyType_Ready(&PyUnicode_Type) < 0)
8837        Py_FatalError("Can't initialize 'unicode'");
8838
8839    /* initialize the linebreak bloom filter */
8840    bloom_linebreak = make_bloom_mask(
8841        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
8842        );
8843
8844    PyType_Ready(&EncodingMapType);
8845}
8846
8847/* Finalize the Unicode implementation */
8848
8849int
8850PyUnicode_ClearFreeList(void)
8851{
8852    int freelist_size = numfree;
8853    PyUnicodeObject *u;
8854
8855    for (u = free_list; u != NULL;) {
8856        PyUnicodeObject *v = u;
8857        u = *(PyUnicodeObject **)u;
8858        if (v->str)
8859            PyObject_DEL(v->str);
8860        Py_XDECREF(v->defenc);
8861        PyObject_Del(v);
8862        numfree--;
8863    }
8864    free_list = NULL;
8865    assert(numfree == 0);
8866    return freelist_size;
8867}
8868
8869void
8870_PyUnicode_Fini(void)
8871{
8872    int i;
8873
8874    Py_XDECREF(unicode_empty);
8875    unicode_empty = NULL;
8876
8877    for (i = 0; i < 256; i++) {
8878        if (unicode_latin1[i]) {
8879            Py_DECREF(unicode_latin1[i]);
8880            unicode_latin1[i] = NULL;
8881        }
8882    }
8883    (void)PyUnicode_ClearFreeList();
8884}
8885
8886#ifdef __cplusplus
8887}
8888#endif
8889