unicodeobject.c revision 46408606d80347108a6550805d29402d2771bda3
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "bytes_methods.h"
45
46#include "unicodeobject.h"
47#include "ucnhash.h"
48
49#ifdef MS_WINDOWS
50#include <windows.h>
51#endif
52
/* Upper bound on the number of dead Unicode objects parked on the
   free list for later reuse. */

#define PyUnicode_MAXFREELIST       1024

/* Threshold for the free list "keep alive" optimization.

   When an object is put on the free list and its length (counted in
   Py_UNICODE characters) is below this threshold, its character buffer
   is left allocated so that a later allocation can reuse it.  Buffers
   of objects at or above the threshold are released.  This reduces
   malloc() overhead for small Unicode objects.

   At worst this leaves roughly PyUnicode_MAXFREELIST *
   (sizeof(PyUnicodeObject) + the retained character buffer +
   malloc()-overhead) bytes of unused garbage.

   Setting the threshold to 0 effectively turns the feature off.

   Note: This is an experimental feature!  If you get core dumps when
   using Unicode objects, turn this feature off.

*/

#define KEEPALIVE_SIZE_LIMIT       9

/* Byte order selection: little endian unless WORDS_BIGENDIAN is set. */

#ifndef WORDS_BIGENDIAN
# define BYTEORDER_IS_LITTLE_ENDIAN
#else
# define BYTEORDER_IS_BIG_ENDIAN
#endif
83
84/* --- Globals ------------------------------------------------------------
85
86   The globals are initialized by the _PyUnicode_Init() API and should
87   not be used before calling that API.
88
89*/
90
91
92#ifdef __cplusplus
93extern "C" {
94#endif
95
96/* This dictionary holds all interned unicode strings.  Note that references
97   to strings in this dictionary are *not* counted in the string's ob_refcnt.
98   When the interned string reaches a refcnt of 0 the string deallocation
99   function will delete the reference from this dictionary.
100
101   Another way to look at this is that to say that the actual reference
102   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
103*/
104static PyObject *interned;
105
106/* Free list for Unicode objects */
107static PyUnicodeObject *free_list;
108static int numfree;
109
110/* The empty Unicode object is shared to improve performance. */
111static PyUnicodeObject *unicode_empty;
112
113/* Single character Unicode strings in the Latin-1 range are being
114   shared as well. */
115static PyUnicodeObject *unicode_latin1[256];
116
/* Lookup table for fast detection of the most frequent whitespace
   characters: entry i is 1 when ASCII code point i (0x00-0x7F) counts
   as whitespace, 0 otherwise.  Sixteen code points per row. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
       0x0B LINE TABULATION, 0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
147
148static PyObject *unicode_encode_call_errorhandler(const char *errors,
149       PyObject **errorHandler,const char *encoding, const char *reason,
150       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
151       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
152
153static void raise_encode_exception(PyObject **exceptionObject,
154                                   const char *encoding,
155                                   const Py_UNICODE *unicode, Py_ssize_t size,
156                                   Py_ssize_t startpos, Py_ssize_t endpos,
157                                   const char *reason);
158
/* Lookup table for fast detection of line breaks: entry i is 1 when
   ASCII code point i (0x00-0x7F) is a line break character, 0
   otherwise.  Sixteen code points per row. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
       0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
       0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x2F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x3F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x40-0x4F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x60-0x6F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0
};
186
187
188Py_UNICODE
189PyUnicode_GetMax(void)
190{
191#ifdef Py_UNICODE_WIDE
192    return 0x10FFFF;
193#else
194    /* This is actually an illegal character, so it should
195       not be passed to unichr. */
196    return 0xFFFF;
197#endif
198}
199
200/* --- Bloom Filters ----------------------------------------------------- */
201
/* Stuff to implement simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask and take the low
   log2(BLOOM_WIDTH) bits (5, 6 or 7) of each Unicode character as the
   bit index. */
205
206/* the linebreak mask is set up by Unicode_Init below */
207
208#if LONG_BIT >= 128
209#define BLOOM_WIDTH 128
210#elif LONG_BIT >= 64
211#define BLOOM_WIDTH 64
212#elif LONG_BIT >= 32
213#define BLOOM_WIDTH 32
214#else
215#error "LONG_BIT is smaller than 32"
216#endif
217
/* Integer type that holds a bloom mask. */
#define BLOOM_MASK unsigned long

/* Bloom mask used by BLOOM_LINEBREAK for non-ASCII characters. */
static BLOOM_MASK bloom_linebreak;

/* BLOOM_ADD(mask, ch): set the bit selected by the low bits of ch in the
   lvalue mask.
   BLOOM(mask, ch): nonzero if the bit selected by ch is set in mask.  A
   zero result means ch is definitely not in the set; a nonzero result
   means it may be.
   Both macro parameters are parenthesized so that expression arguments
   (e.g. "a | b") are combined as a unit. */
#define BLOOM_ADD(mask, ch) (((mask) |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
#define BLOOM(mask, ch)     (((mask) &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))

/* Nonzero if ch is a line break.  Characters below 128 are looked up in
   ascii_linebreak; others are pre-filtered through bloom_linebreak before
   calling Py_UNICODE_ISLINEBREAK.
   Note: ch is evaluated more than once, so it must not have side
   effects. */
#define BLOOM_LINEBREAK(ch)                                             \
    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
228
229Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
244{
245    Py_ssize_t i;
246
247    for (i = 0; i < setlen; i++)
248        if (set[i] == chr)
249            return 1;
250
251    return 0;
252}
253
/* Nonzero if chr is a member of set (setlen characters).  The bloom mask
   is consulted first so that most non-members are rejected without the
   linear scan in unicode_member().  The whole expansion is parenthesized
   so that the macro behaves as a single operand, e.g. under '!'.
   Note: chr is evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
256
257/* --- Unicode Object ----------------------------------------------------- */
258
259static
260int unicode_resize(register PyUnicodeObject *unicode,
261                   Py_ssize_t length)
262{
263    void *oldstr;
264
265    /* Shortcut if there's nothing much to do. */
266    if (unicode->length == length)
267        goto reset;
268
269    /* Resizing shared object (unicode_empty or single character
270       objects) in-place is not allowed. Use PyUnicode_Resize()
271       instead ! */
272
273    if (unicode == unicode_empty ||
274        (unicode->length == 1 &&
275         unicode->str[0] < 256U &&
276         unicode_latin1[unicode->str[0]] == unicode)) {
277        PyErr_SetString(PyExc_SystemError,
278                        "can't resize shared str objects");
279        return -1;
280    }
281
282    /* We allocate one more byte to make sure the string is Ux0000 terminated.
283       The overallocation is also used by fastsearch, which assumes that it's
284       safe to look at str[length] (without making any assumptions about what
285       it contains). */
286
287    oldstr = unicode->str;
288    unicode->str = PyObject_REALLOC(unicode->str,
289                                    sizeof(Py_UNICODE) * (length + 1));
290    if (!unicode->str) {
291        unicode->str = (Py_UNICODE *)oldstr;
292        PyErr_NoMemory();
293        return -1;
294    }
295    unicode->str[length] = 0;
296    unicode->length = length;
297
298  reset:
299    /* Reset the object caches */
300    if (unicode->defenc) {
301        Py_CLEAR(unicode->defenc);
302    }
303    unicode->hash = -1;
304
305    return 0;
306}
307
308/* We allocate one more byte to make sure the string is
309   Ux0000 terminated; some code (e.g. new_identifier)
310   relies on that.
311
312   XXX This allocator could further be enhanced by assuring that the
313   free list never reduces its size below 1.
314
315*/
316
317static
318PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
319{
320    register PyUnicodeObject *unicode;
321
322    /* Optimization for empty strings */
323    if (length == 0 && unicode_empty != NULL) {
324        Py_INCREF(unicode_empty);
325        return unicode_empty;
326    }
327
328    /* Ensure we won't overflow the size. */
329    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
330        return (PyUnicodeObject *)PyErr_NoMemory();
331    }
332
333    /* Unicode freelist & memory allocation */
334    if (free_list) {
335        unicode = free_list;
336        free_list = *(PyUnicodeObject **)unicode;
337        numfree--;
338        if (unicode->str) {
339            /* Keep-Alive optimization: we only upsize the buffer,
340               never downsize it. */
341            if ((unicode->length < length) &&
342                unicode_resize(unicode, length) < 0) {
343                PyObject_DEL(unicode->str);
344                unicode->str = NULL;
345            }
346        }
347        else {
348            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
349            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
350        }
351        PyObject_INIT(unicode, &PyUnicode_Type);
352    }
353    else {
354        size_t new_size;
355        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
356        if (unicode == NULL)
357            return NULL;
358        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
359        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
360    }
361
362    if (!unicode->str) {
363        PyErr_NoMemory();
364        goto onError;
365    }
366    /* Initialize the first element to guard against cases where
367     * the caller fails before initializing str -- unicode_resize()
368     * reads str[0], and the Keep-Alive optimization can keep memory
369     * allocated for str alive across a call to unicode_dealloc(unicode).
370     * We don't want unicode_resize to read uninitialized memory in
371     * that case.
372     */
373    unicode->str[0] = 0;
374    unicode->str[length] = 0;
375    unicode->length = length;
376    unicode->hash = -1;
377    unicode->state = 0;
378    unicode->defenc = NULL;
379    return unicode;
380
381  onError:
382    /* XXX UNREF/NEWREF interface should be more symmetrical */
383    _Py_DEC_REFTOTAL;
384    _Py_ForgetReference((PyObject *)unicode);
385    PyObject_Del(unicode);
386    return NULL;
387}
388
389static
390void unicode_dealloc(register PyUnicodeObject *unicode)
391{
392    switch (PyUnicode_CHECK_INTERNED(unicode)) {
393    case SSTATE_NOT_INTERNED:
394        break;
395
396    case SSTATE_INTERNED_MORTAL:
397        /* revive dead object temporarily for DelItem */
398        Py_REFCNT(unicode) = 3;
399        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
400            Py_FatalError(
401                "deletion of interned string failed");
402        break;
403
404    case SSTATE_INTERNED_IMMORTAL:
405        Py_FatalError("Immortal interned string died.");
406
407    default:
408        Py_FatalError("Inconsistent interned string state.");
409    }
410
411    if (PyUnicode_CheckExact(unicode) &&
412        numfree < PyUnicode_MAXFREELIST) {
413        /* Keep-Alive optimization */
414        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
415            PyObject_DEL(unicode->str);
416            unicode->str = NULL;
417            unicode->length = 0;
418        }
419        if (unicode->defenc) {
420            Py_CLEAR(unicode->defenc);
421        }
422        /* Add to free list */
423        *(PyUnicodeObject **)unicode = free_list;
424        free_list = unicode;
425        numfree++;
426    }
427    else {
428        PyObject_DEL(unicode->str);
429        Py_XDECREF(unicode->defenc);
430        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
431    }
432}
433
434static
435int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
436{
437    register PyUnicodeObject *v;
438
439    /* Argument checks */
440    if (unicode == NULL) {
441        PyErr_BadInternalCall();
442        return -1;
443    }
444    v = *unicode;
445    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
446        PyErr_BadInternalCall();
447        return -1;
448    }
449
450    /* Resizing unicode_empty and single character objects is not
451       possible since these are being shared. We simply return a fresh
452       copy with the same Unicode content. */
453    if (v->length != length &&
454        (v == unicode_empty || v->length == 1)) {
455        PyUnicodeObject *w = _PyUnicode_New(length);
456        if (w == NULL)
457            return -1;
458        Py_UNICODE_COPY(w->str, v->str,
459                        length < v->length ? length : v->length);
460        Py_DECREF(*unicode);
461        *unicode = w;
462        return 0;
463    }
464
465    /* Note that we don't have to modify *unicode for unshared Unicode
466       objects, since we can modify them in-place. */
467    return unicode_resize(v, length);
468}
469
470int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
471{
472    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
473}
474
475PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
476                                Py_ssize_t size)
477{
478    PyUnicodeObject *unicode;
479
480    /* If the Unicode data is known at construction time, we can apply
481       some optimizations which share commonly used objects. */
482    if (u != NULL) {
483
484        /* Optimization for empty strings */
485        if (size == 0 && unicode_empty != NULL) {
486            Py_INCREF(unicode_empty);
487            return (PyObject *)unicode_empty;
488        }
489
490        /* Single character Unicode objects in the Latin-1 range are
491           shared when using this constructor */
492        if (size == 1 && *u < 256) {
493            unicode = unicode_latin1[*u];
494            if (!unicode) {
495                unicode = _PyUnicode_New(1);
496                if (!unicode)
497                    return NULL;
498                unicode->str[0] = *u;
499                unicode_latin1[*u] = unicode;
500            }
501            Py_INCREF(unicode);
502            return (PyObject *)unicode;
503        }
504    }
505
506    unicode = _PyUnicode_New(size);
507    if (!unicode)
508        return NULL;
509
510    /* Copy the Unicode data into the new object */
511    if (u != NULL)
512        Py_UNICODE_COPY(unicode->str, u, size);
513
514    return (PyObject *)unicode;
515}
516
517PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
518{
519    PyUnicodeObject *unicode;
520
521    if (size < 0) {
522        PyErr_SetString(PyExc_SystemError,
523                        "Negative size passed to PyUnicode_FromStringAndSize");
524        return NULL;
525    }
526
527    /* If the Unicode data is known at construction time, we can apply
528       some optimizations which share commonly used objects.
529       Also, this means the input must be UTF-8, so fall back to the
530       UTF-8 decoder at the end. */
531    if (u != NULL) {
532
533        /* Optimization for empty strings */
534        if (size == 0 && unicode_empty != NULL) {
535            Py_INCREF(unicode_empty);
536            return (PyObject *)unicode_empty;
537        }
538
539        /* Single characters are shared when using this constructor.
540           Restrict to ASCII, since the input must be UTF-8. */
541        if (size == 1 && Py_CHARMASK(*u) < 128) {
542            unicode = unicode_latin1[Py_CHARMASK(*u)];
543            if (!unicode) {
544                unicode = _PyUnicode_New(1);
545                if (!unicode)
546                    return NULL;
547                unicode->str[0] = Py_CHARMASK(*u);
548                unicode_latin1[Py_CHARMASK(*u)] = unicode;
549            }
550            Py_INCREF(unicode);
551            return (PyObject *)unicode;
552        }
553
554        return PyUnicode_DecodeUTF8(u, size, NULL);
555    }
556
557    unicode = _PyUnicode_New(size);
558    if (!unicode)
559        return NULL;
560
561    return (PyObject *)unicode;
562}
563
564PyObject *PyUnicode_FromString(const char *u)
565{
566    size_t size = strlen(u);
567    if (size > PY_SSIZE_T_MAX) {
568        PyErr_SetString(PyExc_OverflowError, "input too long");
569        return NULL;
570    }
571
572    return PyUnicode_FromStringAndSize(u, size);
573}
574
575#ifdef HAVE_WCHAR_H
576
/* Defined when wchar_t is 4 bytes wide but Py_UNICODE is only 2, in
   which case wide characters above 0xFFFF need surrogate pairs. */
#if defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4) && (Py_UNICODE_SIZE == 2)
# define CONVERT_WCHAR_TO_SURROGATES
#endif
580
581#ifdef CONVERT_WCHAR_TO_SURROGATES
582
583/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
584   to convert from UTF32 to UTF16. */
585
586PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
587                                 Py_ssize_t size)
588{
589    PyUnicodeObject *unicode;
590    register Py_ssize_t i;
591    Py_ssize_t alloc;
592    const wchar_t *orig_w;
593
594    if (w == NULL) {
595        if (size == 0)
596            return PyUnicode_FromStringAndSize(NULL, 0);
597        PyErr_BadInternalCall();
598        return NULL;
599    }
600
601    if (size == -1) {
602        size = wcslen(w);
603    }
604
605    alloc = size;
606    orig_w = w;
607    for (i = size; i > 0; i--) {
608        if (*w > 0xFFFF)
609            alloc++;
610        w++;
611    }
612    w = orig_w;
613    unicode = _PyUnicode_New(alloc);
614    if (!unicode)
615        return NULL;
616
617    /* Copy the wchar_t data into the new object */
618    {
619        register Py_UNICODE *u;
620        u = PyUnicode_AS_UNICODE(unicode);
621        for (i = size; i > 0; i--) {
622            if (*w > 0xFFFF) {
623                wchar_t ordinal = *w++;
624                ordinal -= 0x10000;
625                *u++ = 0xD800 | (ordinal >> 10);
626                *u++ = 0xDC00 | (ordinal & 0x3FF);
627            }
628            else
629                *u++ = *w++;
630        }
631    }
632    return (PyObject *)unicode;
633}
634
635#else
636
637PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
638                                 Py_ssize_t size)
639{
640    PyUnicodeObject *unicode;
641
642    if (w == NULL) {
643        if (size == 0)
644            return PyUnicode_FromStringAndSize(NULL, 0);
645        PyErr_BadInternalCall();
646        return NULL;
647    }
648
649    if (size == -1) {
650        size = wcslen(w);
651    }
652
653    unicode = _PyUnicode_New(size);
654    if (!unicode)
655        return NULL;
656
657    /* Copy the wchar_t data into the new object */
658#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
659    memcpy(unicode->str, w, size * sizeof(wchar_t));
660#else
661    {
662        register Py_UNICODE *u;
663        register Py_ssize_t i;
664        u = PyUnicode_AS_UNICODE(unicode);
665        for (i = size; i > 0; i--)
666            *u++ = *w++;
667    }
668#endif
669
670    return (PyObject *)unicode;
671}
672
673#endif /* CONVERT_WCHAR_TO_SURROGATES */
674
675#undef CONVERT_WCHAR_TO_SURROGATES
676
677static void
678makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
679        int zeropad, int width, int precision, char c)
680{
681    *fmt++ = '%';
682    if (width) {
683        if (zeropad)
684            *fmt++ = '0';
685        fmt += sprintf(fmt, "%d", width);
686    }
687    if (precision)
688        fmt += sprintf(fmt, ".%d", precision);
689    if (longflag)
690        *fmt++ = 'l';
691    else if (longlongflag) {
692        /* longlongflag should only ever be nonzero on machines with
693           HAVE_LONG_LONG defined */
694#ifdef HAVE_LONG_LONG
695        char *f = PY_FORMAT_LONG_LONG;
696        while (*f)
697            *fmt++ = *f++;
698#else
699        /* we shouldn't ever get here */
700        assert(0);
701        *fmt++ = 'l';
702#endif
703    }
704    else if (size_tflag) {
705        char *f = PY_FORMAT_SIZE_T;
706        while (*f)
707            *fmt++ = *f++;
708    }
709    *fmt++ = c;
710    *fmt = '\0';
711}
712
/* Append the NUL-terminated char string `string` to the output position
   `s`, advancing `s`.  Relies on `s` and a scratch pointer `copy` being
   declared in the enclosing scope.  Expands to a brace block. */
#define appendstring(string) \
    {for (copy = string;*copy;) *s++ = *copy++;}

/* Size of the fixed-size buffer used for formatting single arguments. */
#define ITEM_BUFFER_LEN 21

/* Maximum number of characters required for output of %ld.  21
   characters allows for 64-bit integers (in decimal) and an optional
   sign. */
#define MAX_LONG_CHARS 21

/* Maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
724
725PyObject *
726PyUnicode_FromFormatV(const char *format, va_list vargs)
727{
728    va_list count;
729    Py_ssize_t callcount = 0;
730    PyObject **callresults = NULL;
731    PyObject **callresult = NULL;
732    Py_ssize_t n = 0;
733    int width = 0;
734    int precision = 0;
735    int zeropad;
736    const char* f;
737    Py_UNICODE *s;
738    PyObject *string;
739    /* used by sprintf */
740    char buffer[ITEM_BUFFER_LEN+1];
741    /* use abuffer instead of buffer, if we need more space
742     * (which can happen if there's a format specifier with width). */
743    char *abuffer = NULL;
744    char *realbuffer;
745    Py_ssize_t abuffersize = 0;
746    char fmt[61]; /* should be enough for %0width.precisionlld */
747    const char *copy;
748
749    Py_VA_COPY(count, vargs);
750    /* step 1: count the number of %S/%R/%A/%s format specifications
751     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
752     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
753     * result in an array) */
754    for (f = format; *f; f++) {
755         if (*f == '%') {
756             if (*(f+1)=='%')
757                 continue;
758             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A')
759                 ++callcount;
760             while (ISDIGIT((unsigned)*f))
761                 width = (width*10) + *f++ - '0';
762             while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
763                 ;
764             if (*f == 's')
765                 ++callcount;
766         }
767    }
768    /* step 2: allocate memory for the results of
769     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
770    if (callcount) {
771        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
772        if (!callresults) {
773            PyErr_NoMemory();
774            return NULL;
775        }
776        callresult = callresults;
777    }
778    /* step 3: figure out how large a buffer we need */
779    for (f = format; *f; f++) {
780        if (*f == '%') {
781#ifdef HAVE_LONG_LONG
782            int longlongflag = 0;
783#endif
784            const char* p = f;
785            width = 0;
786            while (ISDIGIT((unsigned)*f))
787                width = (width*10) + *f++ - '0';
788            while (*++f && *f != '%' && !ISALPHA((unsigned)*f))
789                ;
790
791            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
792             * they don't affect the amount of space we reserve.
793             */
794            if (*f == 'l') {
795                if (f[1] == 'd' || f[1] == 'u') {
796                    ++f;
797                }
798#ifdef HAVE_LONG_LONG
799                else if (f[1] == 'l' &&
800                         (f[2] == 'd' || f[2] == 'u')) {
801                    longlongflag = 1;
802                    f += 2;
803                }
804#endif
805            }
806            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
807                ++f;
808            }
809
810            switch (*f) {
811            case 'c':
812                (void)va_arg(count, int);
813                /* fall through... */
814            case '%':
815                n++;
816                break;
817            case 'd': case 'u': case 'i': case 'x':
818                (void) va_arg(count, int);
819#ifdef HAVE_LONG_LONG
820                if (longlongflag) {
821                    if (width < MAX_LONG_LONG_CHARS)
822                        width = MAX_LONG_LONG_CHARS;
823                }
824                else
825#endif
826                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
827                       including sign.  Decimal takes the most space.  This
828                       isn't enough for octal.  If a width is specified we
829                       need more (which we allocate later). */
830                    if (width < MAX_LONG_CHARS)
831                        width = MAX_LONG_CHARS;
832                n += width;
833                /* XXX should allow for large precision here too. */
834                if (abuffersize < width)
835                    abuffersize = width;
836                break;
837            case 's':
838            {
839                /* UTF-8 */
840                const char *s = va_arg(count, const char*);
841                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
842                if (!str)
843                    goto fail;
844                n += PyUnicode_GET_SIZE(str);
845                /* Remember the str and switch to the next slot */
846                *callresult++ = str;
847                break;
848            }
849            case 'U':
850            {
851                PyObject *obj = va_arg(count, PyObject *);
852                assert(obj && PyUnicode_Check(obj));
853                n += PyUnicode_GET_SIZE(obj);
854                break;
855            }
856            case 'V':
857            {
858                PyObject *obj = va_arg(count, PyObject *);
859                const char *str = va_arg(count, const char *);
860                assert(obj || str);
861                assert(!obj || PyUnicode_Check(obj));
862                if (obj)
863                    n += PyUnicode_GET_SIZE(obj);
864                else
865                    n += strlen(str);
866                break;
867            }
868            case 'S':
869            {
870                PyObject *obj = va_arg(count, PyObject *);
871                PyObject *str;
872                assert(obj);
873                str = PyObject_Str(obj);
874                if (!str)
875                    goto fail;
876                n += PyUnicode_GET_SIZE(str);
877                /* Remember the str and switch to the next slot */
878                *callresult++ = str;
879                break;
880            }
881            case 'R':
882            {
883                PyObject *obj = va_arg(count, PyObject *);
884                PyObject *repr;
885                assert(obj);
886                repr = PyObject_Repr(obj);
887                if (!repr)
888                    goto fail;
889                n += PyUnicode_GET_SIZE(repr);
890                /* Remember the repr and switch to the next slot */
891                *callresult++ = repr;
892                break;
893            }
894            case 'A':
895            {
896                PyObject *obj = va_arg(count, PyObject *);
897                PyObject *ascii;
898                assert(obj);
899                ascii = PyObject_ASCII(obj);
900                if (!ascii)
901                    goto fail;
902                n += PyUnicode_GET_SIZE(ascii);
903                /* Remember the repr and switch to the next slot */
904                *callresult++ = ascii;
905                break;
906            }
907            case 'p':
908                (void) va_arg(count, int);
909                /* maximum 64-bit pointer representation:
910                 * 0xffffffffffffffff
911                 * so 19 characters is enough.
912                 * XXX I count 18 -- what's the extra for?
913                 */
914                n += 19;
915                break;
916            default:
917                /* if we stumble upon an unknown
918                   formatting code, copy the rest of
919                   the format string to the output
920                   string. (we cannot just skip the
921                   code, since there's no way to know
922                   what's in the argument list) */
923                n += strlen(p);
924                goto expand;
925            }
926        } else
927            n++;
928    }
929  expand:
930    if (abuffersize > ITEM_BUFFER_LEN) {
931        /* add 1 for sprintf's trailing null byte */
932        abuffer = PyObject_Malloc(abuffersize + 1);
933        if (!abuffer) {
934            PyErr_NoMemory();
935            goto fail;
936        }
937        realbuffer = abuffer;
938    }
939    else
940        realbuffer = buffer;
941    /* step 4: fill the buffer */
942    /* Since we've analyzed how much space we need for the worst case,
943       we don't have to resize the string.
944       There can be no errors beyond this point. */
945    string = PyUnicode_FromUnicode(NULL, n);
946    if (!string)
947        goto fail;
948
949    s = PyUnicode_AS_UNICODE(string);
950    callresult = callresults;
951
952    for (f = format; *f; f++) {
953        if (*f == '%') {
954            const char* p = f++;
955            int longflag = 0;
956            int longlongflag = 0;
957            int size_tflag = 0;
958            zeropad = (*f == '0');
959            /* parse the width.precision part */
960            width = 0;
961            while (ISDIGIT((unsigned)*f))
962                width = (width*10) + *f++ - '0';
963            precision = 0;
964            if (*f == '.') {
965                f++;
966                while (ISDIGIT((unsigned)*f))
967                    precision = (precision*10) + *f++ - '0';
968            }
969            /* Handle %ld, %lu, %lld and %llu. */
970            if (*f == 'l') {
971                if (f[1] == 'd' || f[1] == 'u') {
972                    longflag = 1;
973                    ++f;
974                }
975#ifdef HAVE_LONG_LONG
976                else if (f[1] == 'l' &&
977                         (f[2] == 'd' || f[2] == 'u')) {
978                    longlongflag = 1;
979                    f += 2;
980                }
981#endif
982            }
983            /* handle the size_t flag. */
984            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
985                size_tflag = 1;
986                ++f;
987            }
988
989            switch (*f) {
990            case 'c':
991                *s++ = va_arg(vargs, int);
992                break;
993            case 'd':
994                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
995                        width, precision, 'd');
996                if (longflag)
997                    sprintf(realbuffer, fmt, va_arg(vargs, long));
998#ifdef HAVE_LONG_LONG
999                else if (longlongflag)
1000                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1001#endif
1002                else if (size_tflag)
1003                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1004                else
1005                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1006                appendstring(realbuffer);
1007                break;
1008            case 'u':
1009                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1010                        width, precision, 'u');
1011                if (longflag)
1012                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1013#ifdef HAVE_LONG_LONG
1014                else if (longlongflag)
1015                    sprintf(realbuffer, fmt, va_arg(vargs,
1016                                                    unsigned PY_LONG_LONG));
1017#endif
1018                else if (size_tflag)
1019                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1020                else
1021                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1022                appendstring(realbuffer);
1023                break;
1024            case 'i':
1025                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1026                sprintf(realbuffer, fmt, va_arg(vargs, int));
1027                appendstring(realbuffer);
1028                break;
1029            case 'x':
1030                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1031                sprintf(realbuffer, fmt, va_arg(vargs, int));
1032                appendstring(realbuffer);
1033                break;
1034            case 's':
1035            {
1036                /* unused, since we already have the result */
1037                (void) va_arg(vargs, char *);
1038                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1039                                PyUnicode_GET_SIZE(*callresult));
1040                s += PyUnicode_GET_SIZE(*callresult);
1041                /* We're done with the unicode()/repr() => forget it */
1042                Py_DECREF(*callresult);
1043                /* switch to next unicode()/repr() result */
1044                ++callresult;
1045                break;
1046            }
1047            case 'U':
1048            {
1049                PyObject *obj = va_arg(vargs, PyObject *);
1050                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1051                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1052                s += size;
1053                break;
1054            }
1055            case 'V':
1056            {
1057                PyObject *obj = va_arg(vargs, PyObject *);
1058                const char *str = va_arg(vargs, const char *);
1059                if (obj) {
1060                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1061                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1062                    s += size;
1063                } else {
1064                    appendstring(str);
1065                }
1066                break;
1067            }
1068            case 'S':
1069            case 'R':
1070            {
1071                Py_UNICODE *ucopy;
1072                Py_ssize_t usize;
1073                Py_ssize_t upos;
1074                /* unused, since we already have the result */
1075                (void) va_arg(vargs, PyObject *);
1076                ucopy = PyUnicode_AS_UNICODE(*callresult);
1077                usize = PyUnicode_GET_SIZE(*callresult);
1078                for (upos = 0; upos<usize;)
1079                    *s++ = ucopy[upos++];
1080                /* We're done with the unicode()/repr() => forget it */
1081                Py_DECREF(*callresult);
1082                /* switch to next unicode()/repr() result */
1083                ++callresult;
1084                break;
1085            }
1086            case 'p':
1087                sprintf(buffer, "%p", va_arg(vargs, void*));
1088                /* %p is ill-defined:  ensure leading 0x. */
1089                if (buffer[1] == 'X')
1090                    buffer[1] = 'x';
1091                else if (buffer[1] != 'x') {
1092                    memmove(buffer+2, buffer, strlen(buffer)+1);
1093                    buffer[0] = '0';
1094                    buffer[1] = 'x';
1095                }
1096                appendstring(buffer);
1097                break;
1098            case '%':
1099                *s++ = '%';
1100                break;
1101            default:
1102                appendstring(p);
1103                goto end;
1104            }
1105        } else
1106            *s++ = *f;
1107    }
1108
1109  end:
1110    if (callresults)
1111        PyObject_Free(callresults);
1112    if (abuffer)
1113        PyObject_Free(abuffer);
1114    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1115    return string;
1116  fail:
1117    if (callresults) {
1118        PyObject **callresult2 = callresults;
1119        while (callresult2 < callresult) {
1120            Py_DECREF(*callresult2);
1121            ++callresult2;
1122        }
1123        PyObject_Free(callresults);
1124    }
1125    if (abuffer)
1126        PyObject_Free(abuffer);
1127    return NULL;
1128}
1129
1130#undef appendstring
1131
1132PyObject *
1133PyUnicode_FromFormat(const char *format, ...)
1134{
1135    PyObject* ret;
1136    va_list vargs;
1137
1138#ifdef HAVE_STDARG_PROTOTYPES
1139    va_start(vargs, format);
1140#else
1141    va_start(vargs);
1142#endif
1143    ret = PyUnicode_FromFormatV(format, vargs);
1144    va_end(vargs);
1145    return ret;
1146}
1147
1148Py_ssize_t PyUnicode_AsWideChar(PyUnicodeObject *unicode,
1149                                wchar_t *w,
1150                                Py_ssize_t size)
1151{
1152    if (unicode == NULL) {
1153        PyErr_BadInternalCall();
1154        return -1;
1155    }
1156
1157    /* If possible, try to copy the 0-termination as well */
1158    if (size > PyUnicode_GET_SIZE(unicode))
1159        size = PyUnicode_GET_SIZE(unicode) + 1;
1160
1161#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1162    memcpy(w, unicode->str, size * sizeof(wchar_t));
1163#else
1164    {
1165        register Py_UNICODE *u;
1166        register Py_ssize_t i;
1167        u = PyUnicode_AS_UNICODE(unicode);
1168        for (i = size; i > 0; i--)
1169            *w++ = *u++;
1170    }
1171#endif
1172
1173    if (size > PyUnicode_GET_SIZE(unicode))
1174        return PyUnicode_GET_SIZE(unicode);
1175    else
1176        return size;
1177}
1178
1179#endif
1180
1181PyObject *PyUnicode_FromOrdinal(int ordinal)
1182{
1183    Py_UNICODE s[2];
1184
1185    if (ordinal < 0 || ordinal > 0x10ffff) {
1186        PyErr_SetString(PyExc_ValueError,
1187                        "chr() arg not in range(0x110000)");
1188        return NULL;
1189    }
1190
1191#ifndef Py_UNICODE_WIDE
1192    if (ordinal > 0xffff) {
1193        ordinal -= 0x10000;
1194        s[0] = 0xD800 | (ordinal >> 10);
1195        s[1] = 0xDC00 | (ordinal & 0x3FF);
1196        return PyUnicode_FromUnicode(s, 2);
1197    }
1198#endif
1199
1200    s[0] = (Py_UNICODE)ordinal;
1201    return PyUnicode_FromUnicode(s, 1);
1202}
1203
1204PyObject *PyUnicode_FromObject(register PyObject *obj)
1205{
1206    /* XXX Perhaps we should make this API an alias of
1207       PyObject_Str() instead ?! */
1208    if (PyUnicode_CheckExact(obj)) {
1209        Py_INCREF(obj);
1210        return obj;
1211    }
1212    if (PyUnicode_Check(obj)) {
1213        /* For a Unicode subtype that's not a Unicode object,
1214           return a true Unicode object with the same data. */
1215        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1216                                     PyUnicode_GET_SIZE(obj));
1217    }
1218    PyErr_Format(PyExc_TypeError,
1219                 "Can't convert '%.100s' object to str implicitly",
1220                 Py_TYPE(obj)->tp_name);
1221    return NULL;
1222}
1223
1224PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1225                                      const char *encoding,
1226                                      const char *errors)
1227{
1228    Py_buffer buffer;
1229    PyObject *v;
1230
1231    if (obj == NULL) {
1232        PyErr_BadInternalCall();
1233        return NULL;
1234    }
1235
1236    /* Decoding bytes objects is the most common case and should be fast */
1237    if (PyBytes_Check(obj)) {
1238        if (PyBytes_GET_SIZE(obj) == 0) {
1239            Py_INCREF(unicode_empty);
1240            v = (PyObject *) unicode_empty;
1241        }
1242        else {
1243            v = PyUnicode_Decode(
1244                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1245                    encoding, errors);
1246        }
1247        return v;
1248    }
1249
1250    if (PyUnicode_Check(obj)) {
1251        PyErr_SetString(PyExc_TypeError,
1252                        "decoding str is not supported");
1253        return NULL;
1254    }
1255
1256    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1257    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1258        PyErr_Format(PyExc_TypeError,
1259                     "coercing to str: need bytes, bytearray "
1260                     "or buffer-like object, %.80s found",
1261                     Py_TYPE(obj)->tp_name);
1262        return NULL;
1263    }
1264
1265    if (buffer.len == 0) {
1266        Py_INCREF(unicode_empty);
1267        v = (PyObject *) unicode_empty;
1268    }
1269    else
1270        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1271
1272    PyBuffer_Release(&buffer);
1273    return v;
1274}
1275
/* Write a normalized copy of `encoding` into `lower` (a buffer of
   `lower_len` chars): upper case letters become lower case and '_' becomes
   '-', so that e.g. UTF_8 is recognized.

   Return 1 on success (`lower` is NUL-terminated), or 0 when `encoding`
   does not fit, i.e. it is longer than lower_len-1 characters. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *src;
    char *dst = lower;
    /* Last slot of the buffer, reserved for the terminating NUL */
    char *const last = &lower[lower_len - 1];

    for (src = encoding; *src != '\0'; src++) {
        if (dst == last)
            return 0;

        if (ISUPPER(*src))
            *dst = TOLOWER(*src);
        else if (*src == '_')
            *dst = '-';
        else
            *dst = *src;
        dst++;
    }
    *dst = '\0';
    return 1;
}
1308
1309PyObject *PyUnicode_Decode(const char *s,
1310                           Py_ssize_t size,
1311                           const char *encoding,
1312                           const char *errors)
1313{
1314    PyObject *buffer = NULL, *unicode;
1315    Py_buffer info;
1316    char lower[11];  /* Enough for any encoding shortcut */
1317
1318    if (encoding == NULL)
1319        encoding = PyUnicode_GetDefaultEncoding();
1320
1321    /* Shortcuts for common default encodings */
1322    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1323        if (strcmp(lower, "utf-8") == 0)
1324            return PyUnicode_DecodeUTF8(s, size, errors);
1325        else if ((strcmp(lower, "latin-1") == 0) ||
1326                 (strcmp(lower, "iso-8859-1") == 0))
1327            return PyUnicode_DecodeLatin1(s, size, errors);
1328#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1329        else if (strcmp(lower, "mbcs") == 0)
1330            return PyUnicode_DecodeMBCS(s, size, errors);
1331#endif
1332        else if (strcmp(lower, "ascii") == 0)
1333            return PyUnicode_DecodeASCII(s, size, errors);
1334        else if (strcmp(lower, "utf-16") == 0)
1335            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1336        else if (strcmp(lower, "utf-32") == 0)
1337            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1338    }
1339
1340    /* Decode via the codec registry */
1341    buffer = NULL;
1342    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1343        goto onError;
1344    buffer = PyMemoryView_FromBuffer(&info);
1345    if (buffer == NULL)
1346        goto onError;
1347    unicode = PyCodec_Decode(buffer, encoding, errors);
1348    if (unicode == NULL)
1349        goto onError;
1350    if (!PyUnicode_Check(unicode)) {
1351        PyErr_Format(PyExc_TypeError,
1352                     "decoder did not return a str object (type=%.400s)",
1353                     Py_TYPE(unicode)->tp_name);
1354        Py_DECREF(unicode);
1355        goto onError;
1356    }
1357    Py_DECREF(buffer);
1358    return unicode;
1359
1360  onError:
1361    Py_XDECREF(buffer);
1362    return NULL;
1363}
1364
1365PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1366                                    const char *encoding,
1367                                    const char *errors)
1368{
1369    PyObject *v;
1370
1371    if (!PyUnicode_Check(unicode)) {
1372        PyErr_BadArgument();
1373        goto onError;
1374    }
1375
1376    if (encoding == NULL)
1377        encoding = PyUnicode_GetDefaultEncoding();
1378
1379    /* Decode via the codec registry */
1380    v = PyCodec_Decode(unicode, encoding, errors);
1381    if (v == NULL)
1382        goto onError;
1383    return v;
1384
1385  onError:
1386    return NULL;
1387}
1388
1389PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1390                                     const char *encoding,
1391                                     const char *errors)
1392{
1393    PyObject *v;
1394
1395    if (!PyUnicode_Check(unicode)) {
1396        PyErr_BadArgument();
1397        goto onError;
1398    }
1399
1400    if (encoding == NULL)
1401        encoding = PyUnicode_GetDefaultEncoding();
1402
1403    /* Decode via the codec registry */
1404    v = PyCodec_Decode(unicode, encoding, errors);
1405    if (v == NULL)
1406        goto onError;
1407    if (!PyUnicode_Check(v)) {
1408        PyErr_Format(PyExc_TypeError,
1409                     "decoder did not return a str object (type=%.400s)",
1410                     Py_TYPE(v)->tp_name);
1411        Py_DECREF(v);
1412        goto onError;
1413    }
1414    return v;
1415
1416  onError:
1417    return NULL;
1418}
1419
1420PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1421                           Py_ssize_t size,
1422                           const char *encoding,
1423                           const char *errors)
1424{
1425    PyObject *v, *unicode;
1426
1427    unicode = PyUnicode_FromUnicode(s, size);
1428    if (unicode == NULL)
1429        return NULL;
1430    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1431    Py_DECREF(unicode);
1432    return v;
1433}
1434
1435PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1436                                    const char *encoding,
1437                                    const char *errors)
1438{
1439    PyObject *v;
1440
1441    if (!PyUnicode_Check(unicode)) {
1442        PyErr_BadArgument();
1443        goto onError;
1444    }
1445
1446    if (encoding == NULL)
1447        encoding = PyUnicode_GetDefaultEncoding();
1448
1449    /* Encode via the codec registry */
1450    v = PyCodec_Encode(unicode, encoding, errors);
1451    if (v == NULL)
1452        goto onError;
1453    return v;
1454
1455  onError:
1456    return NULL;
1457}
1458
1459PyObject *PyUnicode_EncodeFSDefault(PyObject *unicode)
1460{
1461    if (Py_FileSystemDefaultEncoding) {
1462#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1463        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0)
1464            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1465                                        PyUnicode_GET_SIZE(unicode),
1466                                        NULL);
1467#endif
1468        return PyUnicode_AsEncodedString(unicode,
1469                                         Py_FileSystemDefaultEncoding,
1470                                         "surrogateescape");
1471    } else
1472        return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1473                                    PyUnicode_GET_SIZE(unicode),
1474                                    "surrogateescape");
1475}
1476
1477PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1478                                    const char *encoding,
1479                                    const char *errors)
1480{
1481    PyObject *v;
1482    char lower[11];  /* Enough for any encoding shortcut */
1483
1484    if (!PyUnicode_Check(unicode)) {
1485        PyErr_BadArgument();
1486        return NULL;
1487    }
1488
1489    if (encoding == NULL)
1490        encoding = PyUnicode_GetDefaultEncoding();
1491
1492    /* Shortcuts for common default encodings */
1493    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1494        if (strcmp(lower, "utf-8") == 0)
1495            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1496                                        PyUnicode_GET_SIZE(unicode),
1497                                        errors);
1498        else if ((strcmp(lower, "latin-1") == 0) ||
1499                 (strcmp(lower, "iso-8859-1") == 0))
1500            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1501                                          PyUnicode_GET_SIZE(unicode),
1502                                          errors);
1503#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1504        else if (strcmp(lower, "mbcs") == 0)
1505            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1506                                        PyUnicode_GET_SIZE(unicode),
1507                                        errors);
1508#endif
1509        else if (strcmp(lower, "ascii") == 0)
1510            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1511                                         PyUnicode_GET_SIZE(unicode),
1512                                         errors);
1513    }
1514    /* During bootstrap, we may need to find the encodings
1515       package, to load the file system encoding, and require the
1516       file system encoding in order to load the encodings
1517       package.
1518
1519       Break out of this dependency by assuming that the path to
1520       the encodings module is ASCII-only.  XXX could try wcstombs
1521       instead, if the file system encoding is the locale's
1522       encoding. */
1523    if (Py_FileSystemDefaultEncoding &&
1524             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1525             !PyThreadState_GET()->interp->codecs_initialized)
1526        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1527                                     PyUnicode_GET_SIZE(unicode),
1528                                     errors);
1529
1530    /* Encode via the codec registry */
1531    v = PyCodec_Encode(unicode, encoding, errors);
1532    if (v == NULL)
1533        return NULL;
1534
1535    /* The normal path */
1536    if (PyBytes_Check(v))
1537        return v;
1538
1539    /* If the codec returns a buffer, raise a warning and convert to bytes */
1540    if (PyByteArray_Check(v)) {
1541        int error;
1542        PyObject *b;
1543
1544        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1545            "encoder %s returned bytearray instead of bytes",
1546            encoding);
1547        if (error) {
1548            Py_DECREF(v);
1549            return NULL;
1550        }
1551
1552        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1553        Py_DECREF(v);
1554        return b;
1555    }
1556
1557    PyErr_Format(PyExc_TypeError,
1558                 "encoder did not return a bytes object (type=%.400s)",
1559                 Py_TYPE(v)->tp_name);
1560    Py_DECREF(v);
1561    return NULL;
1562}
1563
1564PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1565                                     const char *encoding,
1566                                     const char *errors)
1567{
1568    PyObject *v;
1569
1570    if (!PyUnicode_Check(unicode)) {
1571        PyErr_BadArgument();
1572        goto onError;
1573    }
1574
1575    if (encoding == NULL)
1576        encoding = PyUnicode_GetDefaultEncoding();
1577
1578    /* Encode via the codec registry */
1579    v = PyCodec_Encode(unicode, encoding, errors);
1580    if (v == NULL)
1581        goto onError;
1582    if (!PyUnicode_Check(v)) {
1583        PyErr_Format(PyExc_TypeError,
1584                     "encoder did not return an str object (type=%.400s)",
1585                     Py_TYPE(v)->tp_name);
1586        Py_DECREF(v);
1587        goto onError;
1588    }
1589    return v;
1590
1591  onError:
1592    return NULL;
1593}
1594
1595PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1596                                            const char *errors)
1597{
1598    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1599    if (v)
1600        return v;
1601    if (errors != NULL)
1602        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1603    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1604                             PyUnicode_GET_SIZE(unicode),
1605                             NULL);
1606    if (!v)
1607        return NULL;
1608    ((PyUnicodeObject *)unicode)->defenc = v;
1609    return v;
1610}
1611
1612PyObject*
1613PyUnicode_DecodeFSDefault(const char *s) {
1614    Py_ssize_t size = (Py_ssize_t)strlen(s);
1615    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1616}
1617
1618PyObject*
1619PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1620{
1621    /* During the early bootstrapping process, Py_FileSystemDefaultEncoding
1622       can be undefined. If it is case, decode using UTF-8. The following assumes
1623       that Py_FileSystemDefaultEncoding is set to a built-in encoding during the
1624       bootstrapping process where the codecs aren't ready yet.
1625    */
1626    if (Py_FileSystemDefaultEncoding) {
1627#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1628        if (strcmp(Py_FileSystemDefaultEncoding, "mbcs") == 0) {
1629            return PyUnicode_DecodeMBCS(s, size, NULL);
1630        }
1631#elif defined(__APPLE__)
1632        if (strcmp(Py_FileSystemDefaultEncoding, "utf-8") == 0) {
1633            return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1634        }
1635#endif
1636        return PyUnicode_Decode(s, size,
1637                                Py_FileSystemDefaultEncoding,
1638                                "surrogateescape");
1639    }
1640    else {
1641        return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1642    }
1643}
1644
1645
1646int
1647PyUnicode_FSConverter(PyObject* arg, void* addr)
1648{
1649    PyObject *output = NULL;
1650    Py_ssize_t size;
1651    void *data;
1652    if (arg == NULL) {
1653        Py_DECREF(*(PyObject**)addr);
1654        return 1;
1655    }
1656    if (PyBytes_Check(arg)) {
1657        output = arg;
1658        Py_INCREF(output);
1659    }
1660    else {
1661        arg = PyUnicode_FromObject(arg);
1662        if (!arg)
1663            return 0;
1664        output = PyUnicode_EncodeFSDefault(arg);
1665        Py_DECREF(arg);
1666        if (!output)
1667            return 0;
1668        if (!PyBytes_Check(output)) {
1669            Py_DECREF(output);
1670            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1671            return 0;
1672        }
1673    }
1674    size = PyBytes_GET_SIZE(output);
1675    data = PyBytes_AS_STRING(output);
1676    if (size != strlen(data)) {
1677        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1678        Py_DECREF(output);
1679        return 0;
1680    }
1681    *(PyObject**)addr = output;
1682    return Py_CLEANUP_SUPPORTED;
1683}
1684
1685
1686int
1687PyUnicode_FSDecoder(PyObject* arg, void* addr)
1688{
1689    PyObject *output = NULL;
1690    Py_ssize_t size;
1691    void *data;
1692    if (arg == NULL) {
1693        Py_DECREF(*(PyObject**)addr);
1694        return 1;
1695    }
1696    if (PyUnicode_Check(arg)) {
1697        output = arg;
1698        Py_INCREF(output);
1699    }
1700    else {
1701        arg = PyBytes_FromObject(arg);
1702        if (!arg)
1703            return 0;
1704        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1705                                                  PyBytes_GET_SIZE(arg));
1706        Py_DECREF(arg);
1707        if (!output)
1708            return 0;
1709        if (!PyUnicode_Check(output)) {
1710            Py_DECREF(output);
1711            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1712            return 0;
1713        }
1714    }
1715    size = PyUnicode_GET_SIZE(output);
1716    data = PyUnicode_AS_UNICODE(output);
1717    if (size != Py_UNICODE_strlen(data)) {
1718        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1719        Py_DECREF(output);
1720        return 0;
1721    }
1722    *(PyObject**)addr = output;
1723    return Py_CLEANUP_SUPPORTED;
1724}
1725
1726
1727char*
1728_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1729{
1730    PyObject *bytes;
1731    if (!PyUnicode_Check(unicode)) {
1732        PyErr_BadArgument();
1733        return NULL;
1734    }
1735    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1736    if (bytes == NULL)
1737        return NULL;
1738    if (psize != NULL)
1739        *psize = PyBytes_GET_SIZE(bytes);
1740    return PyBytes_AS_STRING(bytes);
1741}
1742
1743char*
1744_PyUnicode_AsString(PyObject *unicode)
1745{
1746    return _PyUnicode_AsStringAndSize(unicode, NULL);
1747}
1748
1749Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1750{
1751    if (!PyUnicode_Check(unicode)) {
1752        PyErr_BadArgument();
1753        goto onError;
1754    }
1755    return PyUnicode_AS_UNICODE(unicode);
1756
1757  onError:
1758    return NULL;
1759}
1760
1761Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1762{
1763    if (!PyUnicode_Check(unicode)) {
1764        PyErr_BadArgument();
1765        goto onError;
1766    }
1767    return PyUnicode_GET_SIZE(unicode);
1768
1769  onError:
1770    return -1;
1771}
1772
/* Return the name of the default encoding, which is always "utf-8".  The
   returned string has static storage and must not be modified. */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
1777
1778/* create or adjust a UnicodeDecodeError */
1779static void
1780make_decode_exception(PyObject **exceptionObject,
1781                      const char *encoding,
1782                      const char *input, Py_ssize_t length,
1783                      Py_ssize_t startpos, Py_ssize_t endpos,
1784                      const char *reason)
1785{
1786    if (*exceptionObject == NULL) {
1787        *exceptionObject = PyUnicodeDecodeError_Create(
1788            encoding, input, length, startpos, endpos, reason);
1789    }
1790    else {
1791        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
1792            goto onError;
1793        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
1794            goto onError;
1795        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
1796            goto onError;
1797    }
1798    return;
1799
1800onError:
1801    Py_DECREF(*exceptionObject);
1802    *exceptionObject = NULL;
1803}
1804
1805/* error handling callback helper:
1806   build arguments, call the callback and check the arguments,
1807   if no exception occurred, copy the replacement to the output
1808   and adjust various state variables.
1809   return 0 on success, -1 on error
1810*/
1811
1812static
1813int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
1814                                     const char *encoding, const char *reason,
1815                                     const char **input, const char **inend, Py_ssize_t *startinpos,
1816                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
1817                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
1818{
1819    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
1820
1821    PyObject *restuple = NULL;
1822    PyObject *repunicode = NULL;
1823    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
1824    Py_ssize_t insize;
1825    Py_ssize_t requiredsize;
1826    Py_ssize_t newpos;
1827    Py_UNICODE *repptr;
1828    PyObject *inputobj = NULL;
1829    Py_ssize_t repsize;
1830    int res = -1;
1831
1832    if (*errorHandler == NULL) {
1833        *errorHandler = PyCodec_LookupError(errors);
1834        if (*errorHandler == NULL)
1835            goto onError;
1836    }
1837
1838    make_decode_exception(exceptionObject,
1839        encoding,
1840        *input, *inend - *input,
1841        *startinpos, *endinpos,
1842        reason);
1843    if (*exceptionObject == NULL)
1844        goto onError;
1845
1846    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
1847    if (restuple == NULL)
1848        goto onError;
1849    if (!PyTuple_Check(restuple)) {
1850        PyErr_SetString(PyExc_TypeError, &argparse[4]);
1851        goto onError;
1852    }
1853    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
1854        goto onError;
1855
1856    /* Copy back the bytes variables, which might have been modified by the
1857       callback */
1858    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
1859    if (!inputobj)
1860        goto onError;
1861    if (!PyBytes_Check(inputobj)) {
1862        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
1863    }
1864    *input = PyBytes_AS_STRING(inputobj);
1865    insize = PyBytes_GET_SIZE(inputobj);
1866    *inend = *input + insize;
1867    /* we can DECREF safely, as the exception has another reference,
1868       so the object won't go away. */
1869    Py_DECREF(inputobj);
1870
1871    if (newpos<0)
1872        newpos = insize+newpos;
1873    if (newpos<0 || newpos>insize) {
1874        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
1875        goto onError;
1876    }
1877
1878    /* need more space? (at least enough for what we
1879       have+the replacement+the rest of the string (starting
1880       at the new input position), so we won't have to check space
1881       when there are no errors in the rest of the string) */
1882    repptr = PyUnicode_AS_UNICODE(repunicode);
1883    repsize = PyUnicode_GET_SIZE(repunicode);
1884    requiredsize = *outpos + repsize + insize-newpos;
1885    if (requiredsize > outsize) {
1886        if (requiredsize<2*outsize)
1887            requiredsize = 2*outsize;
1888        if (_PyUnicode_Resize(output, requiredsize) < 0)
1889            goto onError;
1890        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
1891    }
1892    *endinpos = newpos;
1893    *inptr = *input + newpos;
1894    Py_UNICODE_COPY(*outptr, repptr, repsize);
1895    *outptr += repsize;
1896    *outpos += repsize;
1897
1898    /* we made it! */
1899    res = 0;
1900
1901  onError:
1902    Py_XDECREF(restuple);
1903    return res;
1904}
1905
1906/* --- UTF-7 Codec -------------------------------------------------------- */
1907
1908/* See RFC2152 for details.  We encode conservatively and decode liberally. */
1909
1910/* Three simple macros defining base-64. */
1911
/* Non-zero when c belongs to the base-64 alphabet used by UTF-7:
   'A'-'Z', 'a'-'z', '0'-'9', '+' and '/'.  The argument is evaluated
   several times, so it must not have side effects. */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('a' <= (c) && (c) <= 'z') ||          \
     ('A' <= (c) && (c) <= 'Z'))
1919
/* Map a base-64 character c to its 6-bit value: 'A'-'Z' -> 0..25,
   'a'-'z' -> 26..51, '0'-'9' -> 52..61, '+' -> 62.  Anything else
   (including '/') yields 63, so the caller must check IS_BASE64 first. */

#define FROM_BASE64(c)                                                  \
    ((c) == '+' ? 62 :                                                  \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :                      \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :                      \
     ('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :                           \
     63)
1927
/* Base-64 character for the low six bits of n (higher bits are masked
   off before the alphabet lookup). */

#define TO_BASE64(n)                            \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"               \
     "abcdefghijklmnopqrstuvwxyz"               \
     "0123456789+/"[(n) & 63])
1932
/* DECODE_DIRECT: non-zero when a byte met outside a shift sequence is
 * decoded as itself.  Decoding is permissive: every value up to 127 is
 * taken literally except '+', which opens a base-64 section. */

#define DECODE_DIRECT(c)                                \
    ((c) != '+' && (c) < 128)
1940
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This table, indexed by the ASCII code,
 * gives the class of each character:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. '+', backslash, '~' and the non-printing
 *     codes 0-8 11-12 14-31 127)
 *
 * The table is only ever read, so it is declared const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [  bsl  ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
1974
/* ENCODE_DIRECT: non-zero when character c is written to the output
 * unchanged.  Only codes 1..127 qualify.  Set D characters (category 0)
 * always do; Set O characters (category 1) do when directO is true and
 * whitespace (category 2) does when directWS is true.  RFC2152 leaves
 * both choices to the application, hence the two flags. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) > 0 && (c) < 128 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((utf7_category[(c)] == 1) && directO) ||         \
      ((utf7_category[(c)] == 2) && directWS)))
1986
1987PyObject *PyUnicode_DecodeUTF7(const char *s,
1988                               Py_ssize_t size,
1989                               const char *errors)
1990{
1991    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
1992}
1993
1994/* The decoder.  The only state we preserve is our read position,
1995 * i.e. how many characters we have consumed.  So if we end in the
1996 * middle of a shift sequence we have to back off the read position
1997 * and the output to the beginning of the sequence, otherwise we lose
1998 * all the shift state (seen bits, number of bits seen, high
1999 * surrogate). */
2000
2001PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
2002                                       Py_ssize_t size,
2003                                       const char *errors,
2004                                       Py_ssize_t *consumed)
2005{
2006    const char *starts = s;
2007    Py_ssize_t startinpos;
2008    Py_ssize_t endinpos;
2009    Py_ssize_t outpos;
2010    const char *e;
2011    PyUnicodeObject *unicode;
2012    Py_UNICODE *p;
2013    const char *errmsg = "";
2014    int inShift = 0;
2015    Py_UNICODE *shiftOutStart;
2016    unsigned int base64bits = 0;
2017    unsigned long base64buffer = 0;
2018    Py_UNICODE surrogate = 0;
2019    PyObject *errorHandler = NULL;
2020    PyObject *exc = NULL;
2021
2022    unicode = _PyUnicode_New(size);
2023    if (!unicode)
2024        return NULL;
2025    if (size == 0) {
2026        if (consumed)
2027            *consumed = 0;
2028        return (PyObject *)unicode;
2029    }
2030
2031    p = unicode->str;
2032    shiftOutStart = p;
2033    e = s + size;
2034
2035    while (s < e) {
2036        Py_UNICODE ch;
2037      restart:
2038        ch = (unsigned char) *s;
2039
2040        if (inShift) { /* in a base-64 section */
2041            if (IS_BASE64(ch)) { /* consume a base-64 character */
2042                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2043                base64bits += 6;
2044                s++;
2045                if (base64bits >= 16) {
2046                    /* we have enough bits for a UTF-16 value */
2047                    Py_UNICODE outCh = (Py_UNICODE)
2048                                       (base64buffer >> (base64bits-16));
2049                    base64bits -= 16;
2050                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2051                    if (surrogate) {
2052                        /* expecting a second surrogate */
2053                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2054#ifdef Py_UNICODE_WIDE
2055                            *p++ = (((surrogate & 0x3FF)<<10)
2056                                    | (outCh & 0x3FF)) + 0x10000;
2057#else
2058                            *p++ = surrogate;
2059                            *p++ = outCh;
2060#endif
2061                            surrogate = 0;
2062                        }
2063                        else {
2064                            surrogate = 0;
2065                            errmsg = "second surrogate missing";
2066                            goto utf7Error;
2067                        }
2068                    }
2069                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2070                        /* first surrogate */
2071                        surrogate = outCh;
2072                    }
2073                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2074                        errmsg = "unexpected second surrogate";
2075                        goto utf7Error;
2076                    }
2077                    else {
2078                        *p++ = outCh;
2079                    }
2080                }
2081            }
2082            else { /* now leaving a base-64 section */
2083                inShift = 0;
2084                s++;
2085                if (surrogate) {
2086                    errmsg = "second surrogate missing at end of shift sequence";
2087                    goto utf7Error;
2088                }
2089                if (base64bits > 0) { /* left-over bits */
2090                    if (base64bits >= 6) {
2091                        /* We've seen at least one base-64 character */
2092                        errmsg = "partial character in shift sequence";
2093                        goto utf7Error;
2094                    }
2095                    else {
2096                        /* Some bits remain; they should be zero */
2097                        if (base64buffer != 0) {
2098                            errmsg = "non-zero padding bits in shift sequence";
2099                            goto utf7Error;
2100                        }
2101                    }
2102                }
2103                if (ch != '-') {
2104                    /* '-' is absorbed; other terminating
2105                       characters are preserved */
2106                    *p++ = ch;
2107                }
2108            }
2109        }
2110        else if ( ch == '+' ) {
2111            startinpos = s-starts;
2112            s++; /* consume '+' */
2113            if (s < e && *s == '-') { /* '+-' encodes '+' */
2114                s++;
2115                *p++ = '+';
2116            }
2117            else { /* begin base64-encoded section */
2118                inShift = 1;
2119                shiftOutStart = p;
2120                base64bits = 0;
2121            }
2122        }
2123        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
2124            *p++ = ch;
2125            s++;
2126        }
2127        else {
2128            startinpos = s-starts;
2129            s++;
2130            errmsg = "unexpected special character";
2131            goto utf7Error;
2132        }
2133        continue;
2134utf7Error:
2135        outpos = p-PyUnicode_AS_UNICODE(unicode);
2136        endinpos = s-starts;
2137        if (unicode_decode_call_errorhandler(
2138                errors, &errorHandler,
2139                "utf7", errmsg,
2140                &starts, &e, &startinpos, &endinpos, &exc, &s,
2141                &unicode, &outpos, &p))
2142            goto onError;
2143    }
2144
2145    /* end of string */
2146
2147    if (inShift && !consumed) { /* in shift sequence, no more to follow */
2148        /* if we're in an inconsistent state, that's an error */
2149        if (surrogate ||
2150                (base64bits >= 6) ||
2151                (base64bits > 0 && base64buffer != 0)) {
2152            outpos = p-PyUnicode_AS_UNICODE(unicode);
2153            endinpos = size;
2154            if (unicode_decode_call_errorhandler(
2155                    errors, &errorHandler,
2156                    "utf7", "unterminated shift sequence",
2157                    &starts, &e, &startinpos, &endinpos, &exc, &s,
2158                    &unicode, &outpos, &p))
2159                goto onError;
2160            if (s < e)
2161                goto restart;
2162        }
2163    }
2164
2165    /* return state */
2166    if (consumed) {
2167        if (inShift) {
2168            p = shiftOutStart; /* back off output */
2169            *consumed = startinpos;
2170        }
2171        else {
2172            *consumed = s-starts;
2173        }
2174    }
2175
2176    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2177        goto onError;
2178
2179    Py_XDECREF(errorHandler);
2180    Py_XDECREF(exc);
2181    return (PyObject *)unicode;
2182
2183  onError:
2184    Py_XDECREF(errorHandler);
2185    Py_XDECREF(exc);
2186    Py_DECREF(unicode);
2187    return NULL;
2188}
2189
2190
2191PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2192                               Py_ssize_t size,
2193                               int base64SetO,
2194                               int base64WhiteSpace,
2195                               const char *errors)
2196{
2197    PyObject *v;
2198    /* It might be possible to tighten this worst case */
2199    Py_ssize_t allocated = 8 * size;
2200    int inShift = 0;
2201    Py_ssize_t i = 0;
2202    unsigned int base64bits = 0;
2203    unsigned long base64buffer = 0;
2204    char * out;
2205    char * start;
2206
2207    if (size == 0)
2208        return PyBytes_FromStringAndSize(NULL, 0);
2209
2210    if (allocated / 8 != size)
2211        return PyErr_NoMemory();
2212
2213    v = PyBytes_FromStringAndSize(NULL, allocated);
2214    if (v == NULL)
2215        return NULL;
2216
2217    start = out = PyBytes_AS_STRING(v);
2218    for (;i < size; ++i) {
2219        Py_UNICODE ch = s[i];
2220
2221        if (inShift) {
2222            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2223                /* shifting out */
2224                if (base64bits) { /* output remaining bits */
2225                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2226                    base64buffer = 0;
2227                    base64bits = 0;
2228                }
2229                inShift = 0;
2230                /* Characters not in the BASE64 set implicitly unshift the sequence
2231                   so no '-' is required, except if the character is itself a '-' */
2232                if (IS_BASE64(ch) || ch == '-') {
2233                    *out++ = '-';
2234                }
2235                *out++ = (char) ch;
2236            }
2237            else {
2238                goto encode_char;
2239            }
2240        }
2241        else { /* not in a shift sequence */
2242            if (ch == '+') {
2243                *out++ = '+';
2244                        *out++ = '-';
2245            }
2246            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2247                *out++ = (char) ch;
2248            }
2249            else {
2250                *out++ = '+';
2251                inShift = 1;
2252                goto encode_char;
2253            }
2254        }
2255        continue;
2256encode_char:
2257#ifdef Py_UNICODE_WIDE
2258        if (ch >= 0x10000) {
2259            /* code first surrogate */
2260            base64bits += 16;
2261            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2262            while (base64bits >= 6) {
2263                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2264                base64bits -= 6;
2265            }
2266            /* prepare second surrogate */
2267            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2268        }
2269#endif
2270        base64bits += 16;
2271        base64buffer = (base64buffer << 16) | ch;
2272        while (base64bits >= 6) {
2273            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2274            base64bits -= 6;
2275        }
2276    }
2277    if (base64bits)
2278        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2279    if (inShift)
2280        *out++ = '-';
2281    if (_PyBytes_Resize(&v, out - start) < 0)
2282        return NULL;
2283    return v;
2284}
2285
2286#undef IS_BASE64
2287#undef FROM_BASE64
2288#undef TO_BASE64
2289#undef DECODE_DIRECT
2290#undef ENCODE_DIRECT
2291
2292/* --- UTF-8 Codec -------------------------------------------------------- */
2293
static
const char utf8_code_length[256] = {
    /* Map UTF-8 encoded prefix byte to sequence length.  Zero means
       illegal prefix.  See RFC 3629 for details.  The table is only
       ever read, so it is declared const. */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2315
2316PyObject *PyUnicode_DecodeUTF8(const char *s,
2317                               Py_ssize_t size,
2318                               const char *errors)
2319{
2320    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2321}
2322
/* Mask to check or force alignment of a pointer to C 'long' boundaries.
   The expansion is fully parenthesized so that it behaves as a single
   operand in any expression. */
#define LONG_PTR_MASK ((size_t) (SIZEOF_LONG - 1))
2325
2326/* Mask to quickly check whether a C 'long' contains a
2327   non-ASCII, UTF8-encoded char. */
2328#if (SIZEOF_LONG == 8)
2329# define ASCII_CHAR_MASK 0x8080808080808080L
2330#elif (SIZEOF_LONG == 4)
2331# define ASCII_CHAR_MASK 0x80808080L
2332#else
2333# error C 'long' size should be either 4 or 8!
2334#endif
2335
2336PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2337                                       Py_ssize_t size,
2338                                       const char *errors,
2339                                       Py_ssize_t *consumed)
2340{
2341    const char *starts = s;
2342    int n;
2343    int k;
2344    Py_ssize_t startinpos;
2345    Py_ssize_t endinpos;
2346    Py_ssize_t outpos;
2347    const char *e, *aligned_end;
2348    PyUnicodeObject *unicode;
2349    Py_UNICODE *p;
2350    const char *errmsg = "";
2351    PyObject *errorHandler = NULL;
2352    PyObject *exc = NULL;
2353
2354    /* Note: size will always be longer than the resulting Unicode
2355       character count */
2356    unicode = _PyUnicode_New(size);
2357    if (!unicode)
2358        return NULL;
2359    if (size == 0) {
2360        if (consumed)
2361            *consumed = 0;
2362        return (PyObject *)unicode;
2363    }
2364
2365    /* Unpack UTF-8 encoded data */
2366    p = unicode->str;
2367    e = s + size;
2368    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2369
2370    while (s < e) {
2371        Py_UCS4 ch = (unsigned char)*s;
2372
2373        if (ch < 0x80) {
2374            /* Fast path for runs of ASCII characters. Given that common UTF-8
2375               input will consist of an overwhelming majority of ASCII
2376               characters, we try to optimize for this case by checking
2377               as many characters as a C 'long' can contain.
2378               First, check if we can do an aligned read, as most CPUs have
2379               a penalty for unaligned reads.
2380            */
2381            if (!((size_t) s & LONG_PTR_MASK)) {
2382                /* Help register allocation */
2383                register const char *_s = s;
2384                register Py_UNICODE *_p = p;
2385                while (_s < aligned_end) {
2386                    /* Read a whole long at a time (either 4 or 8 bytes),
2387                       and do a fast unrolled copy if it only contains ASCII
2388                       characters. */
2389                    unsigned long data = *(unsigned long *) _s;
2390                    if (data & ASCII_CHAR_MASK)
2391                        break;
2392                    _p[0] = (unsigned char) _s[0];
2393                    _p[1] = (unsigned char) _s[1];
2394                    _p[2] = (unsigned char) _s[2];
2395                    _p[3] = (unsigned char) _s[3];
2396#if (SIZEOF_LONG == 8)
2397                    _p[4] = (unsigned char) _s[4];
2398                    _p[5] = (unsigned char) _s[5];
2399                    _p[6] = (unsigned char) _s[6];
2400                    _p[7] = (unsigned char) _s[7];
2401#endif
2402                    _s += SIZEOF_LONG;
2403                    _p += SIZEOF_LONG;
2404                }
2405                s = _s;
2406                p = _p;
2407                if (s == e)
2408                    break;
2409                ch = (unsigned char)*s;
2410            }
2411        }
2412
2413        if (ch < 0x80) {
2414            *p++ = (Py_UNICODE)ch;
2415            s++;
2416            continue;
2417        }
2418
2419        n = utf8_code_length[ch];
2420
2421        if (s + n > e) {
2422            if (consumed)
2423                break;
2424            else {
2425                errmsg = "unexpected end of data";
2426                startinpos = s-starts;
2427                endinpos = startinpos+1;
2428                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2429                    endinpos++;
2430                goto utf8Error;
2431            }
2432        }
2433
2434        switch (n) {
2435
2436        case 0:
2437            errmsg = "invalid start byte";
2438            startinpos = s-starts;
2439            endinpos = startinpos+1;
2440            goto utf8Error;
2441
2442        case 1:
2443            errmsg = "internal error";
2444            startinpos = s-starts;
2445            endinpos = startinpos+1;
2446            goto utf8Error;
2447
2448        case 2:
2449            if ((s[1] & 0xc0) != 0x80) {
2450                errmsg = "invalid continuation byte";
2451                startinpos = s-starts;
2452                endinpos = startinpos + 1;
2453                goto utf8Error;
2454            }
2455            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2456            assert ((ch > 0x007F) && (ch <= 0x07FF));
2457            *p++ = (Py_UNICODE)ch;
2458            break;
2459
2460        case 3:
2461            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2462               will result in surrogates in range d800-dfff. Surrogates are
2463               not valid UTF-8 so they are rejected.
2464               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2465               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2466            if ((s[1] & 0xc0) != 0x80 ||
2467                (s[2] & 0xc0) != 0x80 ||
2468                ((unsigned char)s[0] == 0xE0 &&
2469                 (unsigned char)s[1] < 0xA0) ||
2470                ((unsigned char)s[0] == 0xED &&
2471                 (unsigned char)s[1] > 0x9F)) {
2472                errmsg = "invalid continuation byte";
2473                startinpos = s-starts;
2474                endinpos = startinpos + 1;
2475
2476                /* if s[1] first two bits are 1 and 0, then the invalid
2477                   continuation byte is s[2], so increment endinpos by 1,
2478                   if not, s[1] is invalid and endinpos doesn't need to
2479                   be incremented. */
2480                if ((s[1] & 0xC0) == 0x80)
2481                    endinpos++;
2482                goto utf8Error;
2483            }
2484            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2485            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2486            *p++ = (Py_UNICODE)ch;
2487            break;
2488
2489        case 4:
2490            if ((s[1] & 0xc0) != 0x80 ||
2491                (s[2] & 0xc0) != 0x80 ||
2492                (s[3] & 0xc0) != 0x80 ||
2493                ((unsigned char)s[0] == 0xF0 &&
2494                 (unsigned char)s[1] < 0x90) ||
2495                ((unsigned char)s[0] == 0xF4 &&
2496                 (unsigned char)s[1] > 0x8F)) {
2497                errmsg = "invalid continuation byte";
2498                startinpos = s-starts;
2499                endinpos = startinpos + 1;
2500                if ((s[1] & 0xC0) == 0x80) {
2501                    endinpos++;
2502                    if ((s[2] & 0xC0) == 0x80)
2503                        endinpos++;
2504                }
2505                goto utf8Error;
2506            }
2507            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2508                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2509            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2510
2511#ifdef Py_UNICODE_WIDE
2512            *p++ = (Py_UNICODE)ch;
2513#else
2514            /*  compute and append the two surrogates: */
2515
2516            /*  translate from 10000..10FFFF to 0..FFFF */
2517            ch -= 0x10000;
2518
2519            /*  high surrogate = top 10 bits added to D800 */
2520            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2521
2522            /*  low surrogate = bottom 10 bits added to DC00 */
2523            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2524#endif
2525            break;
2526        }
2527        s += n;
2528        continue;
2529
2530      utf8Error:
2531        outpos = p-PyUnicode_AS_UNICODE(unicode);
2532        if (unicode_decode_call_errorhandler(
2533                errors, &errorHandler,
2534                "utf8", errmsg,
2535                &starts, &e, &startinpos, &endinpos, &exc, &s,
2536                &unicode, &outpos, &p))
2537            goto onError;
2538        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2539    }
2540    if (consumed)
2541        *consumed = s-starts;
2542
2543    /* Adjust length */
2544    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2545        goto onError;
2546
2547    Py_XDECREF(errorHandler);
2548    Py_XDECREF(exc);
2549    return (PyObject *)unicode;
2550
2551  onError:
2552    Py_XDECREF(errorHandler);
2553    Py_XDECREF(exc);
2554    Py_DECREF(unicode);
2555    return NULL;
2556}
2557
2558#undef ASCII_CHAR_MASK
2559
2560
2561/* Allocation strategy:  if the string is short, convert into a stack buffer
2562   and allocate exactly as much space needed at the end.  Else allocate the
2563   maximum possible needed (4 result bytes per Unicode character), and return
2564   the excess memory at the end.
2565*/
2566PyObject *
2567PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2568                     Py_ssize_t size,
2569                     const char *errors)
2570{
2571#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2572
2573    Py_ssize_t i;                /* index into s of next input byte */
2574    PyObject *result;            /* result string object */
2575    char *p;                     /* next free byte in output buffer */
2576    Py_ssize_t nallocated;      /* number of result bytes allocated */
2577    Py_ssize_t nneeded;            /* number of result bytes needed */
2578    char stackbuf[MAX_SHORT_UNICHARS * 4];
2579    PyObject *errorHandler = NULL;
2580    PyObject *exc = NULL;
2581
2582    assert(s != NULL);
2583    assert(size >= 0);
2584
2585    if (size <= MAX_SHORT_UNICHARS) {
2586        /* Write into the stack buffer; nallocated can't overflow.
2587         * At the end, we'll allocate exactly as much heap space as it
2588         * turns out we need.
2589         */
2590        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2591        result = NULL;   /* will allocate after we're done */
2592        p = stackbuf;
2593    }
2594    else {
2595        /* Overallocate on the heap, and give the excess back at the end. */
2596        nallocated = size * 4;
2597        if (nallocated / 4 != size)  /* overflow! */
2598            return PyErr_NoMemory();
2599        result = PyBytes_FromStringAndSize(NULL, nallocated);
2600        if (result == NULL)
2601            return NULL;
2602        p = PyBytes_AS_STRING(result);
2603    }
2604
2605    for (i = 0; i < size;) {
2606        Py_UCS4 ch = s[i++];
2607
2608        if (ch < 0x80)
2609            /* Encode ASCII */
2610            *p++ = (char) ch;
2611
2612        else if (ch < 0x0800) {
2613            /* Encode Latin-1 */
2614            *p++ = (char)(0xc0 | (ch >> 6));
2615            *p++ = (char)(0x80 | (ch & 0x3f));
2616        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2617#ifndef Py_UNICODE_WIDE
2618            /* Special case: check for high and low surrogate */
2619            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2620                Py_UCS4 ch2 = s[i];
2621                /* Combine the two surrogates to form a UCS4 value */
2622                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2623                i++;
2624
2625                /* Encode UCS4 Unicode ordinals */
2626                *p++ = (char)(0xf0 | (ch >> 18));
2627                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2628                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2629                *p++ = (char)(0x80 | (ch & 0x3f));
2630            } else {
2631#endif
2632                Py_ssize_t newpos;
2633                PyObject *rep;
2634                Py_ssize_t repsize, k;
2635                rep = unicode_encode_call_errorhandler
2636                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2637                     s, size, &exc, i-1, i, &newpos);
2638                if (!rep)
2639                    goto error;
2640
2641                if (PyBytes_Check(rep))
2642                    repsize = PyBytes_GET_SIZE(rep);
2643                else
2644                    repsize = PyUnicode_GET_SIZE(rep);
2645
2646                if (repsize > 4) {
2647                    Py_ssize_t offset;
2648
2649                    if (result == NULL)
2650                        offset = p - stackbuf;
2651                    else
2652                        offset = p - PyBytes_AS_STRING(result);
2653
2654                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2655                        /* integer overflow */
2656                        PyErr_NoMemory();
2657                        goto error;
2658                    }
2659                    nallocated += repsize - 4;
2660                    if (result != NULL) {
2661                        if (_PyBytes_Resize(&result, nallocated) < 0)
2662                            goto error;
2663                    } else {
2664                        result = PyBytes_FromStringAndSize(NULL, nallocated);
2665                        if (result == NULL)
2666                            goto error;
2667                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
2668                    }
2669                    p = PyBytes_AS_STRING(result) + offset;
2670                }
2671
2672                if (PyBytes_Check(rep)) {
2673                    char *prep = PyBytes_AS_STRING(rep);
2674                    for(k = repsize; k > 0; k--)
2675                        *p++ = *prep++;
2676                } else /* rep is unicode */ {
2677                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
2678                    Py_UNICODE c;
2679
2680                    for(k=0; k<repsize; k++) {
2681                        c = prep[k];
2682                        if (0x80 <= c) {
2683                            raise_encode_exception(&exc, "utf-8", s, size,
2684                                                   i-1, i, "surrogates not allowed");
2685                            goto error;
2686                        }
2687                        *p++ = (char)prep[k];
2688                    }
2689                }
2690                Py_DECREF(rep);
2691#ifndef Py_UNICODE_WIDE
2692            }
2693#endif
2694        } else if (ch < 0x10000) {
2695            *p++ = (char)(0xe0 | (ch >> 12));
2696            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2697            *p++ = (char)(0x80 | (ch & 0x3f));
2698        } else /* ch >= 0x10000 */ {
2699            /* Encode UCS4 Unicode ordinals */
2700            *p++ = (char)(0xf0 | (ch >> 18));
2701            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2702            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2703            *p++ = (char)(0x80 | (ch & 0x3f));
2704        }
2705    }
2706
2707    if (result == NULL) {
2708        /* This was stack allocated. */
2709        nneeded = p - stackbuf;
2710        assert(nneeded <= nallocated);
2711        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
2712    }
2713    else {
2714        /* Cut back to size actually needed. */
2715        nneeded = p - PyBytes_AS_STRING(result);
2716        assert(nneeded <= nallocated);
2717        _PyBytes_Resize(&result, nneeded);
2718    }
2719    Py_XDECREF(errorHandler);
2720    Py_XDECREF(exc);
2721    return result;
2722 error:
2723    Py_XDECREF(errorHandler);
2724    Py_XDECREF(exc);
2725    Py_XDECREF(result);
2726    return NULL;
2727
2728#undef MAX_SHORT_UNICHARS
2729}
2730
2731PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
2732{
2733    if (!PyUnicode_Check(unicode)) {
2734        PyErr_BadArgument();
2735        return NULL;
2736    }
2737    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
2738                                PyUnicode_GET_SIZE(unicode),
2739                                NULL);
2740}
2741
2742/* --- UTF-32 Codec ------------------------------------------------------- */
2743
2744PyObject *
2745PyUnicode_DecodeUTF32(const char *s,
2746                      Py_ssize_t size,
2747                      const char *errors,
2748                      int *byteorder)
2749{
2750    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
2751}
2752
2753PyObject *
2754PyUnicode_DecodeUTF32Stateful(const char *s,
2755                              Py_ssize_t size,
2756                              const char *errors,
2757                              int *byteorder,
2758                              Py_ssize_t *consumed)
2759{
2760    const char *starts = s;
2761    Py_ssize_t startinpos;
2762    Py_ssize_t endinpos;
2763    Py_ssize_t outpos;
2764    PyUnicodeObject *unicode;
2765    Py_UNICODE *p;
2766#ifndef Py_UNICODE_WIDE
2767    int pairs = 0;
2768    const unsigned char *qq;
2769#else
2770    const int pairs = 0;
2771#endif
2772    const unsigned char *q, *e;
2773    int bo = 0;       /* assume native ordering by default */
2774    const char *errmsg = "";
2775    /* Offsets from q for retrieving bytes in the right order. */
2776#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2777    int iorder[] = {0, 1, 2, 3};
2778#else
2779    int iorder[] = {3, 2, 1, 0};
2780#endif
2781    PyObject *errorHandler = NULL;
2782    PyObject *exc = NULL;
2783
2784    q = (unsigned char *)s;
2785    e = q + size;
2786
2787    if (byteorder)
2788        bo = *byteorder;
2789
2790    /* Check for BOM marks (U+FEFF) in the input and adjust current
2791       byte order setting accordingly. In native mode, the leading BOM
2792       mark is skipped, in all other modes, it is copied to the output
2793       stream as-is (giving a ZWNBSP character). */
2794    if (bo == 0) {
2795        if (size >= 4) {
2796            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2797                (q[iorder[1]] << 8) | q[iorder[0]];
2798#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2799            if (bom == 0x0000FEFF) {
2800                q += 4;
2801                bo = -1;
2802            }
2803            else if (bom == 0xFFFE0000) {
2804                q += 4;
2805                bo = 1;
2806            }
2807#else
2808            if (bom == 0x0000FEFF) {
2809                q += 4;
2810                bo = 1;
2811            }
2812            else if (bom == 0xFFFE0000) {
2813                q += 4;
2814                bo = -1;
2815            }
2816#endif
2817        }
2818    }
2819
2820    if (bo == -1) {
2821        /* force LE */
2822        iorder[0] = 0;
2823        iorder[1] = 1;
2824        iorder[2] = 2;
2825        iorder[3] = 3;
2826    }
2827    else if (bo == 1) {
2828        /* force BE */
2829        iorder[0] = 3;
2830        iorder[1] = 2;
2831        iorder[2] = 1;
2832        iorder[3] = 0;
2833    }
2834
2835    /* On narrow builds we split characters outside the BMP into two
2836       codepoints => count how much extra space we need. */
2837#ifndef Py_UNICODE_WIDE
2838    for (qq = q; qq < e; qq += 4)
2839        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
2840            pairs++;
2841#endif
2842
2843    /* This might be one to much, because of a BOM */
2844    unicode = _PyUnicode_New((size+3)/4+pairs);
2845    if (!unicode)
2846        return NULL;
2847    if (size == 0)
2848        return (PyObject *)unicode;
2849
2850    /* Unpack UTF-32 encoded data */
2851    p = unicode->str;
2852
2853    while (q < e) {
2854        Py_UCS4 ch;
2855        /* remaining bytes at the end? (size should be divisible by 4) */
2856        if (e-q<4) {
2857            if (consumed)
2858                break;
2859            errmsg = "truncated data";
2860            startinpos = ((const char *)q)-starts;
2861            endinpos = ((const char *)e)-starts;
2862            goto utf32Error;
2863            /* The remaining input chars are ignored if the callback
2864               chooses to skip the input */
2865        }
2866        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
2867            (q[iorder[1]] << 8) | q[iorder[0]];
2868
2869        if (ch >= 0x110000)
2870        {
2871            errmsg = "codepoint not in range(0x110000)";
2872            startinpos = ((const char *)q)-starts;
2873            endinpos = startinpos+4;
2874            goto utf32Error;
2875        }
2876#ifndef Py_UNICODE_WIDE
2877        if (ch >= 0x10000)
2878        {
2879            *p++ = 0xD800 | ((ch-0x10000) >> 10);
2880            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
2881        }
2882        else
2883#endif
2884            *p++ = ch;
2885        q += 4;
2886        continue;
2887      utf32Error:
2888        outpos = p-PyUnicode_AS_UNICODE(unicode);
2889        if (unicode_decode_call_errorhandler(
2890                errors, &errorHandler,
2891                "utf32", errmsg,
2892                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
2893                &unicode, &outpos, &p))
2894            goto onError;
2895    }
2896
2897    if (byteorder)
2898        *byteorder = bo;
2899
2900    if (consumed)
2901        *consumed = (const char *)q-starts;
2902
2903    /* Adjust length */
2904    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2905        goto onError;
2906
2907    Py_XDECREF(errorHandler);
2908    Py_XDECREF(exc);
2909    return (PyObject *)unicode;
2910
2911  onError:
2912    Py_DECREF(unicode);
2913    Py_XDECREF(errorHandler);
2914    Py_XDECREF(exc);
2915    return NULL;
2916}
2917
2918PyObject *
2919PyUnicode_EncodeUTF32(const Py_UNICODE *s,
2920                      Py_ssize_t size,
2921                      const char *errors,
2922                      int byteorder)
2923{
2924    PyObject *v;
2925    unsigned char *p;
2926    Py_ssize_t nsize, bytesize;
2927#ifndef Py_UNICODE_WIDE
2928    Py_ssize_t i, pairs;
2929#else
2930    const int pairs = 0;
2931#endif
2932    /* Offsets from p for storing byte pairs in the right order. */
2933#ifdef BYTEORDER_IS_LITTLE_ENDIAN
2934    int iorder[] = {0, 1, 2, 3};
2935#else
2936    int iorder[] = {3, 2, 1, 0};
2937#endif
2938
2939#define STORECHAR(CH)                           \
2940    do {                                        \
2941        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
2942        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
2943        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
2944        p[iorder[0]] = (CH) & 0xff;             \
2945        p += 4;                                 \
2946    } while(0)
2947
2948    /* In narrow builds we can output surrogate pairs as one codepoint,
2949       so we need less space. */
2950#ifndef Py_UNICODE_WIDE
2951    for (i = pairs = 0; i < size-1; i++)
2952        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
2953            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
2954            pairs++;
2955#endif
2956    nsize = (size - pairs + (byteorder == 0));
2957    bytesize = nsize * 4;
2958    if (bytesize / 4 != nsize)
2959        return PyErr_NoMemory();
2960    v = PyBytes_FromStringAndSize(NULL, bytesize);
2961    if (v == NULL)
2962        return NULL;
2963
2964    p = (unsigned char *)PyBytes_AS_STRING(v);
2965    if (byteorder == 0)
2966        STORECHAR(0xFEFF);
2967    if (size == 0)
2968        goto done;
2969
2970    if (byteorder == -1) {
2971        /* force LE */
2972        iorder[0] = 0;
2973        iorder[1] = 1;
2974        iorder[2] = 2;
2975        iorder[3] = 3;
2976    }
2977    else if (byteorder == 1) {
2978        /* force BE */
2979        iorder[0] = 3;
2980        iorder[1] = 2;
2981        iorder[2] = 1;
2982        iorder[3] = 0;
2983    }
2984
2985    while (size-- > 0) {
2986        Py_UCS4 ch = *s++;
2987#ifndef Py_UNICODE_WIDE
2988        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
2989            Py_UCS4 ch2 = *s;
2990            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
2991                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
2992                s++;
2993                size--;
2994            }
2995        }
2996#endif
2997        STORECHAR(ch);
2998    }
2999
3000  done:
3001    return v;
3002#undef STORECHAR
3003}
3004
3005PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3006{
3007    if (!PyUnicode_Check(unicode)) {
3008        PyErr_BadArgument();
3009        return NULL;
3010    }
3011    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3012                                 PyUnicode_GET_SIZE(unicode),
3013                                 NULL,
3014                                 0);
3015}
3016
3017/* --- UTF-16 Codec ------------------------------------------------------- */
3018
3019PyObject *
3020PyUnicode_DecodeUTF16(const char *s,
3021                      Py_ssize_t size,
3022                      const char *errors,
3023                      int *byteorder)
3024{
3025    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3026}
3027
3028/* Two masks for fast checking of whether a C 'long' may contain
3029   UTF16-encoded surrogate characters. This is an efficient heuristic,
3030   assuming that non-surrogate characters with a code point >= 0x8000 are
3031   rare in most input.
3032   FAST_CHAR_MASK is used when the input is in native byte ordering,
3033   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3034*/
3035#if (SIZEOF_LONG == 8)
3036# define FAST_CHAR_MASK         0x8000800080008000L
3037# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3038#elif (SIZEOF_LONG == 4)
3039# define FAST_CHAR_MASK         0x80008000L
3040# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3041#else
3042# error C 'long' size should be either 4 or 8!
3043#endif
3044
3045PyObject *
3046PyUnicode_DecodeUTF16Stateful(const char *s,
3047                              Py_ssize_t size,
3048                              const char *errors,
3049                              int *byteorder,
3050                              Py_ssize_t *consumed)
3051{
3052    const char *starts = s;
3053    Py_ssize_t startinpos;
3054    Py_ssize_t endinpos;
3055    Py_ssize_t outpos;
3056    PyUnicodeObject *unicode;
3057    Py_UNICODE *p;
3058    const unsigned char *q, *e, *aligned_end;
3059    int bo = 0;       /* assume native ordering by default */
3060    int native_ordering = 0;
3061    const char *errmsg = "";
3062    /* Offsets from q for retrieving byte pairs in the right order. */
3063#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3064    int ihi = 1, ilo = 0;
3065#else
3066    int ihi = 0, ilo = 1;
3067#endif
3068    PyObject *errorHandler = NULL;
3069    PyObject *exc = NULL;
3070
3071    /* Note: size will always be longer than the resulting Unicode
3072       character count */
3073    unicode = _PyUnicode_New(size);
3074    if (!unicode)
3075        return NULL;
3076    if (size == 0)
3077        return (PyObject *)unicode;
3078
3079    /* Unpack UTF-16 encoded data */
3080    p = unicode->str;
3081    q = (unsigned char *)s;
3082    e = q + size - 1;
3083
3084    if (byteorder)
3085        bo = *byteorder;
3086
3087    /* Check for BOM marks (U+FEFF) in the input and adjust current
3088       byte order setting accordingly. In native mode, the leading BOM
3089       mark is skipped, in all other modes, it is copied to the output
3090       stream as-is (giving a ZWNBSP character). */
3091    if (bo == 0) {
3092        if (size >= 2) {
3093            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3094#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3095            if (bom == 0xFEFF) {
3096                q += 2;
3097                bo = -1;
3098            }
3099            else if (bom == 0xFFFE) {
3100                q += 2;
3101                bo = 1;
3102            }
3103#else
3104            if (bom == 0xFEFF) {
3105                q += 2;
3106                bo = 1;
3107            }
3108            else if (bom == 0xFFFE) {
3109                q += 2;
3110                bo = -1;
3111            }
3112#endif
3113        }
3114    }
3115
3116    if (bo == -1) {
3117        /* force LE */
3118        ihi = 1;
3119        ilo = 0;
3120    }
3121    else if (bo == 1) {
3122        /* force BE */
3123        ihi = 0;
3124        ilo = 1;
3125    }
3126#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3127    native_ordering = ilo < ihi;
3128#else
3129    native_ordering = ilo > ihi;
3130#endif
3131
3132    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3133    while (q < e) {
3134        Py_UNICODE ch;
3135        /* First check for possible aligned read of a C 'long'. Unaligned
3136           reads are more expensive, better to defer to another iteration. */
3137        if (!((size_t) q & LONG_PTR_MASK)) {
3138            /* Fast path for runs of non-surrogate chars. */
3139            register const unsigned char *_q = q;
3140            Py_UNICODE *_p = p;
3141            if (native_ordering) {
3142                /* Native ordering is simple: as long as the input cannot
3143                   possibly contain a surrogate char, do an unrolled copy
3144                   of several 16-bit code points to the target object.
3145                   The non-surrogate check is done on several input bytes
3146                   at a time (as many as a C 'long' can contain). */
3147                while (_q < aligned_end) {
3148                    unsigned long data = * (unsigned long *) _q;
3149                    if (data & FAST_CHAR_MASK)
3150                        break;
3151                    _p[0] = ((unsigned short *) _q)[0];
3152                    _p[1] = ((unsigned short *) _q)[1];
3153#if (SIZEOF_LONG == 8)
3154                    _p[2] = ((unsigned short *) _q)[2];
3155                    _p[3] = ((unsigned short *) _q)[3];
3156#endif
3157                    _q += SIZEOF_LONG;
3158                    _p += SIZEOF_LONG / 2;
3159                }
3160            }
3161            else {
3162                /* Byteswapped ordering is similar, but we must decompose
3163                   the copy bytewise, and take care of zero'ing out the
3164                   upper bytes if the target object is in 32-bit units
3165                   (that is, in UCS-4 builds). */
3166                while (_q < aligned_end) {
3167                    unsigned long data = * (unsigned long *) _q;
3168                    if (data & SWAPPED_FAST_CHAR_MASK)
3169                        break;
3170                    /* Zero upper bytes in UCS-4 builds */
3171#if (Py_UNICODE_SIZE > 2)
3172                    _p[0] = 0;
3173                    _p[1] = 0;
3174#if (SIZEOF_LONG == 8)
3175                    _p[2] = 0;
3176                    _p[3] = 0;
3177#endif
3178#endif
3179                    /* Issue #4916; UCS-4 builds on big endian machines must
3180                       fill the two last bytes of each 4-byte unit. */
3181#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3182# define OFF 2
3183#else
3184# define OFF 0
3185#endif
3186                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3187                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3188                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3189                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3190#if (SIZEOF_LONG == 8)
3191                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3192                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3193                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3194                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3195#endif
3196#undef OFF
3197                    _q += SIZEOF_LONG;
3198                    _p += SIZEOF_LONG / 2;
3199                }
3200            }
3201            p = _p;
3202            q = _q;
3203            if (q >= e)
3204                break;
3205        }
3206        ch = (q[ihi] << 8) | q[ilo];
3207
3208        q += 2;
3209
3210        if (ch < 0xD800 || ch > 0xDFFF) {
3211            *p++ = ch;
3212            continue;
3213        }
3214
3215        /* UTF-16 code pair: */
3216        if (q > e) {
3217            errmsg = "unexpected end of data";
3218            startinpos = (((const char *)q) - 2) - starts;
3219            endinpos = ((const char *)e) + 1 - starts;
3220            goto utf16Error;
3221        }
3222        if (0xD800 <= ch && ch <= 0xDBFF) {
3223            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3224            q += 2;
3225            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3226#ifndef Py_UNICODE_WIDE
3227                *p++ = ch;
3228                *p++ = ch2;
3229#else
3230                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3231#endif
3232                continue;
3233            }
3234            else {
3235                errmsg = "illegal UTF-16 surrogate";
3236                startinpos = (((const char *)q)-4)-starts;
3237                endinpos = startinpos+2;
3238                goto utf16Error;
3239            }
3240
3241        }
3242        errmsg = "illegal encoding";
3243        startinpos = (((const char *)q)-2)-starts;
3244        endinpos = startinpos+2;
3245        /* Fall through to report the error */
3246
3247      utf16Error:
3248        outpos = p - PyUnicode_AS_UNICODE(unicode);
3249        if (unicode_decode_call_errorhandler(
3250                errors,
3251                &errorHandler,
3252                "utf16", errmsg,
3253                &starts,
3254                (const char **)&e,
3255                &startinpos,
3256                &endinpos,
3257                &exc,
3258                (const char **)&q,
3259                &unicode,
3260                &outpos,
3261                &p))
3262            goto onError;
3263    }
3264    /* remaining byte at the end? (size should be even) */
3265    if (e == q) {
3266        if (!consumed) {
3267            errmsg = "truncated data";
3268            startinpos = ((const char *)q) - starts;
3269            endinpos = ((const char *)e) + 1 - starts;
3270            outpos = p - PyUnicode_AS_UNICODE(unicode);
3271            if (unicode_decode_call_errorhandler(
3272                    errors,
3273                    &errorHandler,
3274                    "utf16", errmsg,
3275                    &starts,
3276                    (const char **)&e,
3277                    &startinpos,
3278                    &endinpos,
3279                    &exc,
3280                    (const char **)&q,
3281                    &unicode,
3282                    &outpos,
3283                    &p))
3284                goto onError;
3285            /* The remaining input chars are ignored if the callback
3286               chooses to skip the input */
3287        }
3288    }
3289
3290    if (byteorder)
3291        *byteorder = bo;
3292
3293    if (consumed)
3294        *consumed = (const char *)q-starts;
3295
3296    /* Adjust length */
3297    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3298        goto onError;
3299
3300    Py_XDECREF(errorHandler);
3301    Py_XDECREF(exc);
3302    return (PyObject *)unicode;
3303
3304  onError:
3305    Py_DECREF(unicode);
3306    Py_XDECREF(errorHandler);
3307    Py_XDECREF(exc);
3308    return NULL;
3309}
3310
3311#undef FAST_CHAR_MASK
3312#undef SWAPPED_FAST_CHAR_MASK
3313
3314PyObject *
3315PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3316                      Py_ssize_t size,
3317                      const char *errors,
3318                      int byteorder)
3319{
3320    PyObject *v;
3321    unsigned char *p;
3322    Py_ssize_t nsize, bytesize;
3323#ifdef Py_UNICODE_WIDE
3324    Py_ssize_t i, pairs;
3325#else
3326    const int pairs = 0;
3327#endif
3328    /* Offsets from p for storing byte pairs in the right order. */
3329#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3330    int ihi = 1, ilo = 0;
3331#else
3332    int ihi = 0, ilo = 1;
3333#endif
3334
3335#define STORECHAR(CH)                           \
3336    do {                                        \
3337        p[ihi] = ((CH) >> 8) & 0xff;            \
3338        p[ilo] = (CH) & 0xff;                   \
3339        p += 2;                                 \
3340    } while(0)
3341
3342#ifdef Py_UNICODE_WIDE
3343    for (i = pairs = 0; i < size; i++)
3344        if (s[i] >= 0x10000)
3345            pairs++;
3346#endif
3347    /* 2 * (size + pairs + (byteorder == 0)) */
3348    if (size > PY_SSIZE_T_MAX ||
3349        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3350        return PyErr_NoMemory();
3351    nsize = size + pairs + (byteorder == 0);
3352    bytesize = nsize * 2;
3353    if (bytesize / 2 != nsize)
3354        return PyErr_NoMemory();
3355    v = PyBytes_FromStringAndSize(NULL, bytesize);
3356    if (v == NULL)
3357        return NULL;
3358
3359    p = (unsigned char *)PyBytes_AS_STRING(v);
3360    if (byteorder == 0)
3361        STORECHAR(0xFEFF);
3362    if (size == 0)
3363        goto done;
3364
3365    if (byteorder == -1) {
3366        /* force LE */
3367        ihi = 1;
3368        ilo = 0;
3369    }
3370    else if (byteorder == 1) {
3371        /* force BE */
3372        ihi = 0;
3373        ilo = 1;
3374    }
3375
3376    while (size-- > 0) {
3377        Py_UNICODE ch = *s++;
3378        Py_UNICODE ch2 = 0;
3379#ifdef Py_UNICODE_WIDE
3380        if (ch >= 0x10000) {
3381            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3382            ch  = 0xD800 | ((ch-0x10000) >> 10);
3383        }
3384#endif
3385        STORECHAR(ch);
3386        if (ch2)
3387            STORECHAR(ch2);
3388    }
3389
3390  done:
3391    return v;
3392#undef STORECHAR
3393}
3394
3395PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3396{
3397    if (!PyUnicode_Check(unicode)) {
3398        PyErr_BadArgument();
3399        return NULL;
3400    }
3401    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3402                                 PyUnicode_GET_SIZE(unicode),
3403                                 NULL,
3404                                 0);
3405}
3406
3407/* --- Unicode Escape Codec ----------------------------------------------- */
3408
/* Cached pointer to the character-name lookup C API.  It starts out NULL
   and is filled in by PyUnicode_DecodeUnicodeEscape(), through
   PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1), the first time a
   \N{...} escape has to be resolved. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3410
3411PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3412                                        Py_ssize_t size,
3413                                        const char *errors)
3414{
3415    const char *starts = s;
3416    Py_ssize_t startinpos;
3417    Py_ssize_t endinpos;
3418    Py_ssize_t outpos;
3419    int i;
3420    PyUnicodeObject *v;
3421    Py_UNICODE *p;
3422    const char *end;
3423    char* message;
3424    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3425    PyObject *errorHandler = NULL;
3426    PyObject *exc = NULL;
3427
3428    /* Escaped strings will always be longer than the resulting
3429       Unicode string, so we start with size here and then reduce the
3430       length after conversion to the true value.
3431       (but if the error callback returns a long replacement string
3432       we'll have to allocate more space) */
3433    v = _PyUnicode_New(size);
3434    if (v == NULL)
3435        goto onError;
3436    if (size == 0)
3437        return (PyObject *)v;
3438
3439    p = PyUnicode_AS_UNICODE(v);
3440    end = s + size;
3441
3442    while (s < end) {
3443        unsigned char c;
3444        Py_UNICODE x;
3445        int digits;
3446
3447        /* Non-escape characters are interpreted as Unicode ordinals */
3448        if (*s != '\\') {
3449            *p++ = (unsigned char) *s++;
3450            continue;
3451        }
3452
3453        startinpos = s-starts;
3454        /* \ - Escapes */
3455        s++;
3456        c = *s++;
3457        if (s > end)
3458            c = '\0'; /* Invalid after \ */
3459        switch (c) {
3460
3461            /* \x escapes */
3462        case '\n': break;
3463        case '\\': *p++ = '\\'; break;
3464        case '\'': *p++ = '\''; break;
3465        case '\"': *p++ = '\"'; break;
3466        case 'b': *p++ = '\b'; break;
3467        case 'f': *p++ = '\014'; break; /* FF */
3468        case 't': *p++ = '\t'; break;
3469        case 'n': *p++ = '\n'; break;
3470        case 'r': *p++ = '\r'; break;
3471        case 'v': *p++ = '\013'; break; /* VT */
3472        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3473
3474            /* \OOO (octal) escapes */
3475        case '0': case '1': case '2': case '3':
3476        case '4': case '5': case '6': case '7':
3477            x = s[-1] - '0';
3478            if (s < end && '0' <= *s && *s <= '7') {
3479                x = (x<<3) + *s++ - '0';
3480                if (s < end && '0' <= *s && *s <= '7')
3481                    x = (x<<3) + *s++ - '0';
3482            }
3483            *p++ = x;
3484            break;
3485
3486            /* hex escapes */
3487            /* \xXX */
3488        case 'x':
3489            digits = 2;
3490            message = "truncated \\xXX escape";
3491            goto hexescape;
3492
3493            /* \uXXXX */
3494        case 'u':
3495            digits = 4;
3496            message = "truncated \\uXXXX escape";
3497            goto hexescape;
3498
3499            /* \UXXXXXXXX */
3500        case 'U':
3501            digits = 8;
3502            message = "truncated \\UXXXXXXXX escape";
3503        hexescape:
3504            chr = 0;
3505            outpos = p-PyUnicode_AS_UNICODE(v);
3506            if (s+digits>end) {
3507                endinpos = size;
3508                if (unicode_decode_call_errorhandler(
3509                        errors, &errorHandler,
3510                        "unicodeescape", "end of string in escape sequence",
3511                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3512                        &v, &outpos, &p))
3513                    goto onError;
3514                goto nextByte;
3515            }
3516            for (i = 0; i < digits; ++i) {
3517                c = (unsigned char) s[i];
3518                if (!ISXDIGIT(c)) {
3519                    endinpos = (s+i+1)-starts;
3520                    if (unicode_decode_call_errorhandler(
3521                            errors, &errorHandler,
3522                            "unicodeescape", message,
3523                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3524                            &v, &outpos, &p))
3525                        goto onError;
3526                    goto nextByte;
3527                }
3528                chr = (chr<<4) & ~0xF;
3529                if (c >= '0' && c <= '9')
3530                    chr += c - '0';
3531                else if (c >= 'a' && c <= 'f')
3532                    chr += 10 + c - 'a';
3533                else
3534                    chr += 10 + c - 'A';
3535            }
3536            s += i;
3537            if (chr == 0xffffffff && PyErr_Occurred())
3538                /* _decoding_error will have already written into the
3539                   target buffer. */
3540                break;
3541        store:
3542            /* when we get here, chr is a 32-bit unicode character */
3543            if (chr <= 0xffff)
3544                /* UCS-2 character */
3545                *p++ = (Py_UNICODE) chr;
3546            else if (chr <= 0x10ffff) {
3547                /* UCS-4 character. Either store directly, or as
3548                   surrogate pair. */
3549#ifdef Py_UNICODE_WIDE
3550                *p++ = chr;
3551#else
3552                chr -= 0x10000L;
3553                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3554                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3555#endif
3556            } else {
3557                endinpos = s-starts;
3558                outpos = p-PyUnicode_AS_UNICODE(v);
3559                if (unicode_decode_call_errorhandler(
3560                        errors, &errorHandler,
3561                        "unicodeescape", "illegal Unicode character",
3562                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3563                        &v, &outpos, &p))
3564                    goto onError;
3565            }
3566            break;
3567
3568            /* \N{name} */
3569        case 'N':
3570            message = "malformed \\N character escape";
3571            if (ucnhash_CAPI == NULL) {
3572                /* load the unicode data module */
3573                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3574                if (ucnhash_CAPI == NULL)
3575                    goto ucnhashError;
3576            }
3577            if (*s == '{') {
3578                const char *start = s+1;
3579                /* look for the closing brace */
3580                while (*s != '}' && s < end)
3581                    s++;
3582                if (s > start && s < end && *s == '}') {
3583                    /* found a name.  look it up in the unicode database */
3584                    message = "unknown Unicode character name";
3585                    s++;
3586                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3587                        goto store;
3588                }
3589            }
3590            endinpos = s-starts;
3591            outpos = p-PyUnicode_AS_UNICODE(v);
3592            if (unicode_decode_call_errorhandler(
3593                    errors, &errorHandler,
3594                    "unicodeescape", message,
3595                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3596                    &v, &outpos, &p))
3597                goto onError;
3598            break;
3599
3600        default:
3601            if (s > end) {
3602                message = "\\ at end of string";
3603                s--;
3604                endinpos = s-starts;
3605                outpos = p-PyUnicode_AS_UNICODE(v);
3606                if (unicode_decode_call_errorhandler(
3607                        errors, &errorHandler,
3608                        "unicodeescape", message,
3609                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3610                        &v, &outpos, &p))
3611                    goto onError;
3612            }
3613            else {
3614                *p++ = '\\';
3615                *p++ = (unsigned char)s[-1];
3616            }
3617            break;
3618        }
3619      nextByte:
3620        ;
3621    }
3622    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3623        goto onError;
3624    Py_XDECREF(errorHandler);
3625    Py_XDECREF(exc);
3626    return (PyObject *)v;
3627
3628  ucnhashError:
3629    PyErr_SetString(
3630        PyExc_UnicodeError,
3631        "\\N escapes not supported (can't load unicodedata module)"
3632        );
3633    Py_XDECREF(v);
3634    Py_XDECREF(errorHandler);
3635    Py_XDECREF(exc);
3636    return NULL;
3637
3638  onError:
3639    Py_XDECREF(v);
3640    Py_XDECREF(errorHandler);
3641    Py_XDECREF(exc);
3642    return NULL;
3643}
3644
/* Return a Unicode-Escape string version of the Unicode object.

   Note: the encoder defined below takes only a buffer and a size; it has
   no "quotes" argument and never encloses its output in u"" or u''
   quotes.

*/
3651
3652Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3653                                             Py_ssize_t size,
3654                                             Py_UNICODE ch)
3655{
3656    /* like wcschr, but doesn't stop at NULL characters */
3657
3658    while (size-- > 0) {
3659        if (*s == ch)
3660            return s;
3661        s++;
3662    }
3663
3664    return NULL;
3665}
3666
/* Lower-case hexadecimal digits, indexed by nibble value (0-15).  The
   escape encoders in this file use it to format the digits of their
   '\x', '\u' and '\U' escapes. */
static const char *hexdigits = "0123456789abcdef";
3668
3669PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
3670                                        Py_ssize_t size)
3671{
3672    PyObject *repr;
3673    char *p;
3674
3675#ifdef Py_UNICODE_WIDE
3676    const Py_ssize_t expandsize = 10;
3677#else
3678    const Py_ssize_t expandsize = 6;
3679#endif
3680
3681    /* XXX(nnorwitz): rather than over-allocating, it would be
3682       better to choose a different scheme.  Perhaps scan the
3683       first N-chars of the string and allocate based on that size.
3684    */
3685    /* Initial allocation is based on the longest-possible unichr
3686       escape.
3687
3688       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
3689       unichr, so in this case it's the longest unichr escape. In
3690       narrow (UTF-16) builds this is five chars per source unichr
3691       since there are two unichrs in the surrogate pair, so in narrow
3692       (UTF-16) builds it's not the longest unichr escape.
3693
3694       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
3695       so in the narrow (UTF-16) build case it's the longest unichr
3696       escape.
3697    */
3698
3699    if (size == 0)
3700        return PyBytes_FromStringAndSize(NULL, 0);
3701
3702    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
3703        return PyErr_NoMemory();
3704
3705    repr = PyBytes_FromStringAndSize(NULL,
3706                                     2
3707                                     + expandsize*size
3708                                     + 1);
3709    if (repr == NULL)
3710        return NULL;
3711
3712    p = PyBytes_AS_STRING(repr);
3713
3714    while (size-- > 0) {
3715        Py_UNICODE ch = *s++;
3716
3717        /* Escape backslashes */
3718        if (ch == '\\') {
3719            *p++ = '\\';
3720            *p++ = (char) ch;
3721            continue;
3722        }
3723
3724#ifdef Py_UNICODE_WIDE
3725        /* Map 21-bit characters to '\U00xxxxxx' */
3726        else if (ch >= 0x10000) {
3727            *p++ = '\\';
3728            *p++ = 'U';
3729            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
3730            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
3731            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
3732            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
3733            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
3734            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
3735            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
3736            *p++ = hexdigits[ch & 0x0000000F];
3737            continue;
3738        }
3739#else
3740        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3741        else if (ch >= 0xD800 && ch < 0xDC00) {
3742            Py_UNICODE ch2;
3743            Py_UCS4 ucs;
3744
3745            ch2 = *s++;
3746            size--;
3747            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3748                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3749                *p++ = '\\';
3750                *p++ = 'U';
3751                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
3752                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
3753                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
3754                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
3755                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
3756                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
3757                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
3758                *p++ = hexdigits[ucs & 0x0000000F];
3759                continue;
3760            }
3761            /* Fall through: isolated surrogates are copied as-is */
3762            s--;
3763            size++;
3764        }
3765#endif
3766
3767        /* Map 16-bit characters to '\uxxxx' */
3768        if (ch >= 256) {
3769            *p++ = '\\';
3770            *p++ = 'u';
3771            *p++ = hexdigits[(ch >> 12) & 0x000F];
3772            *p++ = hexdigits[(ch >> 8) & 0x000F];
3773            *p++ = hexdigits[(ch >> 4) & 0x000F];
3774            *p++ = hexdigits[ch & 0x000F];
3775        }
3776
3777        /* Map special whitespace to '\t', \n', '\r' */
3778        else if (ch == '\t') {
3779            *p++ = '\\';
3780            *p++ = 't';
3781        }
3782        else if (ch == '\n') {
3783            *p++ = '\\';
3784            *p++ = 'n';
3785        }
3786        else if (ch == '\r') {
3787            *p++ = '\\';
3788            *p++ = 'r';
3789        }
3790
3791        /* Map non-printable US ASCII to '\xhh' */
3792        else if (ch < ' ' || ch >= 0x7F) {
3793            *p++ = '\\';
3794            *p++ = 'x';
3795            *p++ = hexdigits[(ch >> 4) & 0x000F];
3796            *p++ = hexdigits[ch & 0x000F];
3797        }
3798
3799        /* Copy everything else as-is */
3800        else
3801            *p++ = (char) ch;
3802    }
3803
3804    assert(p - PyBytes_AS_STRING(repr) > 0);
3805    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
3806        return NULL;
3807    return repr;
3808}
3809
3810PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
3811{
3812    PyObject *s;
3813    if (!PyUnicode_Check(unicode)) {
3814        PyErr_BadArgument();
3815        return NULL;
3816    }
3817    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
3818                                      PyUnicode_GET_SIZE(unicode));
3819    return s;
3820}
3821
3822/* --- Raw Unicode Escape Codec ------------------------------------------- */
3823
3824PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
3825                                           Py_ssize_t size,
3826                                           const char *errors)
3827{
3828    const char *starts = s;
3829    Py_ssize_t startinpos;
3830    Py_ssize_t endinpos;
3831    Py_ssize_t outpos;
3832    PyUnicodeObject *v;
3833    Py_UNICODE *p;
3834    const char *end;
3835    const char *bs;
3836    PyObject *errorHandler = NULL;
3837    PyObject *exc = NULL;
3838
3839    /* Escaped strings will always be longer than the resulting
3840       Unicode string, so we start with size here and then reduce the
3841       length after conversion to the true value. (But decoding error
3842       handler might have to resize the string) */
3843    v = _PyUnicode_New(size);
3844    if (v == NULL)
3845        goto onError;
3846    if (size == 0)
3847        return (PyObject *)v;
3848    p = PyUnicode_AS_UNICODE(v);
3849    end = s + size;
3850    while (s < end) {
3851        unsigned char c;
3852        Py_UCS4 x;
3853        int i;
3854        int count;
3855
3856        /* Non-escape characters are interpreted as Unicode ordinals */
3857        if (*s != '\\') {
3858            *p++ = (unsigned char)*s++;
3859            continue;
3860        }
3861        startinpos = s-starts;
3862
3863        /* \u-escapes are only interpreted iff the number of leading
3864           backslashes if odd */
3865        bs = s;
3866        for (;s < end;) {
3867            if (*s != '\\')
3868                break;
3869            *p++ = (unsigned char)*s++;
3870        }
3871        if (((s - bs) & 1) == 0 ||
3872            s >= end ||
3873            (*s != 'u' && *s != 'U')) {
3874            continue;
3875        }
3876        p--;
3877        count = *s=='u' ? 4 : 8;
3878        s++;
3879
3880        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
3881        outpos = p-PyUnicode_AS_UNICODE(v);
3882        for (x = 0, i = 0; i < count; ++i, ++s) {
3883            c = (unsigned char)*s;
3884            if (!ISXDIGIT(c)) {
3885                endinpos = s-starts;
3886                if (unicode_decode_call_errorhandler(
3887                        errors, &errorHandler,
3888                        "rawunicodeescape", "truncated \\uXXXX",
3889                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3890                        &v, &outpos, &p))
3891                    goto onError;
3892                goto nextByte;
3893            }
3894            x = (x<<4) & ~0xF;
3895            if (c >= '0' && c <= '9')
3896                x += c - '0';
3897            else if (c >= 'a' && c <= 'f')
3898                x += 10 + c - 'a';
3899            else
3900                x += 10 + c - 'A';
3901        }
3902        if (x <= 0xffff)
3903            /* UCS-2 character */
3904            *p++ = (Py_UNICODE) x;
3905        else if (x <= 0x10ffff) {
3906            /* UCS-4 character. Either store directly, or as
3907               surrogate pair. */
3908#ifdef Py_UNICODE_WIDE
3909            *p++ = (Py_UNICODE) x;
3910#else
3911            x -= 0x10000L;
3912            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
3913            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
3914#endif
3915        } else {
3916            endinpos = s-starts;
3917            outpos = p-PyUnicode_AS_UNICODE(v);
3918            if (unicode_decode_call_errorhandler(
3919                    errors, &errorHandler,
3920                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
3921                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3922                    &v, &outpos, &p))
3923                goto onError;
3924        }
3925      nextByte:
3926        ;
3927    }
3928    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3929        goto onError;
3930    Py_XDECREF(errorHandler);
3931    Py_XDECREF(exc);
3932    return (PyObject *)v;
3933
3934  onError:
3935    Py_XDECREF(v);
3936    Py_XDECREF(errorHandler);
3937    Py_XDECREF(exc);
3938    return NULL;
3939}
3940
3941PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
3942                                           Py_ssize_t size)
3943{
3944    PyObject *repr;
3945    char *p;
3946    char *q;
3947
3948#ifdef Py_UNICODE_WIDE
3949    const Py_ssize_t expandsize = 10;
3950#else
3951    const Py_ssize_t expandsize = 6;
3952#endif
3953
3954    if (size > PY_SSIZE_T_MAX / expandsize)
3955        return PyErr_NoMemory();
3956
3957    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
3958    if (repr == NULL)
3959        return NULL;
3960    if (size == 0)
3961        return repr;
3962
3963    p = q = PyBytes_AS_STRING(repr);
3964    while (size-- > 0) {
3965        Py_UNICODE ch = *s++;
3966#ifdef Py_UNICODE_WIDE
3967        /* Map 32-bit characters to '\Uxxxxxxxx' */
3968        if (ch >= 0x10000) {
3969            *p++ = '\\';
3970            *p++ = 'U';
3971            *p++ = hexdigits[(ch >> 28) & 0xf];
3972            *p++ = hexdigits[(ch >> 24) & 0xf];
3973            *p++ = hexdigits[(ch >> 20) & 0xf];
3974            *p++ = hexdigits[(ch >> 16) & 0xf];
3975            *p++ = hexdigits[(ch >> 12) & 0xf];
3976            *p++ = hexdigits[(ch >> 8) & 0xf];
3977            *p++ = hexdigits[(ch >> 4) & 0xf];
3978            *p++ = hexdigits[ch & 15];
3979        }
3980        else
3981#else
3982            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
3983            if (ch >= 0xD800 && ch < 0xDC00) {
3984                Py_UNICODE ch2;
3985                Py_UCS4 ucs;
3986
3987                ch2 = *s++;
3988                size--;
3989                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
3990                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
3991                    *p++ = '\\';
3992                    *p++ = 'U';
3993                    *p++ = hexdigits[(ucs >> 28) & 0xf];
3994                    *p++ = hexdigits[(ucs >> 24) & 0xf];
3995                    *p++ = hexdigits[(ucs >> 20) & 0xf];
3996                    *p++ = hexdigits[(ucs >> 16) & 0xf];
3997                    *p++ = hexdigits[(ucs >> 12) & 0xf];
3998                    *p++ = hexdigits[(ucs >> 8) & 0xf];
3999                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4000                    *p++ = hexdigits[ucs & 0xf];
4001                    continue;
4002                }
4003                /* Fall through: isolated surrogates are copied as-is */
4004                s--;
4005                size++;
4006            }
4007#endif
4008        /* Map 16-bit characters to '\uxxxx' */
4009        if (ch >= 256) {
4010            *p++ = '\\';
4011            *p++ = 'u';
4012            *p++ = hexdigits[(ch >> 12) & 0xf];
4013            *p++ = hexdigits[(ch >> 8) & 0xf];
4014            *p++ = hexdigits[(ch >> 4) & 0xf];
4015            *p++ = hexdigits[ch & 15];
4016        }
4017        /* Copy everything else as-is */
4018        else
4019            *p++ = (char) ch;
4020    }
4021    size = p - q;
4022
4023    assert(size > 0);
4024    if (_PyBytes_Resize(&repr, size) < 0)
4025        return NULL;
4026    return repr;
4027}
4028
4029PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4030{
4031    PyObject *s;
4032    if (!PyUnicode_Check(unicode)) {
4033        PyErr_BadArgument();
4034        return NULL;
4035    }
4036    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4037                                         PyUnicode_GET_SIZE(unicode));
4038
4039    return s;
4040}
4041
4042/* --- Unicode Internal Codec ------------------------------------------- */
4043
4044PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4045                                           Py_ssize_t size,
4046                                           const char *errors)
4047{
4048    const char *starts = s;
4049    Py_ssize_t startinpos;
4050    Py_ssize_t endinpos;
4051    Py_ssize_t outpos;
4052    PyUnicodeObject *v;
4053    Py_UNICODE *p;
4054    const char *end;
4055    const char *reason;
4056    PyObject *errorHandler = NULL;
4057    PyObject *exc = NULL;
4058
4059#ifdef Py_UNICODE_WIDE
4060    Py_UNICODE unimax = PyUnicode_GetMax();
4061#endif
4062
4063    /* XXX overflow detection missing */
4064    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4065    if (v == NULL)
4066        goto onError;
4067    if (PyUnicode_GetSize((PyObject *)v) == 0)
4068        return (PyObject *)v;
4069    p = PyUnicode_AS_UNICODE(v);
4070    end = s + size;
4071
4072    while (s < end) {
4073        memcpy(p, s, sizeof(Py_UNICODE));
4074        /* We have to sanity check the raw data, otherwise doom looms for
4075           some malformed UCS-4 data. */
4076        if (
4077#ifdef Py_UNICODE_WIDE
4078            *p > unimax || *p < 0 ||
4079#endif
4080            end-s < Py_UNICODE_SIZE
4081            )
4082        {
4083            startinpos = s - starts;
4084            if (end-s < Py_UNICODE_SIZE) {
4085                endinpos = end-starts;
4086                reason = "truncated input";
4087            }
4088            else {
4089                endinpos = s - starts + Py_UNICODE_SIZE;
4090                reason = "illegal code point (> 0x10FFFF)";
4091            }
4092            outpos = p - PyUnicode_AS_UNICODE(v);
4093            if (unicode_decode_call_errorhandler(
4094                    errors, &errorHandler,
4095                    "unicode_internal", reason,
4096                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4097                    &v, &outpos, &p)) {
4098                goto onError;
4099            }
4100        }
4101        else {
4102            p++;
4103            s += Py_UNICODE_SIZE;
4104        }
4105    }
4106
4107    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4108        goto onError;
4109    Py_XDECREF(errorHandler);
4110    Py_XDECREF(exc);
4111    return (PyObject *)v;
4112
4113  onError:
4114    Py_XDECREF(v);
4115    Py_XDECREF(errorHandler);
4116    Py_XDECREF(exc);
4117    return NULL;
4118}
4119
4120/* --- Latin-1 Codec ------------------------------------------------------ */
4121
4122PyObject *PyUnicode_DecodeLatin1(const char *s,
4123                                 Py_ssize_t size,
4124                                 const char *errors)
4125{
4126    PyUnicodeObject *v;
4127    Py_UNICODE *p;
4128    const char *e, *unrolled_end;
4129
4130    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4131    if (size == 1) {
4132        Py_UNICODE r = *(unsigned char*)s;
4133        return PyUnicode_FromUnicode(&r, 1);
4134    }
4135
4136    v = _PyUnicode_New(size);
4137    if (v == NULL)
4138        goto onError;
4139    if (size == 0)
4140        return (PyObject *)v;
4141    p = PyUnicode_AS_UNICODE(v);
4142    e = s + size;
4143    /* Unrolling the copy makes it much faster by reducing the looping
4144       overhead. This is similar to what many memcpy() implementations do. */
4145    unrolled_end = e - 4;
4146    while (s < unrolled_end) {
4147        p[0] = (unsigned char) s[0];
4148        p[1] = (unsigned char) s[1];
4149        p[2] = (unsigned char) s[2];
4150        p[3] = (unsigned char) s[3];
4151        s += 4;
4152        p += 4;
4153    }
4154    while (s < e)
4155        *p++ = (unsigned char) *s++;
4156    return (PyObject *)v;
4157
4158  onError:
4159    Py_XDECREF(v);
4160    return NULL;
4161}
4162
4163/* create or adjust a UnicodeEncodeError */
4164static void make_encode_exception(PyObject **exceptionObject,
4165                                  const char *encoding,
4166                                  const Py_UNICODE *unicode, Py_ssize_t size,
4167                                  Py_ssize_t startpos, Py_ssize_t endpos,
4168                                  const char *reason)
4169{
4170    if (*exceptionObject == NULL) {
4171        *exceptionObject = PyUnicodeEncodeError_Create(
4172            encoding, unicode, size, startpos, endpos, reason);
4173    }
4174    else {
4175        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4176            goto onError;
4177        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4178            goto onError;
4179        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4180            goto onError;
4181        return;
4182      onError:
4183        Py_DECREF(*exceptionObject);
4184        *exceptionObject = NULL;
4185    }
4186}
4187
4188/* raises a UnicodeEncodeError */
4189static void raise_encode_exception(PyObject **exceptionObject,
4190                                   const char *encoding,
4191                                   const Py_UNICODE *unicode, Py_ssize_t size,
4192                                   Py_ssize_t startpos, Py_ssize_t endpos,
4193                                   const char *reason)
4194{
4195    make_encode_exception(exceptionObject,
4196                          encoding, unicode, size, startpos, endpos, reason);
4197    if (*exceptionObject != NULL)
4198        PyCodec_StrictErrors(*exceptionObject);
4199}
4200
4201/* error handling callback helper:
4202   build arguments, call the callback and check the arguments,
4203   put the result into newpos and return the replacement string, which
4204   has to be freed by the caller */
4205static PyObject *unicode_encode_call_errorhandler(const char *errors,
4206                                                  PyObject **errorHandler,
4207                                                  const char *encoding, const char *reason,
4208                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4209                                                  Py_ssize_t startpos, Py_ssize_t endpos,
4210                                                  Py_ssize_t *newpos)
4211{
4212    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4213
4214    PyObject *restuple;
4215    PyObject *resunicode;
4216
4217    if (*errorHandler == NULL) {
4218        *errorHandler = PyCodec_LookupError(errors);
4219        if (*errorHandler == NULL)
4220            return NULL;
4221    }
4222
4223    make_encode_exception(exceptionObject,
4224                          encoding, unicode, size, startpos, endpos, reason);
4225    if (*exceptionObject == NULL)
4226        return NULL;
4227
4228    restuple = PyObject_CallFunctionObjArgs(
4229        *errorHandler, *exceptionObject, NULL);
4230    if (restuple == NULL)
4231        return NULL;
4232    if (!PyTuple_Check(restuple)) {
4233        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4234        Py_DECREF(restuple);
4235        return NULL;
4236    }
4237    if (!PyArg_ParseTuple(restuple, argparse,
4238                          &resunicode, newpos)) {
4239        Py_DECREF(restuple);
4240        return NULL;
4241    }
4242    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4243        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4244        Py_DECREF(restuple);
4245        return NULL;
4246    }
4247    if (*newpos<0)
4248        *newpos = size+*newpos;
4249    if (*newpos<0 || *newpos>size) {
4250        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4251        Py_DECREF(restuple);
4252        return NULL;
4253    }
4254    Py_INCREF(resunicode);
4255    Py_DECREF(restuple);
4256    return resunicode;
4257}
4258
4259static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4260                                     Py_ssize_t size,
4261                                     const char *errors,
4262                                     int limit)
4263{
4264    /* output object */
4265    PyObject *res;
4266    /* pointers to the beginning and end+1 of input */
4267    const Py_UNICODE *startp = p;
4268    const Py_UNICODE *endp = p + size;
4269    /* pointer to the beginning of the unencodable characters */
4270    /* const Py_UNICODE *badp = NULL; */
4271    /* pointer into the output */
4272    char *str;
4273    /* current output position */
4274    Py_ssize_t ressize;
4275    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4276    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4277    PyObject *errorHandler = NULL;
4278    PyObject *exc = NULL;
4279    /* the following variable is used for caching string comparisons
4280     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4281    int known_errorHandler = -1;
4282
4283    /* allocate enough for a simple encoding without
4284       replacements, if we need more, we'll resize */
4285    if (size == 0)
4286        return PyBytes_FromStringAndSize(NULL, 0);
4287    res = PyBytes_FromStringAndSize(NULL, size);
4288    if (res == NULL)
4289        return NULL;
4290    str = PyBytes_AS_STRING(res);
4291    ressize = size;
4292
4293    while (p<endp) {
4294        Py_UNICODE c = *p;
4295
4296        /* can we encode this? */
4297        if (c<limit) {
4298            /* no overflow check, because we know that the space is enough */
4299            *str++ = (char)c;
4300            ++p;
4301        }
4302        else {
4303            Py_ssize_t unicodepos = p-startp;
4304            Py_ssize_t requiredsize;
4305            PyObject *repunicode;
4306            Py_ssize_t repsize;
4307            Py_ssize_t newpos;
4308            Py_ssize_t respos;
4309            Py_UNICODE *uni2;
4310            /* startpos for collecting unencodable chars */
4311            const Py_UNICODE *collstart = p;
4312            const Py_UNICODE *collend = p;
4313            /* find all unecodable characters */
4314            while ((collend < endp) && ((*collend)>=limit))
4315                ++collend;
4316            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4317            if (known_errorHandler==-1) {
4318                if ((errors==NULL) || (!strcmp(errors, "strict")))
4319                    known_errorHandler = 1;
4320                else if (!strcmp(errors, "replace"))
4321                    known_errorHandler = 2;
4322                else if (!strcmp(errors, "ignore"))
4323                    known_errorHandler = 3;
4324                else if (!strcmp(errors, "xmlcharrefreplace"))
4325                    known_errorHandler = 4;
4326                else
4327                    known_errorHandler = 0;
4328            }
4329            switch (known_errorHandler) {
4330            case 1: /* strict */
4331                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4332                goto onError;
4333            case 2: /* replace */
4334                while (collstart++<collend)
4335                    *str++ = '?'; /* fall through */
4336            case 3: /* ignore */
4337                p = collend;
4338                break;
4339            case 4: /* xmlcharrefreplace */
4340                respos = str - PyBytes_AS_STRING(res);
4341                /* determine replacement size (temporarily (mis)uses p) */
4342                for (p = collstart, repsize = 0; p < collend; ++p) {
4343                    if (*p<10)
4344                        repsize += 2+1+1;
4345                    else if (*p<100)
4346                        repsize += 2+2+1;
4347                    else if (*p<1000)
4348                        repsize += 2+3+1;
4349                    else if (*p<10000)
4350                        repsize += 2+4+1;
4351#ifndef Py_UNICODE_WIDE
4352                    else
4353                        repsize += 2+5+1;
4354#else
4355                    else if (*p<100000)
4356                        repsize += 2+5+1;
4357                    else if (*p<1000000)
4358                        repsize += 2+6+1;
4359                    else
4360                        repsize += 2+7+1;
4361#endif
4362                }
4363                requiredsize = respos+repsize+(endp-collend);
4364                if (requiredsize > ressize) {
4365                    if (requiredsize<2*ressize)
4366                        requiredsize = 2*ressize;
4367                    if (_PyBytes_Resize(&res, requiredsize))
4368                        goto onError;
4369                    str = PyBytes_AS_STRING(res) + respos;
4370                    ressize = requiredsize;
4371                }
4372                /* generate replacement (temporarily (mis)uses p) */
4373                for (p = collstart; p < collend; ++p) {
4374                    str += sprintf(str, "&#%d;", (int)*p);
4375                }
4376                p = collend;
4377                break;
4378            default:
4379                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4380                                                              encoding, reason, startp, size, &exc,
4381                                                              collstart-startp, collend-startp, &newpos);
4382                if (repunicode == NULL)
4383                    goto onError;
4384                if (PyBytes_Check(repunicode)) {
4385                    /* Directly copy bytes result to output. */
4386                    repsize = PyBytes_Size(repunicode);
4387                    if (repsize > 1) {
4388                        /* Make room for all additional bytes. */
4389                        respos = str - PyBytes_AS_STRING(res);
4390                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4391                            Py_DECREF(repunicode);
4392                            goto onError;
4393                        }
4394                        str = PyBytes_AS_STRING(res) + respos;
4395                        ressize += repsize-1;
4396                    }
4397                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4398                    str += repsize;
4399                    p = startp + newpos;
4400                    Py_DECREF(repunicode);
4401                    break;
4402                }
4403                /* need more space? (at least enough for what we
4404                   have+the replacement+the rest of the string, so
4405                   we won't have to check space for encodable characters) */
4406                respos = str - PyBytes_AS_STRING(res);
4407                repsize = PyUnicode_GET_SIZE(repunicode);
4408                requiredsize = respos+repsize+(endp-collend);
4409                if (requiredsize > ressize) {
4410                    if (requiredsize<2*ressize)
4411                        requiredsize = 2*ressize;
4412                    if (_PyBytes_Resize(&res, requiredsize)) {
4413                        Py_DECREF(repunicode);
4414                        goto onError;
4415                    }
4416                    str = PyBytes_AS_STRING(res) + respos;
4417                    ressize = requiredsize;
4418                }
4419                /* check if there is anything unencodable in the replacement
4420                   and copy it to the output */
4421                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4422                    c = *uni2;
4423                    if (c >= limit) {
4424                        raise_encode_exception(&exc, encoding, startp, size,
4425                                               unicodepos, unicodepos+1, reason);
4426                        Py_DECREF(repunicode);
4427                        goto onError;
4428                    }
4429                    *str = (char)c;
4430                }
4431                p = startp + newpos;
4432                Py_DECREF(repunicode);
4433            }
4434        }
4435    }
4436    /* Resize if we allocated to much */
4437    size = str - PyBytes_AS_STRING(res);
4438    if (size < ressize) { /* If this falls res will be NULL */
4439        assert(size >= 0);
4440        if (_PyBytes_Resize(&res, size) < 0)
4441            goto onError;
4442    }
4443
4444    Py_XDECREF(errorHandler);
4445    Py_XDECREF(exc);
4446    return res;
4447
4448  onError:
4449    Py_XDECREF(res);
4450    Py_XDECREF(errorHandler);
4451    Py_XDECREF(exc);
4452    return NULL;
4453}
4454
4455PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4456                                 Py_ssize_t size,
4457                                 const char *errors)
4458{
4459    return unicode_encode_ucs1(p, size, errors, 256);
4460}
4461
4462PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4463{
4464    if (!PyUnicode_Check(unicode)) {
4465        PyErr_BadArgument();
4466        return NULL;
4467    }
4468    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4469                                  PyUnicode_GET_SIZE(unicode),
4470                                  NULL);
4471}
4472
4473/* --- 7-bit ASCII Codec -------------------------------------------------- */
4474
4475PyObject *PyUnicode_DecodeASCII(const char *s,
4476                                Py_ssize_t size,
4477                                const char *errors)
4478{
4479    const char *starts = s;
4480    PyUnicodeObject *v;
4481    Py_UNICODE *p;
4482    Py_ssize_t startinpos;
4483    Py_ssize_t endinpos;
4484    Py_ssize_t outpos;
4485    const char *e;
4486    PyObject *errorHandler = NULL;
4487    PyObject *exc = NULL;
4488
4489    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4490    if (size == 1 && *(unsigned char*)s < 128) {
4491        Py_UNICODE r = *(unsigned char*)s;
4492        return PyUnicode_FromUnicode(&r, 1);
4493    }
4494
4495    v = _PyUnicode_New(size);
4496    if (v == NULL)
4497        goto onError;
4498    if (size == 0)
4499        return (PyObject *)v;
4500    p = PyUnicode_AS_UNICODE(v);
4501    e = s + size;
4502    while (s < e) {
4503        register unsigned char c = (unsigned char)*s;
4504        if (c < 128) {
4505            *p++ = c;
4506            ++s;
4507        }
4508        else {
4509            startinpos = s-starts;
4510            endinpos = startinpos + 1;
4511            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4512            if (unicode_decode_call_errorhandler(
4513                    errors, &errorHandler,
4514                    "ascii", "ordinal not in range(128)",
4515                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4516                    &v, &outpos, &p))
4517                goto onError;
4518        }
4519    }
4520    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4521        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4522            goto onError;
4523    Py_XDECREF(errorHandler);
4524    Py_XDECREF(exc);
4525    return (PyObject *)v;
4526
4527  onError:
4528    Py_XDECREF(v);
4529    Py_XDECREF(errorHandler);
4530    Py_XDECREF(exc);
4531    return NULL;
4532}
4533
4534PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4535                                Py_ssize_t size,
4536                                const char *errors)
4537{
4538    return unicode_encode_ucs1(p, size, errors, 128);
4539}
4540
4541PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4542{
4543    if (!PyUnicode_Check(unicode)) {
4544        PyErr_BadArgument();
4545        return NULL;
4546    }
4547    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4548                                 PyUnicode_GET_SIZE(unicode),
4549                                 NULL);
4550}
4551
4552#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4553
4554/* --- MBCS codecs for Windows -------------------------------------------- */
4555
4556#if SIZEOF_INT < SIZEOF_SIZE_T
4557#define NEED_RETRY
4558#endif
4559
4560/* XXX This code is limited to "true" double-byte encodings, as
4561   a) it assumes an incomplete character consists of a single byte, and
4562   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4563   encodings, see IsDBCSLeadByteEx documentation. */
4564
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.  The byte must be flagged by IsDBCSLeadByte(), and
   in addition one of the following must hold for the character that
   CharPrev() reports before it: there is none (CharPrev returns the same
   position), its first byte is not itself a lead byte, or it is two
   bytes long. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    if (prev == curr)
        return 1;
    if (!IsDBCSLeadByte(*prev))
        return 1;
    return curr - prev == 2;
}
4575
4576/*
4577 * Decode MBCS string into unicode object. If 'final' is set, converts
4578 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4579 */
4580static int decode_mbcs(PyUnicodeObject **v,
4581                       const char *s, /* MBCS string */
4582                       int size, /* sizeof MBCS string */
4583                       int final,
4584                       const char *errors)
4585{
4586    Py_UNICODE *p;
4587    Py_ssize_t n;
4588    DWORD usize;
4589    DWORD flags;
4590
4591    assert(size >= 0);
4592
4593    /* check and handle 'errors' arg */
4594    if (errors==NULL || strcmp(errors, "strict")==0)
4595        flags = MB_ERR_INVALID_CHARS;
4596    else if (strcmp(errors, "ignore")==0)
4597        flags = 0;
4598    else {
4599        PyErr_Format(PyExc_ValueError,
4600                     "mbcs encoding does not support errors='%s'",
4601                     errors);
4602        return -1;
4603    }
4604
4605    /* Skip trailing lead-byte unless 'final' is set */
4606    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4607        --size;
4608
4609    /* First get the size of the result */
4610    if (size > 0) {
4611        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4612        if (usize==0)
4613            goto mbcs_decode_error;
4614    } else
4615        usize = 0;
4616
4617    if (*v == NULL) {
4618        /* Create unicode object */
4619        *v = _PyUnicode_New(usize);
4620        if (*v == NULL)
4621            return -1;
4622        n = 0;
4623    }
4624    else {
4625        /* Extend unicode object */
4626        n = PyUnicode_GET_SIZE(*v);
4627        if (_PyUnicode_Resize(v, n + usize) < 0)
4628            return -1;
4629    }
4630
4631    /* Do the conversion */
4632    if (usize > 0) {
4633        p = PyUnicode_AS_UNICODE(*v) + n;
4634        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4635            goto mbcs_decode_error;
4636        }
4637    }
4638    return size;
4639
4640mbcs_decode_error:
4641    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4642       we raise a UnicodeDecodeError - else it is a 'generic'
4643       windows error
4644     */
4645    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4646        /* Ideally, we should get reason from FormatMessage - this
4647           is the Windows 2000 English version of the message
4648        */
4649        PyObject *exc = NULL;
4650        const char *reason = "No mapping for the Unicode character exists "
4651                             "in the target multi-byte code page.";
4652        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4653        if (exc != NULL) {
4654            PyCodec_StrictErrors(exc);
4655            Py_DECREF(exc);
4656        }
4657    } else {
4658        PyErr_SetFromWindowsErrWithFilename(0, NULL);
4659    }
4660    return -1;
4661}
4662
4663PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
4664                                       Py_ssize_t size,
4665                                       const char *errors,
4666                                       Py_ssize_t *consumed)
4667{
4668    PyUnicodeObject *v = NULL;
4669    int done;
4670
4671    if (consumed)
4672        *consumed = 0;
4673
4674#ifdef NEED_RETRY
4675  retry:
4676    if (size > INT_MAX)
4677        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
4678    else
4679#endif
4680        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
4681
4682    if (done < 0) {
4683        Py_XDECREF(v);
4684        return NULL;
4685    }
4686
4687    if (consumed)
4688        *consumed += done;
4689
4690#ifdef NEED_RETRY
4691    if (size > INT_MAX) {
4692        s += done;
4693        size -= done;
4694        goto retry;
4695    }
4696#endif
4697
4698    return (PyObject *)v;
4699}
4700
4701PyObject *PyUnicode_DecodeMBCS(const char *s,
4702                               Py_ssize_t size,
4703                               const char *errors)
4704{
4705    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
4706}
4707
4708/*
4709 * Convert unicode into string object (MBCS).
4710 * Returns 0 if succeed, -1 otherwise.
4711 */
4712static int encode_mbcs(PyObject **repr,
4713                       const Py_UNICODE *p, /* unicode */
4714                       int size, /* size of unicode */
4715                       const char* errors)
4716{
4717    BOOL usedDefaultChar = FALSE;
4718    BOOL *pusedDefaultChar;
4719    int mbcssize;
4720    Py_ssize_t n;
4721    PyObject *exc = NULL;
4722    DWORD flags;
4723
4724    assert(size >= 0);
4725
4726    /* check and handle 'errors' arg */
4727    if (errors==NULL || strcmp(errors, "strict")==0) {
4728        flags = WC_NO_BEST_FIT_CHARS;
4729        pusedDefaultChar = &usedDefaultChar;
4730    } else if (strcmp(errors, "replace")==0) {
4731        flags = 0;
4732        pusedDefaultChar = NULL;
4733    } else {
4734         PyErr_Format(PyExc_ValueError,
4735                      "mbcs encoding does not support errors='%s'",
4736                      errors);
4737         return -1;
4738    }
4739
4740    /* First get the size of the result */
4741    if (size > 0) {
4742        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
4743                                       NULL, pusedDefaultChar);
4744        if (mbcssize == 0) {
4745            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4746            return -1;
4747        }
4748        /* If we used a default char, then we failed! */
4749        if (pusedDefaultChar && *pusedDefaultChar)
4750            goto mbcs_encode_error;
4751    } else {
4752        mbcssize = 0;
4753    }
4754
4755    if (*repr == NULL) {
4756        /* Create string object */
4757        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
4758        if (*repr == NULL)
4759            return -1;
4760        n = 0;
4761    }
4762    else {
4763        /* Extend string object */
4764        n = PyBytes_Size(*repr);
4765        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
4766            return -1;
4767    }
4768
4769    /* Do the conversion */
4770    if (size > 0) {
4771        char *s = PyBytes_AS_STRING(*repr) + n;
4772        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
4773                                     NULL, pusedDefaultChar)) {
4774            PyErr_SetFromWindowsErrWithFilename(0, NULL);
4775            return -1;
4776        }
4777        if (pusedDefaultChar && *pusedDefaultChar)
4778            goto mbcs_encode_error;
4779    }
4780    return 0;
4781
4782mbcs_encode_error:
4783    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
4784    Py_XDECREF(exc);
4785    return -1;
4786}
4787
4788PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
4789                               Py_ssize_t size,
4790                               const char *errors)
4791{
4792    PyObject *repr = NULL;
4793    int ret;
4794
4795#ifdef NEED_RETRY
4796  retry:
4797    if (size > INT_MAX)
4798        ret = encode_mbcs(&repr, p, INT_MAX, errors);
4799    else
4800#endif
4801        ret = encode_mbcs(&repr, p, (int)size, errors);
4802
4803    if (ret < 0) {
4804        Py_XDECREF(repr);
4805        return NULL;
4806    }
4807
4808#ifdef NEED_RETRY
4809    if (size > INT_MAX) {
4810        p += INT_MAX;
4811        size -= INT_MAX;
4812        goto retry;
4813    }
4814#endif
4815
4816    return repr;
4817}
4818
4819PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
4820{
4821    if (!PyUnicode_Check(unicode)) {
4822        PyErr_BadArgument();
4823        return NULL;
4824    }
4825    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
4826                                PyUnicode_GET_SIZE(unicode),
4827                                NULL);
4828}
4829
4830#undef NEED_RETRY
4831
4832#endif /* MS_WINDOWS */
4833
4834/* --- Character Mapping Codec -------------------------------------------- */
4835
4836PyObject *PyUnicode_DecodeCharmap(const char *s,
4837                                  Py_ssize_t size,
4838                                  PyObject *mapping,
4839                                  const char *errors)
4840{
4841    const char *starts = s;
4842    Py_ssize_t startinpos;
4843    Py_ssize_t endinpos;
4844    Py_ssize_t outpos;
4845    const char *e;
4846    PyUnicodeObject *v;
4847    Py_UNICODE *p;
4848    Py_ssize_t extrachars = 0;
4849    PyObject *errorHandler = NULL;
4850    PyObject *exc = NULL;
4851    Py_UNICODE *mapstring = NULL;
4852    Py_ssize_t maplen = 0;
4853
4854    /* Default to Latin-1 */
4855    if (mapping == NULL)
4856        return PyUnicode_DecodeLatin1(s, size, errors);
4857
4858    v = _PyUnicode_New(size);
4859    if (v == NULL)
4860        goto onError;
4861    if (size == 0)
4862        return (PyObject *)v;
4863    p = PyUnicode_AS_UNICODE(v);
4864    e = s + size;
4865    if (PyUnicode_CheckExact(mapping)) {
4866        mapstring = PyUnicode_AS_UNICODE(mapping);
4867        maplen = PyUnicode_GET_SIZE(mapping);
4868        while (s < e) {
4869            unsigned char ch = *s;
4870            Py_UNICODE x = 0xfffe; /* illegal value */
4871
4872            if (ch < maplen)
4873                x = mapstring[ch];
4874
4875            if (x == 0xfffe) {
4876                /* undefined mapping */
4877                outpos = p-PyUnicode_AS_UNICODE(v);
4878                startinpos = s-starts;
4879                endinpos = startinpos+1;
4880                if (unicode_decode_call_errorhandler(
4881                        errors, &errorHandler,
4882                        "charmap", "character maps to <undefined>",
4883                        &starts, &e, &startinpos, &endinpos, &exc, &s,
4884                        &v, &outpos, &p)) {
4885                    goto onError;
4886                }
4887                continue;
4888            }
4889            *p++ = x;
4890            ++s;
4891        }
4892    }
4893    else {
4894        while (s < e) {
4895            unsigned char ch = *s;
4896            PyObject *w, *x;
4897
4898            /* Get mapping (char ordinal -> integer, Unicode char or None) */
4899            w = PyLong_FromLong((long)ch);
4900            if (w == NULL)
4901                goto onError;
4902            x = PyObject_GetItem(mapping, w);
4903            Py_DECREF(w);
4904            if (x == NULL) {
4905                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
4906                    /* No mapping found means: mapping is undefined. */
4907                    PyErr_Clear();
4908                    x = Py_None;
4909                    Py_INCREF(x);
4910                } else
4911                    goto onError;
4912            }
4913
4914            /* Apply mapping */
4915            if (PyLong_Check(x)) {
4916                long value = PyLong_AS_LONG(x);
4917                if (value < 0 || value > 65535) {
4918                    PyErr_SetString(PyExc_TypeError,
4919                                    "character mapping must be in range(65536)");
4920                    Py_DECREF(x);
4921                    goto onError;
4922                }
4923                *p++ = (Py_UNICODE)value;
4924            }
4925            else if (x == Py_None) {
4926                /* undefined mapping */
4927                outpos = p-PyUnicode_AS_UNICODE(v);
4928                startinpos = s-starts;
4929                endinpos = startinpos+1;
4930                if (unicode_decode_call_errorhandler(
4931                        errors, &errorHandler,
4932                        "charmap", "character maps to <undefined>",
4933                        &starts, &e, &startinpos, &endinpos, &exc, &s,
4934                        &v, &outpos, &p)) {
4935                    Py_DECREF(x);
4936                    goto onError;
4937                }
4938                Py_DECREF(x);
4939                continue;
4940            }
4941            else if (PyUnicode_Check(x)) {
4942                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
4943
4944                if (targetsize == 1)
4945                    /* 1-1 mapping */
4946                    *p++ = *PyUnicode_AS_UNICODE(x);
4947
4948                else if (targetsize > 1) {
4949                    /* 1-n mapping */
4950                    if (targetsize > extrachars) {
4951                        /* resize first */
4952                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
4953                        Py_ssize_t needed = (targetsize - extrachars) + \
4954                            (targetsize << 2);
4955                        extrachars += needed;
4956                        /* XXX overflow detection missing */
4957                        if (_PyUnicode_Resize(&v,
4958                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
4959                            Py_DECREF(x);
4960                            goto onError;
4961                        }
4962                        p = PyUnicode_AS_UNICODE(v) + oldpos;
4963                    }
4964                    Py_UNICODE_COPY(p,
4965                                    PyUnicode_AS_UNICODE(x),
4966                                    targetsize);
4967                    p += targetsize;
4968                    extrachars -= targetsize;
4969                }
4970                /* 1-0 mapping: skip the character */
4971            }
4972            else {
4973                /* wrong return value */
4974                PyErr_SetString(PyExc_TypeError,
4975                                "character mapping must return integer, None or str");
4976                Py_DECREF(x);
4977                goto onError;
4978            }
4979            Py_DECREF(x);
4980            ++s;
4981        }
4982    }
4983    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4984        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4985            goto onError;
4986    Py_XDECREF(errorHandler);
4987    Py_XDECREF(exc);
4988    return (PyObject *)v;
4989
4990  onError:
4991    Py_XDECREF(errorHandler);
4992    Py_XDECREF(exc);
4993    Py_XDECREF(v);
4994    return NULL;
4995}
4996
4997/* Charmap encoding: the lookup table */
4998
4999struct encoding_map{
5000    PyObject_HEAD
5001    unsigned char level1[32];
5002    int count2, count3;
5003    unsigned char level23[1];
5004};
5005
5006static PyObject*
5007encoding_map_size(PyObject *obj, PyObject* args)
5008{
5009    struct encoding_map *map = (struct encoding_map*)obj;
5010    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5011                           128*map->count3);
5012}
5013
5014static PyMethodDef encoding_map_methods[] = {
5015    {"size", encoding_map_size, METH_NOARGS,
5016     PyDoc_STR("Return the size (in bytes) of this object") },
5017    { 0 }
5018};
5019
5020static void
5021encoding_map_dealloc(PyObject* o)
5022{
5023    PyObject_FREE(o);
5024}
5025
5026static PyTypeObject EncodingMapType = {
5027    PyVarObject_HEAD_INIT(NULL, 0)
5028    "EncodingMap",          /*tp_name*/
5029    sizeof(struct encoding_map),   /*tp_basicsize*/
5030    0,                      /*tp_itemsize*/
5031    /* methods */
5032    encoding_map_dealloc,   /*tp_dealloc*/
5033    0,                      /*tp_print*/
5034    0,                      /*tp_getattr*/
5035    0,                      /*tp_setattr*/
5036    0,                      /*tp_reserved*/
5037    0,                      /*tp_repr*/
5038    0,                      /*tp_as_number*/
5039    0,                      /*tp_as_sequence*/
5040    0,                      /*tp_as_mapping*/
5041    0,                      /*tp_hash*/
5042    0,                      /*tp_call*/
5043    0,                      /*tp_str*/
5044    0,                      /*tp_getattro*/
5045    0,                      /*tp_setattro*/
5046    0,                      /*tp_as_buffer*/
5047    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
5048    0,                      /*tp_doc*/
5049    0,                      /*tp_traverse*/
5050    0,                      /*tp_clear*/
5051    0,                      /*tp_richcompare*/
5052    0,                      /*tp_weaklistoffset*/
5053    0,                      /*tp_iter*/
5054    0,                      /*tp_iternext*/
5055    encoding_map_methods,   /*tp_methods*/
5056    0,                      /*tp_members*/
5057    0,                      /*tp_getset*/
5058    0,                      /*tp_base*/
5059    0,                      /*tp_dict*/
5060    0,                      /*tp_descr_get*/
5061    0,                      /*tp_descr_set*/
5062    0,                      /*tp_dictoffset*/
5063    0,                      /*tp_init*/
5064    0,                      /*tp_alloc*/
5065    0,                      /*tp_new*/
5066    0,                      /*tp_free*/
5067    0,                      /*tp_is_gc*/
5068};
5069
5070PyObject*
5071PyUnicode_BuildEncodingMap(PyObject* string)
5072{
5073    Py_UNICODE *decode;
5074    PyObject *result;
5075    struct encoding_map *mresult;
5076    int i;
5077    int need_dict = 0;
5078    unsigned char level1[32];
5079    unsigned char level2[512];
5080    unsigned char *mlevel1, *mlevel2, *mlevel3;
5081    int count2 = 0, count3 = 0;
5082
5083    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5084        PyErr_BadArgument();
5085        return NULL;
5086    }
5087    decode = PyUnicode_AS_UNICODE(string);
5088    memset(level1, 0xFF, sizeof level1);
5089    memset(level2, 0xFF, sizeof level2);
5090
5091    /* If there isn't a one-to-one mapping of NULL to \0,
5092       or if there are non-BMP characters, we need to use
5093       a mapping dictionary. */
5094    if (decode[0] != 0)
5095        need_dict = 1;
5096    for (i = 1; i < 256; i++) {
5097        int l1, l2;
5098        if (decode[i] == 0
5099#ifdef Py_UNICODE_WIDE
5100            || decode[i] > 0xFFFF
5101#endif
5102            ) {
5103            need_dict = 1;
5104            break;
5105        }
5106        if (decode[i] == 0xFFFE)
5107            /* unmapped character */
5108            continue;
5109        l1 = decode[i] >> 11;
5110        l2 = decode[i] >> 7;
5111        if (level1[l1] == 0xFF)
5112            level1[l1] = count2++;
5113        if (level2[l2] == 0xFF)
5114            level2[l2] = count3++;
5115    }
5116
5117    if (count2 >= 0xFF || count3 >= 0xFF)
5118        need_dict = 1;
5119
5120    if (need_dict) {
5121        PyObject *result = PyDict_New();
5122        PyObject *key, *value;
5123        if (!result)
5124            return NULL;
5125        for (i = 0; i < 256; i++) {
5126            key = value = NULL;
5127            key = PyLong_FromLong(decode[i]);
5128            value = PyLong_FromLong(i);
5129            if (!key || !value)
5130                goto failed1;
5131            if (PyDict_SetItem(result, key, value) == -1)
5132                goto failed1;
5133            Py_DECREF(key);
5134            Py_DECREF(value);
5135        }
5136        return result;
5137      failed1:
5138        Py_XDECREF(key);
5139        Py_XDECREF(value);
5140        Py_DECREF(result);
5141        return NULL;
5142    }
5143
5144    /* Create a three-level trie */
5145    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5146                             16*count2 + 128*count3 - 1);
5147    if (!result)
5148        return PyErr_NoMemory();
5149    PyObject_Init(result, &EncodingMapType);
5150    mresult = (struct encoding_map*)result;
5151    mresult->count2 = count2;
5152    mresult->count3 = count3;
5153    mlevel1 = mresult->level1;
5154    mlevel2 = mresult->level23;
5155    mlevel3 = mresult->level23 + 16*count2;
5156    memcpy(mlevel1, level1, 32);
5157    memset(mlevel2, 0xFF, 16*count2);
5158    memset(mlevel3, 0, 128*count3);
5159    count3 = 0;
5160    for (i = 1; i < 256; i++) {
5161        int o1, o2, o3, i2, i3;
5162        if (decode[i] == 0xFFFE)
5163            /* unmapped character */
5164            continue;
5165        o1 = decode[i]>>11;
5166        o2 = (decode[i]>>7) & 0xF;
5167        i2 = 16*mlevel1[o1] + o2;
5168        if (mlevel2[i2] == 0xFF)
5169            mlevel2[i2] = count3++;
5170        o3 = decode[i] & 0x7F;
5171        i3 = 128*mlevel2[i2] + o3;
5172        mlevel3[i3] = i;
5173    }
5174    return result;
5175}
5176
5177static int
5178encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5179{
5180    struct encoding_map *map = (struct encoding_map*)mapping;
5181    int l1 = c>>11;
5182    int l2 = (c>>7) & 0xF;
5183    int l3 = c & 0x7F;
5184    int i;
5185
5186#ifdef Py_UNICODE_WIDE
5187    if (c > 0xFFFF) {
5188        return -1;
5189    }
5190#endif
5191    if (c == 0)
5192        return 0;
5193    /* level 1*/
5194    i = map->level1[l1];
5195    if (i == 0xFF) {
5196        return -1;
5197    }
5198    /* level 2*/
5199    i = map->level23[16*i+l2];
5200    if (i == 0xFF) {
5201        return -1;
5202    }
5203    /* level 3 */
5204    i = map->level23[16*map->count2 + 128*i + l3];
5205    if (i == 0) {
5206        return -1;
5207    }
5208    return i;
5209}
5210
5211/* Lookup the character ch in the mapping. If the character
5212   can't be found, Py_None is returned (or NULL, if another
5213   error occurred). */
5214static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5215{
5216    PyObject *w = PyLong_FromLong((long)c);
5217    PyObject *x;
5218
5219    if (w == NULL)
5220        return NULL;
5221    x = PyObject_GetItem(mapping, w);
5222    Py_DECREF(w);
5223    if (x == NULL) {
5224        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5225            /* No mapping found means: mapping is undefined. */
5226            PyErr_Clear();
5227            x = Py_None;
5228            Py_INCREF(x);
5229            return x;
5230        } else
5231            return NULL;
5232    }
5233    else if (x == Py_None)
5234        return x;
5235    else if (PyLong_Check(x)) {
5236        long value = PyLong_AS_LONG(x);
5237        if (value < 0 || value > 255) {
5238            PyErr_SetString(PyExc_TypeError,
5239                            "character mapping must be in range(256)");
5240            Py_DECREF(x);
5241            return NULL;
5242        }
5243        return x;
5244    }
5245    else if (PyBytes_Check(x))
5246        return x;
5247    else {
5248        /* wrong return value */
5249        PyErr_Format(PyExc_TypeError,
5250                     "character mapping must return integer, bytes or None, not %.400s",
5251                     x->ob_type->tp_name);
5252        Py_DECREF(x);
5253        return NULL;
5254    }
5255}
5256
5257static int
5258charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5259{
5260    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5261    /* exponentially overallocate to minimize reallocations */
5262    if (requiredsize < 2*outsize)
5263        requiredsize = 2*outsize;
5264    if (_PyBytes_Resize(outobj, requiredsize))
5265        return -1;
5266    return 0;
5267}
5268
/* Outcome of trying to encode one character through a charmap. */
typedef enum charmapencode_result {
    enc_SUCCESS,    /* the character was written to the output */
    enc_FAILED,     /* the mapping has no entry for the character */
    enc_EXCEPTION   /* an error occurred while looking up or writing */
} charmapencode_result;
5272/* lookup the character, put the result in the output string and adjust
5273   various state variables. Resize the output bytes object if not enough
5274   space is available. Return a new reference to the object that
5275   was put in the output buffer, or Py_None, if the mapping was undefined
5276   (in which case no character was written) or NULL, if a
5277   reallocation error occurred. The caller must decref the result */
5278static
5279charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5280                                          PyObject **outobj, Py_ssize_t *outpos)
5281{
5282    PyObject *rep;
5283    char *outstart;
5284    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5285
5286    if (Py_TYPE(mapping) == &EncodingMapType) {
5287        int res = encoding_map_lookup(c, mapping);
5288        Py_ssize_t requiredsize = *outpos+1;
5289        if (res == -1)
5290            return enc_FAILED;
5291        if (outsize<requiredsize)
5292            if (charmapencode_resize(outobj, outpos, requiredsize))
5293                return enc_EXCEPTION;
5294        outstart = PyBytes_AS_STRING(*outobj);
5295        outstart[(*outpos)++] = (char)res;
5296        return enc_SUCCESS;
5297    }
5298
5299    rep = charmapencode_lookup(c, mapping);
5300    if (rep==NULL)
5301        return enc_EXCEPTION;
5302    else if (rep==Py_None) {
5303        Py_DECREF(rep);
5304        return enc_FAILED;
5305    } else {
5306        if (PyLong_Check(rep)) {
5307            Py_ssize_t requiredsize = *outpos+1;
5308            if (outsize<requiredsize)
5309                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5310                    Py_DECREF(rep);
5311                    return enc_EXCEPTION;
5312                }
5313            outstart = PyBytes_AS_STRING(*outobj);
5314            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5315        }
5316        else {
5317            const char *repchars = PyBytes_AS_STRING(rep);
5318            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5319            Py_ssize_t requiredsize = *outpos+repsize;
5320            if (outsize<requiredsize)
5321                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5322                    Py_DECREF(rep);
5323                    return enc_EXCEPTION;
5324                }
5325            outstart = PyBytes_AS_STRING(*outobj);
5326            memcpy(outstart + *outpos, repchars, repsize);
5327            *outpos += repsize;
5328        }
5329    }
5330    Py_DECREF(rep);
5331    return enc_SUCCESS;
5332}
5333
5334/* handle an error in PyUnicode_EncodeCharmap
5335   Return 0 on success, -1 on error */
5336static
5337int charmap_encoding_error(
5338    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5339    PyObject **exceptionObject,
5340    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5341    PyObject **res, Py_ssize_t *respos)
5342{
5343    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5344    Py_ssize_t repsize;
5345    Py_ssize_t newpos;
5346    Py_UNICODE *uni2;
5347    /* startpos for collecting unencodable chars */
5348    Py_ssize_t collstartpos = *inpos;
5349    Py_ssize_t collendpos = *inpos+1;
5350    Py_ssize_t collpos;
5351    char *encoding = "charmap";
5352    char *reason = "character maps to <undefined>";
5353    charmapencode_result x;
5354
5355    /* find all unencodable characters */
5356    while (collendpos < size) {
5357        PyObject *rep;
5358        if (Py_TYPE(mapping) == &EncodingMapType) {
5359            int res = encoding_map_lookup(p[collendpos], mapping);
5360            if (res != -1)
5361                break;
5362            ++collendpos;
5363            continue;
5364        }
5365
5366        rep = charmapencode_lookup(p[collendpos], mapping);
5367        if (rep==NULL)
5368            return -1;
5369        else if (rep!=Py_None) {
5370            Py_DECREF(rep);
5371            break;
5372        }
5373        Py_DECREF(rep);
5374        ++collendpos;
5375    }
5376    /* cache callback name lookup
5377     * (if not done yet, i.e. it's the first error) */
5378    if (*known_errorHandler==-1) {
5379        if ((errors==NULL) || (!strcmp(errors, "strict")))
5380            *known_errorHandler = 1;
5381        else if (!strcmp(errors, "replace"))
5382            *known_errorHandler = 2;
5383        else if (!strcmp(errors, "ignore"))
5384            *known_errorHandler = 3;
5385        else if (!strcmp(errors, "xmlcharrefreplace"))
5386            *known_errorHandler = 4;
5387        else
5388            *known_errorHandler = 0;
5389    }
5390    switch (*known_errorHandler) {
5391    case 1: /* strict */
5392        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5393        return -1;
5394    case 2: /* replace */
5395        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5396            x = charmapencode_output('?', mapping, res, respos);
5397            if (x==enc_EXCEPTION) {
5398                return -1;
5399            }
5400            else if (x==enc_FAILED) {
5401                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5402                return -1;
5403            }
5404        }
5405        /* fall through */
5406    case 3: /* ignore */
5407        *inpos = collendpos;
5408        break;
5409    case 4: /* xmlcharrefreplace */
5410        /* generate replacement (temporarily (mis)uses p) */
5411        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5412            char buffer[2+29+1+1];
5413            char *cp;
5414            sprintf(buffer, "&#%d;", (int)p[collpos]);
5415            for (cp = buffer; *cp; ++cp) {
5416                x = charmapencode_output(*cp, mapping, res, respos);
5417                if (x==enc_EXCEPTION)
5418                    return -1;
5419                else if (x==enc_FAILED) {
5420                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5421                    return -1;
5422                }
5423            }
5424        }
5425        *inpos = collendpos;
5426        break;
5427    default:
5428        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5429                                                      encoding, reason, p, size, exceptionObject,
5430                                                      collstartpos, collendpos, &newpos);
5431        if (repunicode == NULL)
5432            return -1;
5433        if (PyBytes_Check(repunicode)) {
5434            /* Directly copy bytes result to output. */
5435            Py_ssize_t outsize = PyBytes_Size(*res);
5436            Py_ssize_t requiredsize;
5437            repsize = PyBytes_Size(repunicode);
5438            requiredsize = *respos + repsize;
5439            if (requiredsize > outsize)
5440                /* Make room for all additional bytes. */
5441                if (charmapencode_resize(res, respos, requiredsize)) {
5442                    Py_DECREF(repunicode);
5443                    return -1;
5444                }
5445            memcpy(PyBytes_AsString(*res) + *respos,
5446                   PyBytes_AsString(repunicode),  repsize);
5447            *respos += repsize;
5448            *inpos = newpos;
5449            Py_DECREF(repunicode);
5450            break;
5451        }
5452        /* generate replacement  */
5453        repsize = PyUnicode_GET_SIZE(repunicode);
5454        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5455            x = charmapencode_output(*uni2, mapping, res, respos);
5456            if (x==enc_EXCEPTION) {
5457                return -1;
5458            }
5459            else if (x==enc_FAILED) {
5460                Py_DECREF(repunicode);
5461                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5462                return -1;
5463            }
5464        }
5465        *inpos = newpos;
5466        Py_DECREF(repunicode);
5467    }
5468    return 0;
5469}
5470
5471PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5472                                  Py_ssize_t size,
5473                                  PyObject *mapping,
5474                                  const char *errors)
5475{
5476    /* output object */
5477    PyObject *res = NULL;
5478    /* current input position */
5479    Py_ssize_t inpos = 0;
5480    /* current output position */
5481    Py_ssize_t respos = 0;
5482    PyObject *errorHandler = NULL;
5483    PyObject *exc = NULL;
5484    /* the following variable is used for caching string comparisons
5485     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5486     * 3=ignore, 4=xmlcharrefreplace */
5487    int known_errorHandler = -1;
5488
5489    /* Default to Latin-1 */
5490    if (mapping == NULL)
5491        return PyUnicode_EncodeLatin1(p, size, errors);
5492
5493    /* allocate enough for a simple encoding without
5494       replacements, if we need more, we'll resize */
5495    res = PyBytes_FromStringAndSize(NULL, size);
5496    if (res == NULL)
5497        goto onError;
5498    if (size == 0)
5499        return res;
5500
5501    while (inpos<size) {
5502        /* try to encode it */
5503        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5504        if (x==enc_EXCEPTION) /* error */
5505            goto onError;
5506        if (x==enc_FAILED) { /* unencodable character */
5507            if (charmap_encoding_error(p, size, &inpos, mapping,
5508                                       &exc,
5509                                       &known_errorHandler, &errorHandler, errors,
5510                                       &res, &respos)) {
5511                goto onError;
5512            }
5513        }
5514        else
5515            /* done with this character => adjust input position */
5516            ++inpos;
5517    }
5518
5519    /* Resize if we allocated to much */
5520    if (respos<PyBytes_GET_SIZE(res))
5521        if (_PyBytes_Resize(&res, respos) < 0)
5522            goto onError;
5523
5524    Py_XDECREF(exc);
5525    Py_XDECREF(errorHandler);
5526    return res;
5527
5528  onError:
5529    Py_XDECREF(res);
5530    Py_XDECREF(exc);
5531    Py_XDECREF(errorHandler);
5532    return NULL;
5533}
5534
5535PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5536                                    PyObject *mapping)
5537{
5538    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5539        PyErr_BadArgument();
5540        return NULL;
5541    }
5542    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5543                                   PyUnicode_GET_SIZE(unicode),
5544                                   mapping,
5545                                   NULL);
5546}
5547
5548/* create or adjust a UnicodeTranslateError */
5549static void make_translate_exception(PyObject **exceptionObject,
5550                                     const Py_UNICODE *unicode, Py_ssize_t size,
5551                                     Py_ssize_t startpos, Py_ssize_t endpos,
5552                                     const char *reason)
5553{
5554    if (*exceptionObject == NULL) {
5555        *exceptionObject = PyUnicodeTranslateError_Create(
5556            unicode, size, startpos, endpos, reason);
5557    }
5558    else {
5559        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5560            goto onError;
5561        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5562            goto onError;
5563        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5564            goto onError;
5565        return;
5566      onError:
5567        Py_DECREF(*exceptionObject);
5568        *exceptionObject = NULL;
5569    }
5570}
5571
5572/* raises a UnicodeTranslateError */
5573static void raise_translate_exception(PyObject **exceptionObject,
5574                                      const Py_UNICODE *unicode, Py_ssize_t size,
5575                                      Py_ssize_t startpos, Py_ssize_t endpos,
5576                                      const char *reason)
5577{
5578    make_translate_exception(exceptionObject,
5579                             unicode, size, startpos, endpos, reason);
5580    if (*exceptionObject != NULL)
5581        PyCodec_StrictErrors(*exceptionObject);
5582}
5583
5584/* error handling callback helper:
5585   build arguments, call the callback and check the arguments,
5586   put the result into newpos and return the replacement string, which
5587   has to be freed by the caller */
5588static PyObject *unicode_translate_call_errorhandler(const char *errors,
5589                                                     PyObject **errorHandler,
5590                                                     const char *reason,
5591                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5592                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5593                                                     Py_ssize_t *newpos)
5594{
5595    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5596
5597    Py_ssize_t i_newpos;
5598    PyObject *restuple;
5599    PyObject *resunicode;
5600
5601    if (*errorHandler == NULL) {
5602        *errorHandler = PyCodec_LookupError(errors);
5603        if (*errorHandler == NULL)
5604            return NULL;
5605    }
5606
5607    make_translate_exception(exceptionObject,
5608                             unicode, size, startpos, endpos, reason);
5609    if (*exceptionObject == NULL)
5610        return NULL;
5611
5612    restuple = PyObject_CallFunctionObjArgs(
5613        *errorHandler, *exceptionObject, NULL);
5614    if (restuple == NULL)
5615        return NULL;
5616    if (!PyTuple_Check(restuple)) {
5617        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5618        Py_DECREF(restuple);
5619        return NULL;
5620    }
5621    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5622                          &resunicode, &i_newpos)) {
5623        Py_DECREF(restuple);
5624        return NULL;
5625    }
5626    if (i_newpos<0)
5627        *newpos = size+i_newpos;
5628    else
5629        *newpos = i_newpos;
5630    if (*newpos<0 || *newpos>size) {
5631        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5632        Py_DECREF(restuple);
5633        return NULL;
5634    }
5635    Py_INCREF(resunicode);
5636    Py_DECREF(restuple);
5637    return resunicode;
5638}
5639
5640/* Lookup the character ch in the mapping and put the result in result,
5641   which must be decrefed by the caller.
5642   Return 0 on success, -1 on error */
5643static
5644int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5645{
5646    PyObject *w = PyLong_FromLong((long)c);
5647    PyObject *x;
5648
5649    if (w == NULL)
5650        return -1;
5651    x = PyObject_GetItem(mapping, w);
5652    Py_DECREF(w);
5653    if (x == NULL) {
5654        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5655            /* No mapping found means: use 1:1 mapping. */
5656            PyErr_Clear();
5657            *result = NULL;
5658            return 0;
5659        } else
5660            return -1;
5661    }
5662    else if (x == Py_None) {
5663        *result = x;
5664        return 0;
5665    }
5666    else if (PyLong_Check(x)) {
5667        long value = PyLong_AS_LONG(x);
5668        long max = PyUnicode_GetMax();
5669        if (value < 0 || value > max) {
5670            PyErr_Format(PyExc_TypeError,
5671                         "character mapping must be in range(0x%x)", max+1);
5672            Py_DECREF(x);
5673            return -1;
5674        }
5675        *result = x;
5676        return 0;
5677    }
5678    else if (PyUnicode_Check(x)) {
5679        *result = x;
5680        return 0;
5681    }
5682    else {
5683        /* wrong return value */
5684        PyErr_SetString(PyExc_TypeError,
5685                        "character mapping must return integer, None or str");
5686        Py_DECREF(x);
5687        return -1;
5688    }
5689}
5690/* ensure that *outobj is at least requiredsize characters long,
5691   if not reallocate and adjust various state variables.
5692   Return 0 on success, -1 on error */
5693static
5694int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
5695                               Py_ssize_t requiredsize)
5696{
5697    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
5698    if (requiredsize > oldsize) {
5699        /* remember old output position */
5700        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
5701        /* exponentially overallocate to minimize reallocations */
5702        if (requiredsize < 2 * oldsize)
5703            requiredsize = 2 * oldsize;
5704        if (PyUnicode_Resize(outobj, requiredsize) < 0)
5705            return -1;
5706        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
5707    }
5708    return 0;
5709}
5710/* lookup the character, put the result in the output string and adjust
5711   various state variables. Return a new reference to the object that
5712   was put in the output buffer in *result, or Py_None, if the mapping was
5713   undefined (in which case no character was written).
5714   The called must decref result.
5715   Return 0 on success, -1 on error. */
5716static
5717int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
5718                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
5719                            PyObject **res)
5720{
5721    if (charmaptranslate_lookup(*curinp, mapping, res))
5722        return -1;
5723    if (*res==NULL) {
5724        /* not found => default to 1:1 mapping */
5725        *(*outp)++ = *curinp;
5726    }
5727    else if (*res==Py_None)
5728        ;
5729    else if (PyLong_Check(*res)) {
5730        /* no overflow check, because we know that the space is enough */
5731        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
5732    }
5733    else if (PyUnicode_Check(*res)) {
5734        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
5735        if (repsize==1) {
5736            /* no overflow check, because we know that the space is enough */
5737            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
5738        }
5739        else if (repsize!=0) {
5740            /* more than one character */
5741            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
5742                (insize - (curinp-startinp)) +
5743                repsize - 1;
5744            if (charmaptranslate_makespace(outobj, outp, requiredsize))
5745                return -1;
5746            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
5747            *outp += repsize;
5748        }
5749    }
5750    else
5751        return -1;
5752    return 0;
5753}
5754
5755PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
5756                                     Py_ssize_t size,
5757                                     PyObject *mapping,
5758                                     const char *errors)
5759{
5760    /* output object */
5761    PyObject *res = NULL;
5762    /* pointers to the beginning and end+1 of input */
5763    const Py_UNICODE *startp = p;
5764    const Py_UNICODE *endp = p + size;
5765    /* pointer into the output */
5766    Py_UNICODE *str;
5767    /* current output position */
5768    Py_ssize_t respos = 0;
5769    char *reason = "character maps to <undefined>";
5770    PyObject *errorHandler = NULL;
5771    PyObject *exc = NULL;
5772    /* the following variable is used for caching string comparisons
5773     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5774     * 3=ignore, 4=xmlcharrefreplace */
5775    int known_errorHandler = -1;
5776
5777    if (mapping == NULL) {
5778        PyErr_BadArgument();
5779        return NULL;
5780    }
5781
5782    /* allocate enough for a simple 1:1 translation without
5783       replacements, if we need more, we'll resize */
5784    res = PyUnicode_FromUnicode(NULL, size);
5785    if (res == NULL)
5786        goto onError;
5787    if (size == 0)
5788        return res;
5789    str = PyUnicode_AS_UNICODE(res);
5790
5791    while (p<endp) {
5792        /* try to encode it */
5793        PyObject *x = NULL;
5794        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
5795            Py_XDECREF(x);
5796            goto onError;
5797        }
5798        Py_XDECREF(x);
5799        if (x!=Py_None) /* it worked => adjust input pointer */
5800            ++p;
5801        else { /* untranslatable character */
5802            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5803            Py_ssize_t repsize;
5804            Py_ssize_t newpos;
5805            Py_UNICODE *uni2;
5806            /* startpos for collecting untranslatable chars */
5807            const Py_UNICODE *collstart = p;
5808            const Py_UNICODE *collend = p+1;
5809            const Py_UNICODE *coll;
5810
5811            /* find all untranslatable characters */
5812            while (collend < endp) {
5813                if (charmaptranslate_lookup(*collend, mapping, &x))
5814                    goto onError;
5815                Py_XDECREF(x);
5816                if (x!=Py_None)
5817                    break;
5818                ++collend;
5819            }
5820            /* cache callback name lookup
5821             * (if not done yet, i.e. it's the first error) */
5822            if (known_errorHandler==-1) {
5823                if ((errors==NULL) || (!strcmp(errors, "strict")))
5824                    known_errorHandler = 1;
5825                else if (!strcmp(errors, "replace"))
5826                    known_errorHandler = 2;
5827                else if (!strcmp(errors, "ignore"))
5828                    known_errorHandler = 3;
5829                else if (!strcmp(errors, "xmlcharrefreplace"))
5830                    known_errorHandler = 4;
5831                else
5832                    known_errorHandler = 0;
5833            }
5834            switch (known_errorHandler) {
5835            case 1: /* strict */
5836                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
5837                goto onError;
5838            case 2: /* replace */
5839                /* No need to check for space, this is a 1:1 replacement */
5840                for (coll = collstart; coll<collend; ++coll)
5841                    *str++ = '?';
5842                /* fall through */
5843            case 3: /* ignore */
5844                p = collend;
5845                break;
5846            case 4: /* xmlcharrefreplace */
5847                /* generate replacement (temporarily (mis)uses p) */
5848                for (p = collstart; p < collend; ++p) {
5849                    char buffer[2+29+1+1];
5850                    char *cp;
5851                    sprintf(buffer, "&#%d;", (int)*p);
5852                    if (charmaptranslate_makespace(&res, &str,
5853                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
5854                        goto onError;
5855                    for (cp = buffer; *cp; ++cp)
5856                        *str++ = *cp;
5857                }
5858                p = collend;
5859                break;
5860            default:
5861                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
5862                                                                 reason, startp, size, &exc,
5863                                                                 collstart-startp, collend-startp, &newpos);
5864                if (repunicode == NULL)
5865                    goto onError;
5866                /* generate replacement  */
5867                repsize = PyUnicode_GET_SIZE(repunicode);
5868                if (charmaptranslate_makespace(&res, &str,
5869                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
5870                    Py_DECREF(repunicode);
5871                    goto onError;
5872                }
5873                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
5874                    *str++ = *uni2;
5875                p = startp + newpos;
5876                Py_DECREF(repunicode);
5877            }
5878        }
5879    }
5880    /* Resize if we allocated to much */
5881    respos = str-PyUnicode_AS_UNICODE(res);
5882    if (respos<PyUnicode_GET_SIZE(res)) {
5883        if (PyUnicode_Resize(&res, respos) < 0)
5884            goto onError;
5885    }
5886    Py_XDECREF(exc);
5887    Py_XDECREF(errorHandler);
5888    return res;
5889
5890  onError:
5891    Py_XDECREF(res);
5892    Py_XDECREF(exc);
5893    Py_XDECREF(errorHandler);
5894    return NULL;
5895}
5896
5897PyObject *PyUnicode_Translate(PyObject *str,
5898                              PyObject *mapping,
5899                              const char *errors)
5900{
5901    PyObject *result;
5902
5903    str = PyUnicode_FromObject(str);
5904    if (str == NULL)
5905        goto onError;
5906    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
5907                                        PyUnicode_GET_SIZE(str),
5908                                        mapping,
5909                                        errors);
5910    Py_DECREF(str);
5911    return result;
5912
5913  onError:
5914    Py_XDECREF(str);
5915    return NULL;
5916}
5917
5918/* --- Decimal Encoder ---------------------------------------------------- */
5919
5920int PyUnicode_EncodeDecimal(Py_UNICODE *s,
5921                            Py_ssize_t length,
5922                            char *output,
5923                            const char *errors)
5924{
5925    Py_UNICODE *p, *end;
5926    PyObject *errorHandler = NULL;
5927    PyObject *exc = NULL;
5928    const char *encoding = "decimal";
5929    const char *reason = "invalid decimal Unicode string";
5930    /* the following variable is used for caching string comparisons
5931     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
5932    int known_errorHandler = -1;
5933
5934    if (output == NULL) {
5935        PyErr_BadArgument();
5936        return -1;
5937    }
5938
5939    p = s;
5940    end = s + length;
5941    while (p < end) {
5942        register Py_UNICODE ch = *p;
5943        int decimal;
5944        PyObject *repunicode;
5945        Py_ssize_t repsize;
5946        Py_ssize_t newpos;
5947        Py_UNICODE *uni2;
5948        Py_UNICODE *collstart;
5949        Py_UNICODE *collend;
5950
5951        if (Py_UNICODE_ISSPACE(ch)) {
5952            *output++ = ' ';
5953            ++p;
5954            continue;
5955        }
5956        decimal = Py_UNICODE_TODECIMAL(ch);
5957        if (decimal >= 0) {
5958            *output++ = '0' + decimal;
5959            ++p;
5960            continue;
5961        }
5962        if (0 < ch && ch < 256) {
5963            *output++ = (char)ch;
5964            ++p;
5965            continue;
5966        }
5967        /* All other characters are considered unencodable */
5968        collstart = p;
5969        collend = p+1;
5970        while (collend < end) {
5971            if ((0 < *collend && *collend < 256) ||
5972                !Py_UNICODE_ISSPACE(*collend) ||
5973                Py_UNICODE_TODECIMAL(*collend))
5974                break;
5975        }
5976        /* cache callback name lookup
5977         * (if not done yet, i.e. it's the first error) */
5978        if (known_errorHandler==-1) {
5979            if ((errors==NULL) || (!strcmp(errors, "strict")))
5980                known_errorHandler = 1;
5981            else if (!strcmp(errors, "replace"))
5982                known_errorHandler = 2;
5983            else if (!strcmp(errors, "ignore"))
5984                known_errorHandler = 3;
5985            else if (!strcmp(errors, "xmlcharrefreplace"))
5986                known_errorHandler = 4;
5987            else
5988                known_errorHandler = 0;
5989        }
5990        switch (known_errorHandler) {
5991        case 1: /* strict */
5992            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
5993            goto onError;
5994        case 2: /* replace */
5995            for (p = collstart; p < collend; ++p)
5996                *output++ = '?';
5997            /* fall through */
5998        case 3: /* ignore */
5999            p = collend;
6000            break;
6001        case 4: /* xmlcharrefreplace */
6002            /* generate replacement (temporarily (mis)uses p) */
6003            for (p = collstart; p < collend; ++p)
6004                output += sprintf(output, "&#%d;", (int)*p);
6005            p = collend;
6006            break;
6007        default:
6008            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6009                                                          encoding, reason, s, length, &exc,
6010                                                          collstart-s, collend-s, &newpos);
6011            if (repunicode == NULL)
6012                goto onError;
6013            if (!PyUnicode_Check(repunicode)) {
6014                /* Byte results not supported, since they have no decimal property. */
6015                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6016                Py_DECREF(repunicode);
6017                goto onError;
6018            }
6019            /* generate replacement  */
6020            repsize = PyUnicode_GET_SIZE(repunicode);
6021            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6022                Py_UNICODE ch = *uni2;
6023                if (Py_UNICODE_ISSPACE(ch))
6024                    *output++ = ' ';
6025                else {
6026                    decimal = Py_UNICODE_TODECIMAL(ch);
6027                    if (decimal >= 0)
6028                        *output++ = '0' + decimal;
6029                    else if (0 < ch && ch < 256)
6030                        *output++ = (char)ch;
6031                    else {
6032                        Py_DECREF(repunicode);
6033                        raise_encode_exception(&exc, encoding,
6034                                               s, length, collstart-s, collend-s, reason);
6035                        goto onError;
6036                    }
6037                }
6038            }
6039            p = s + newpos;
6040            Py_DECREF(repunicode);
6041        }
6042    }
6043    /* 0-terminate the output string */
6044    *output++ = '\0';
6045    Py_XDECREF(exc);
6046    Py_XDECREF(errorHandler);
6047    return 0;
6048
6049  onError:
6050    Py_XDECREF(exc);
6051    Py_XDECREF(errorHandler);
6052    return -1;
6053}
6054
6055/* --- Helpers ------------------------------------------------------------ */
6056
6057#include "stringlib/unicodedefs.h"
6058#include "stringlib/fastsearch.h"
6059
6060#include "stringlib/count.h"
6061#include "stringlib/find.h"
6062#include "stringlib/partition.h"
6063#include "stringlib/split.h"
6064
6065#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6066#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6067#include "stringlib/localeutil.h"
6068
/* helper macro to fixup start/end slice values against a length `len`:
   `end` is clamped to `len`; a negative `end` or `start` is taken relative
   to `len` and clamped to 0 if it is still negative.
   `start` and `end` must be modifiable lvalues and are updated in place;
   `start` is not clamped to `len`.
   The body is wrapped in do { } while (0) so that the macro behaves as a
   single statement (e.g. in an unbraced if/else); the arguments are
   parenthesized so that compound expressions expand correctly. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
6083
6084Py_ssize_t PyUnicode_Count(PyObject *str,
6085                           PyObject *substr,
6086                           Py_ssize_t start,
6087                           Py_ssize_t end)
6088{
6089    Py_ssize_t result;
6090    PyUnicodeObject* str_obj;
6091    PyUnicodeObject* sub_obj;
6092
6093    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6094    if (!str_obj)
6095        return -1;
6096    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6097    if (!sub_obj) {
6098        Py_DECREF(str_obj);
6099        return -1;
6100    }
6101
6102    ADJUST_INDICES(start, end, str_obj->length);
6103    result = stringlib_count(
6104        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6105        PY_SSIZE_T_MAX
6106        );
6107
6108    Py_DECREF(sub_obj);
6109    Py_DECREF(str_obj);
6110
6111    return result;
6112}
6113
6114Py_ssize_t PyUnicode_Find(PyObject *str,
6115                          PyObject *sub,
6116                          Py_ssize_t start,
6117                          Py_ssize_t end,
6118                          int direction)
6119{
6120    Py_ssize_t result;
6121
6122    str = PyUnicode_FromObject(str);
6123    if (!str)
6124        return -2;
6125    sub = PyUnicode_FromObject(sub);
6126    if (!sub) {
6127        Py_DECREF(str);
6128        return -2;
6129    }
6130
6131    if (direction > 0)
6132        result = stringlib_find_slice(
6133            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6134            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6135            start, end
6136            );
6137    else
6138        result = stringlib_rfind_slice(
6139            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6140            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6141            start, end
6142            );
6143
6144    Py_DECREF(str);
6145    Py_DECREF(sub);
6146
6147    return result;
6148}
6149
6150static
6151int tailmatch(PyUnicodeObject *self,
6152              PyUnicodeObject *substring,
6153              Py_ssize_t start,
6154              Py_ssize_t end,
6155              int direction)
6156{
6157    if (substring->length == 0)
6158        return 1;
6159
6160    ADJUST_INDICES(start, end, self->length);
6161    end -= substring->length;
6162    if (end < start)
6163        return 0;
6164
6165    if (direction > 0) {
6166        if (Py_UNICODE_MATCH(self, end, substring))
6167            return 1;
6168    } else {
6169        if (Py_UNICODE_MATCH(self, start, substring))
6170            return 1;
6171    }
6172
6173    return 0;
6174}
6175
6176Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6177                               PyObject *substr,
6178                               Py_ssize_t start,
6179                               Py_ssize_t end,
6180                               int direction)
6181{
6182    Py_ssize_t result;
6183
6184    str = PyUnicode_FromObject(str);
6185    if (str == NULL)
6186        return -1;
6187    substr = PyUnicode_FromObject(substr);
6188    if (substr == NULL) {
6189        Py_DECREF(str);
6190        return -1;
6191    }
6192
6193    result = tailmatch((PyUnicodeObject *)str,
6194                       (PyUnicodeObject *)substr,
6195                       start, end, direction);
6196    Py_DECREF(str);
6197    Py_DECREF(substr);
6198    return result;
6199}
6200
6201/* Apply fixfct filter to the Unicode object self and return a
6202   reference to the modified object */
6203
6204static
6205PyObject *fixup(PyUnicodeObject *self,
6206                int (*fixfct)(PyUnicodeObject *s))
6207{
6208
6209    PyUnicodeObject *u;
6210
6211    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6212    if (u == NULL)
6213        return NULL;
6214
6215    Py_UNICODE_COPY(u->str, self->str, self->length);
6216
6217    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6218        /* fixfct should return TRUE if it modified the buffer. If
6219           FALSE, return a reference to the original buffer instead
6220           (to save space, not time) */
6221        Py_INCREF(self);
6222        Py_DECREF(u);
6223        return (PyObject*) self;
6224    }
6225    return (PyObject*) u;
6226}
6227
6228static
6229int fixupper(PyUnicodeObject *self)
6230{
6231    Py_ssize_t len = self->length;
6232    Py_UNICODE *s = self->str;
6233    int status = 0;
6234
6235    while (len-- > 0) {
6236        register Py_UNICODE ch;
6237
6238        ch = Py_UNICODE_TOUPPER(*s);
6239        if (ch != *s) {
6240            status = 1;
6241            *s = ch;
6242        }
6243        s++;
6244    }
6245
6246    return status;
6247}
6248
6249static
6250int fixlower(PyUnicodeObject *self)
6251{
6252    Py_ssize_t len = self->length;
6253    Py_UNICODE *s = self->str;
6254    int status = 0;
6255
6256    while (len-- > 0) {
6257        register Py_UNICODE ch;
6258
6259        ch = Py_UNICODE_TOLOWER(*s);
6260        if (ch != *s) {
6261            status = 1;
6262            *s = ch;
6263        }
6264        s++;
6265    }
6266
6267    return status;
6268}
6269
6270static
6271int fixswapcase(PyUnicodeObject *self)
6272{
6273    Py_ssize_t len = self->length;
6274    Py_UNICODE *s = self->str;
6275    int status = 0;
6276
6277    while (len-- > 0) {
6278        if (Py_UNICODE_ISUPPER(*s)) {
6279            *s = Py_UNICODE_TOLOWER(*s);
6280            status = 1;
6281        } else if (Py_UNICODE_ISLOWER(*s)) {
6282            *s = Py_UNICODE_TOUPPER(*s);
6283            status = 1;
6284        }
6285        s++;
6286    }
6287
6288    return status;
6289}
6290
6291static
6292int fixcapitalize(PyUnicodeObject *self)
6293{
6294    Py_ssize_t len = self->length;
6295    Py_UNICODE *s = self->str;
6296    int status = 0;
6297
6298    if (len == 0)
6299        return 0;
6300    if (Py_UNICODE_ISLOWER(*s)) {
6301        *s = Py_UNICODE_TOUPPER(*s);
6302        status = 1;
6303    }
6304    s++;
6305    while (--len > 0) {
6306        if (Py_UNICODE_ISUPPER(*s)) {
6307            *s = Py_UNICODE_TOLOWER(*s);
6308            status = 1;
6309        }
6310        s++;
6311    }
6312    return status;
6313}
6314
6315static
6316int fixtitle(PyUnicodeObject *self)
6317{
6318    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6319    register Py_UNICODE *e;
6320    int previous_is_cased;
6321
6322    /* Shortcut for single character strings */
6323    if (PyUnicode_GET_SIZE(self) == 1) {
6324        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6325        if (*p != ch) {
6326            *p = ch;
6327            return 1;
6328        }
6329        else
6330            return 0;
6331    }
6332
6333    e = p + PyUnicode_GET_SIZE(self);
6334    previous_is_cased = 0;
6335    for (; p < e; p++) {
6336        register const Py_UNICODE ch = *p;
6337
6338        if (previous_is_cased)
6339            *p = Py_UNICODE_TOLOWER(ch);
6340        else
6341            *p = Py_UNICODE_TOTITLE(ch);
6342
6343        if (Py_UNICODE_ISLOWER(ch) ||
6344            Py_UNICODE_ISUPPER(ch) ||
6345            Py_UNICODE_ISTITLE(ch))
6346            previous_is_cased = 1;
6347        else
6348            previous_is_cased = 0;
6349    }
6350    return 1;
6351}
6352
6353PyObject *
6354PyUnicode_Join(PyObject *separator, PyObject *seq)
6355{
6356    const Py_UNICODE blank = ' ';
6357    const Py_UNICODE *sep = &blank;
6358    Py_ssize_t seplen = 1;
6359    PyUnicodeObject *res = NULL; /* the result */
6360    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6361    PyObject *fseq;          /* PySequence_Fast(seq) */
6362    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6363    PyObject **items;
6364    PyObject *item;
6365    Py_ssize_t sz, i;
6366
6367    fseq = PySequence_Fast(seq, "");
6368    if (fseq == NULL) {
6369        return NULL;
6370    }
6371
6372    /* NOTE: the following code can't call back into Python code,
6373     * so we are sure that fseq won't be mutated.
6374     */
6375
6376    seqlen = PySequence_Fast_GET_SIZE(fseq);
6377    /* If empty sequence, return u"". */
6378    if (seqlen == 0) {
6379        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6380        goto Done;
6381    }
6382    items = PySequence_Fast_ITEMS(fseq);
6383    /* If singleton sequence with an exact Unicode, return that. */
6384    if (seqlen == 1) {
6385        item = items[0];
6386        if (PyUnicode_CheckExact(item)) {
6387            Py_INCREF(item);
6388            res = (PyUnicodeObject *)item;
6389            goto Done;
6390        }
6391    }
6392    else {
6393        /* Set up sep and seplen */
6394        if (separator == NULL) {
6395            sep = &blank;
6396            seplen = 1;
6397        }
6398        else {
6399            if (!PyUnicode_Check(separator)) {
6400                PyErr_Format(PyExc_TypeError,
6401                             "separator: expected str instance,"
6402                             " %.80s found",
6403                             Py_TYPE(separator)->tp_name);
6404                goto onError;
6405            }
6406            sep = PyUnicode_AS_UNICODE(separator);
6407            seplen = PyUnicode_GET_SIZE(separator);
6408        }
6409    }
6410
6411    /* There are at least two things to join, or else we have a subclass
6412     * of str in the sequence.
6413     * Do a pre-pass to figure out the total amount of space we'll
6414     * need (sz), and see whether all argument are strings.
6415     */
6416    sz = 0;
6417    for (i = 0; i < seqlen; i++) {
6418        const Py_ssize_t old_sz = sz;
6419        item = items[i];
6420        if (!PyUnicode_Check(item)) {
6421            PyErr_Format(PyExc_TypeError,
6422                         "sequence item %zd: expected str instance,"
6423                         " %.80s found",
6424                         i, Py_TYPE(item)->tp_name);
6425            goto onError;
6426        }
6427        sz += PyUnicode_GET_SIZE(item);
6428        if (i != 0)
6429            sz += seplen;
6430        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6431            PyErr_SetString(PyExc_OverflowError,
6432                            "join() result is too long for a Python string");
6433            goto onError;
6434        }
6435    }
6436
6437    res = _PyUnicode_New(sz);
6438    if (res == NULL)
6439        goto onError;
6440
6441    /* Catenate everything. */
6442    res_p = PyUnicode_AS_UNICODE(res);
6443    for (i = 0; i < seqlen; ++i) {
6444        Py_ssize_t itemlen;
6445        item = items[i];
6446        itemlen = PyUnicode_GET_SIZE(item);
6447        /* Copy item, and maybe the separator. */
6448        if (i) {
6449            Py_UNICODE_COPY(res_p, sep, seplen);
6450            res_p += seplen;
6451        }
6452        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6453        res_p += itemlen;
6454    }
6455
6456  Done:
6457    Py_DECREF(fseq);
6458    return (PyObject *)res;
6459
6460  onError:
6461    Py_DECREF(fseq);
6462    Py_XDECREF(res);
6463    return NULL;
6464}
6465
6466static
6467PyUnicodeObject *pad(PyUnicodeObject *self,
6468                     Py_ssize_t left,
6469                     Py_ssize_t right,
6470                     Py_UNICODE fill)
6471{
6472    PyUnicodeObject *u;
6473
6474    if (left < 0)
6475        left = 0;
6476    if (right < 0)
6477        right = 0;
6478
6479    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6480        Py_INCREF(self);
6481        return self;
6482    }
6483
6484    if (left > PY_SSIZE_T_MAX - self->length ||
6485        right > PY_SSIZE_T_MAX - (left + self->length)) {
6486        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6487        return NULL;
6488    }
6489    u = _PyUnicode_New(left + self->length + right);
6490    if (u) {
6491        if (left)
6492            Py_UNICODE_FILL(u->str, fill, left);
6493        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6494        if (right)
6495            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6496    }
6497
6498    return u;
6499}
6500
6501PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
6502{
6503    PyObject *list;
6504
6505    string = PyUnicode_FromObject(string);
6506    if (string == NULL)
6507        return NULL;
6508
6509    list = stringlib_splitlines(
6510        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6511        PyUnicode_GET_SIZE(string), keepends);
6512
6513    Py_DECREF(string);
6514    return list;
6515}
6516
6517static
6518PyObject *split(PyUnicodeObject *self,
6519                PyUnicodeObject *substring,
6520                Py_ssize_t maxcount)
6521{
6522    if (maxcount < 0)
6523        maxcount = PY_SSIZE_T_MAX;
6524
6525    if (substring == NULL)
6526        return stringlib_split_whitespace(
6527            (PyObject*) self,  self->str, self->length, maxcount
6528            );
6529
6530    return stringlib_split(
6531        (PyObject*) self,  self->str, self->length,
6532        substring->str, substring->length,
6533        maxcount
6534        );
6535}
6536
6537static
6538PyObject *rsplit(PyUnicodeObject *self,
6539                 PyUnicodeObject *substring,
6540                 Py_ssize_t maxcount)
6541{
6542    if (maxcount < 0)
6543        maxcount = PY_SSIZE_T_MAX;
6544
6545    if (substring == NULL)
6546        return stringlib_rsplit_whitespace(
6547            (PyObject*) self,  self->str, self->length, maxcount
6548            );
6549
6550    return stringlib_rsplit(
6551        (PyObject*) self,  self->str, self->length,
6552        substring->str, substring->length,
6553        maxcount
6554        );
6555}
6556
6557static
6558PyObject *replace(PyUnicodeObject *self,
6559                  PyUnicodeObject *str1,
6560                  PyUnicodeObject *str2,
6561                  Py_ssize_t maxcount)
6562{
6563    PyUnicodeObject *u;
6564
6565    if (maxcount < 0)
6566        maxcount = PY_SSIZE_T_MAX;
6567    else if (maxcount == 0 || self->length == 0)
6568        goto nothing;
6569
6570    if (str1->length == str2->length) {
6571        Py_ssize_t i;
6572        /* same length */
6573        if (str1->length == 0)
6574            goto nothing;
6575        if (str1->length == 1) {
6576            /* replace characters */
6577            Py_UNICODE u1, u2;
6578            if (!findchar(self->str, self->length, str1->str[0]))
6579                goto nothing;
6580            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6581            if (!u)
6582                return NULL;
6583            Py_UNICODE_COPY(u->str, self->str, self->length);
6584            u1 = str1->str[0];
6585            u2 = str2->str[0];
6586            for (i = 0; i < u->length; i++)
6587                if (u->str[i] == u1) {
6588                    if (--maxcount < 0)
6589                        break;
6590                    u->str[i] = u2;
6591                }
6592        } else {
6593            i = stringlib_find(
6594                self->str, self->length, str1->str, str1->length, 0
6595                );
6596            if (i < 0)
6597                goto nothing;
6598            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6599            if (!u)
6600                return NULL;
6601            Py_UNICODE_COPY(u->str, self->str, self->length);
6602
6603            /* change everything in-place, starting with this one */
6604            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6605            i += str1->length;
6606
6607            while ( --maxcount > 0) {
6608                i = stringlib_find(self->str+i, self->length-i,
6609                                   str1->str, str1->length,
6610                                   i);
6611                if (i == -1)
6612                    break;
6613                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6614                i += str1->length;
6615            }
6616        }
6617    } else {
6618
6619        Py_ssize_t n, i, j, e;
6620        Py_ssize_t product, new_size, delta;
6621        Py_UNICODE *p;
6622
6623        /* replace strings */
6624        n = stringlib_count(self->str, self->length, str1->str, str1->length,
6625                            maxcount);
6626        if (n == 0)
6627            goto nothing;
6628        /* new_size = self->length + n * (str2->length - str1->length)); */
6629        delta = (str2->length - str1->length);
6630        if (delta == 0) {
6631            new_size = self->length;
6632        } else {
6633            product = n * (str2->length - str1->length);
6634            if ((product / (str2->length - str1->length)) != n) {
6635                PyErr_SetString(PyExc_OverflowError,
6636                                "replace string is too long");
6637                return NULL;
6638            }
6639            new_size = self->length + product;
6640            if (new_size < 0) {
6641                PyErr_SetString(PyExc_OverflowError,
6642                                "replace string is too long");
6643                return NULL;
6644            }
6645        }
6646        u = _PyUnicode_New(new_size);
6647        if (!u)
6648            return NULL;
6649        i = 0;
6650        p = u->str;
6651        e = self->length - str1->length;
6652        if (str1->length > 0) {
6653            while (n-- > 0) {
6654                /* look for next match */
6655                j = stringlib_find(self->str+i, self->length-i,
6656                                   str1->str, str1->length,
6657                                   i);
6658                if (j == -1)
6659                    break;
6660                else if (j > i) {
6661                    /* copy unchanged part [i:j] */
6662                    Py_UNICODE_COPY(p, self->str+i, j-i);
6663                    p += j - i;
6664                }
6665                /* copy substitution string */
6666                if (str2->length > 0) {
6667                    Py_UNICODE_COPY(p, str2->str, str2->length);
6668                    p += str2->length;
6669                }
6670                i = j + str1->length;
6671            }
6672            if (i < self->length)
6673                /* copy tail [i:] */
6674                Py_UNICODE_COPY(p, self->str+i, self->length-i);
6675        } else {
6676            /* interleave */
6677            while (n > 0) {
6678                Py_UNICODE_COPY(p, str2->str, str2->length);
6679                p += str2->length;
6680                if (--n <= 0)
6681                    break;
6682                *p++ = self->str[i++];
6683            }
6684            Py_UNICODE_COPY(p, self->str+i, self->length-i);
6685        }
6686    }
6687    return (PyObject *) u;
6688
6689  nothing:
6690    /* nothing to replace; return original string (when possible) */
6691    if (PyUnicode_CheckExact(self)) {
6692        Py_INCREF(self);
6693        return (PyObject *) self;
6694    }
6695    return PyUnicode_FromUnicode(self->str, self->length);
6696}
6697
6698/* --- Unicode Object Methods --------------------------------------------- */
6699
6700PyDoc_STRVAR(title__doc__,
6701             "S.title() -> str\n\
6702\n\
6703Return a titlecased version of S, i.e. words start with title case\n\
6704characters, all remaining cased characters have lower case.");
6705
6706static PyObject*
6707unicode_title(PyUnicodeObject *self)
6708{
6709    return fixup(self, fixtitle);
6710}
6711
6712PyDoc_STRVAR(capitalize__doc__,
6713             "S.capitalize() -> str\n\
6714\n\
6715Return a capitalized version of S, i.e. make the first character\n\
6716have upper case and the rest lower case.");
6717
6718static PyObject*
6719unicode_capitalize(PyUnicodeObject *self)
6720{
6721    return fixup(self, fixcapitalize);
6722}
6723
#if 0
/* Disabled code: compiled out by the surrounding #if 0. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n"
             "\n"
             "Apply .capitalize() to all words in S and return the result with\n"
             "normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word through fixup() and join
   the words again with single blanks.  Returns NULL on any failure. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *old_word = PyList_GET_ITEM(words, idx);
        PyObject *new_word = fixup((PyUnicodeObject *)old_word,
                                   fixcapitalize);

        if (new_word == NULL) {
            Py_DECREF(words);
            return NULL;
        }
        /* PyList_SET_ITEM does not release the previous item. */
        Py_DECREF(old_word);
        PyList_SET_ITEM(words, idx, new_word);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);
    Py_DECREF(words);
    return joined;
}
#endif
6761
6762/* Argument converter.  Coerces to a single unicode character */
6763
6764static int
6765convert_uc(PyObject *obj, void *addr)
6766{
6767    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
6768    PyObject *uniobj;
6769    Py_UNICODE *unistr;
6770
6771    uniobj = PyUnicode_FromObject(obj);
6772    if (uniobj == NULL) {
6773        PyErr_SetString(PyExc_TypeError,
6774                        "The fill character cannot be converted to Unicode");
6775        return 0;
6776    }
6777    if (PyUnicode_GET_SIZE(uniobj) != 1) {
6778        PyErr_SetString(PyExc_TypeError,
6779                        "The fill character must be exactly one character long");
6780        Py_DECREF(uniobj);
6781        return 0;
6782    }
6783    unistr = PyUnicode_AS_UNICODE(uniobj);
6784    *fillcharloc = unistr[0];
6785    Py_DECREF(uniobj);
6786    return 1;
6787}
6788
6789PyDoc_STRVAR(center__doc__,
6790             "S.center(width[, fillchar]) -> str\n\
6791\n\
6792Return S centered in a string of length width. Padding is\n\
6793done using the specified fill character (default is a space)");
6794
6795static PyObject *
6796unicode_center(PyUnicodeObject *self, PyObject *args)
6797{
6798    Py_ssize_t marg, left;
6799    Py_ssize_t width;
6800    Py_UNICODE fillchar = ' ';
6801
6802    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
6803        return NULL;
6804
6805    if (self->length >= width && PyUnicode_CheckExact(self)) {
6806        Py_INCREF(self);
6807        return (PyObject*) self;
6808    }
6809
6810    marg = width - self->length;
6811    left = marg / 2 + (marg & width & 1);
6812
6813    return (PyObject*) pad(self, left, marg - left, fillchar);
6814}
6815
6816#if 0
6817
6818/* This code should go into some future Unicode collation support
6819   module. The basic comparison should compare ordinals on a naive
6820   basis (this is what Java does and thus Jython too). */
6821
6822/* speedy UTF-16 code point order comparison */
6823/* gleaned from: */
6824/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
6825
6826static short utf16Fixup[32] =
6827{
6828    0, 0, 0, 0, 0, 0, 0, 0,
6829    0, 0, 0, 0, 0, 0, 0, 0,
6830    0, 0, 0, 0, 0, 0, 0, 0,
6831    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
6832};
6833
6834static int
6835unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6836{
6837    Py_ssize_t len1, len2;
6838
6839    Py_UNICODE *s1 = str1->str;
6840    Py_UNICODE *s2 = str2->str;
6841
6842    len1 = str1->length;
6843    len2 = str2->length;
6844
6845    while (len1 > 0 && len2 > 0) {
6846        Py_UNICODE c1, c2;
6847
6848        c1 = *s1++;
6849        c2 = *s2++;
6850
6851        if (c1 > (1<<11) * 26)
6852            c1 += utf16Fixup[c1>>11];
6853        if (c2 > (1<<11) * 26)
6854            c2 += utf16Fixup[c2>>11];
6855        /* now c1 and c2 are in UTF-32-compatible order */
6856
6857        if (c1 != c2)
6858            return (c1 < c2) ? -1 : 1;
6859
6860        len1--; len2--;
6861    }
6862
6863    return (len1 < len2) ? -1 : (len1 != len2);
6864}
6865
6866#else
6867
6868static int
6869unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
6870{
6871    register Py_ssize_t len1, len2;
6872
6873    Py_UNICODE *s1 = str1->str;
6874    Py_UNICODE *s2 = str2->str;
6875
6876    len1 = str1->length;
6877    len2 = str2->length;
6878
6879    while (len1 > 0 && len2 > 0) {
6880        Py_UNICODE c1, c2;
6881
6882        c1 = *s1++;
6883        c2 = *s2++;
6884
6885        if (c1 != c2)
6886            return (c1 < c2) ? -1 : 1;
6887
6888        len1--; len2--;
6889    }
6890
6891    return (len1 < len2) ? -1 : (len1 != len2);
6892}
6893
6894#endif
6895
6896int PyUnicode_Compare(PyObject *left,
6897                      PyObject *right)
6898{
6899    if (PyUnicode_Check(left) && PyUnicode_Check(right))
6900        return unicode_compare((PyUnicodeObject *)left,
6901                               (PyUnicodeObject *)right);
6902    PyErr_Format(PyExc_TypeError,
6903                 "Can't compare %.100s and %.100s",
6904                 left->ob_type->tp_name,
6905                 right->ob_type->tp_name);
6906    return -1;
6907}
6908
6909int
6910PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
6911{
6912    int i;
6913    Py_UNICODE *id;
6914    assert(PyUnicode_Check(uni));
6915    id = PyUnicode_AS_UNICODE(uni);
6916    /* Compare Unicode string and source character set string */
6917    for (i = 0; id[i] && str[i]; i++)
6918        if (id[i] != str[i])
6919            return ((int)id[i] < (int)str[i]) ? -1 : 1;
6920    /* This check keeps Python strings that end in '\0' from comparing equal
6921     to C strings identical up to that point. */
6922    if (PyUnicode_GET_SIZE(uni) != i || id[i])
6923        return 1; /* uni is longer */
6924    if (str[i])
6925        return -1; /* str is longer */
6926    return 0;
6927}
6928
6929
6930#define TEST_COND(cond)                         \
6931    ((cond) ? Py_True : Py_False)
6932
6933PyObject *PyUnicode_RichCompare(PyObject *left,
6934                                PyObject *right,
6935                                int op)
6936{
6937    int result;
6938
6939    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
6940        PyObject *v;
6941        if (((PyUnicodeObject *) left)->length !=
6942            ((PyUnicodeObject *) right)->length) {
6943            if (op == Py_EQ) {
6944                Py_INCREF(Py_False);
6945                return Py_False;
6946            }
6947            if (op == Py_NE) {
6948                Py_INCREF(Py_True);
6949                return Py_True;
6950            }
6951        }
6952        if (left == right)
6953            result = 0;
6954        else
6955            result = unicode_compare((PyUnicodeObject *)left,
6956                                     (PyUnicodeObject *)right);
6957
6958        /* Convert the return value to a Boolean */
6959        switch (op) {
6960        case Py_EQ:
6961            v = TEST_COND(result == 0);
6962            break;
6963        case Py_NE:
6964            v = TEST_COND(result != 0);
6965            break;
6966        case Py_LE:
6967            v = TEST_COND(result <= 0);
6968            break;
6969        case Py_GE:
6970            v = TEST_COND(result >= 0);
6971            break;
6972        case Py_LT:
6973            v = TEST_COND(result == -1);
6974            break;
6975        case Py_GT:
6976            v = TEST_COND(result == 1);
6977            break;
6978        default:
6979            PyErr_BadArgument();
6980            return NULL;
6981        }
6982        Py_INCREF(v);
6983        return v;
6984    }
6985
6986    Py_INCREF(Py_NotImplemented);
6987    return Py_NotImplemented;
6988}
6989
6990int PyUnicode_Contains(PyObject *container,
6991                       PyObject *element)
6992{
6993    PyObject *str, *sub;
6994    int result;
6995
6996    /* Coerce the two arguments */
6997    sub = PyUnicode_FromObject(element);
6998    if (!sub) {
6999        PyErr_Format(PyExc_TypeError,
7000                     "'in <string>' requires string as left operand, not %s",
7001                     element->ob_type->tp_name);
7002        return -1;
7003    }
7004
7005    str = PyUnicode_FromObject(container);
7006    if (!str) {
7007        Py_DECREF(sub);
7008        return -1;
7009    }
7010
7011    result = stringlib_contains_obj(str, sub);
7012
7013    Py_DECREF(str);
7014    Py_DECREF(sub);
7015
7016    return result;
7017}
7018
7019/* Concat to string or Unicode object giving a new Unicode object. */
7020
7021PyObject *PyUnicode_Concat(PyObject *left,
7022                           PyObject *right)
7023{
7024    PyUnicodeObject *u = NULL, *v = NULL, *w;
7025
7026    /* Coerce the two arguments */
7027    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7028    if (u == NULL)
7029        goto onError;
7030    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7031    if (v == NULL)
7032        goto onError;
7033
7034    /* Shortcuts */
7035    if (v == unicode_empty) {
7036        Py_DECREF(v);
7037        return (PyObject *)u;
7038    }
7039    if (u == unicode_empty) {
7040        Py_DECREF(u);
7041        return (PyObject *)v;
7042    }
7043
7044    /* Concat the two Unicode strings */
7045    w = _PyUnicode_New(u->length + v->length);
7046    if (w == NULL)
7047        goto onError;
7048    Py_UNICODE_COPY(w->str, u->str, u->length);
7049    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7050
7051    Py_DECREF(u);
7052    Py_DECREF(v);
7053    return (PyObject *)w;
7054
7055  onError:
7056    Py_XDECREF(u);
7057    Py_XDECREF(v);
7058    return NULL;
7059}
7060
7061void
7062PyUnicode_Append(PyObject **pleft, PyObject *right)
7063{
7064    PyObject *new;
7065    if (*pleft == NULL)
7066        return;
7067    if (right == NULL || !PyUnicode_Check(*pleft)) {
7068        Py_DECREF(*pleft);
7069        *pleft = NULL;
7070        return;
7071    }
7072    new = PyUnicode_Concat(*pleft, right);
7073    Py_DECREF(*pleft);
7074    *pleft = new;
7075}
7076
7077void
7078PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7079{
7080    PyUnicode_Append(pleft, right);
7081    Py_XDECREF(right);
7082}
7083
7084PyDoc_STRVAR(count__doc__,
7085             "S.count(sub[, start[, end]]) -> int\n\
7086\n\
7087Return the number of non-overlapping occurrences of substring sub in\n\
7088string S[start:end].  Optional arguments start and end are\n\
7089interpreted as in slice notation.");
7090
7091static PyObject *
7092unicode_count(PyUnicodeObject *self, PyObject *args)
7093{
7094    PyUnicodeObject *substring;
7095    Py_ssize_t start = 0;
7096    Py_ssize_t end = PY_SSIZE_T_MAX;
7097    PyObject *result;
7098
7099    if (!PyArg_ParseTuple(args, "O|O&O&:count", &substring,
7100                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
7101        return NULL;
7102
7103    substring = (PyUnicodeObject *)PyUnicode_FromObject(
7104        (PyObject *)substring);
7105    if (substring == NULL)
7106        return NULL;
7107
7108    ADJUST_INDICES(start, end, self->length);
7109    result = PyLong_FromSsize_t(
7110        stringlib_count(self->str + start, end - start,
7111                        substring->str, substring->length,
7112                        PY_SSIZE_T_MAX)
7113        );
7114
7115    Py_DECREF(substring);
7116
7117    return result;
7118}
7119
7120PyDoc_STRVAR(encode__doc__,
7121             "S.encode([encoding[, errors]]) -> bytes\n\
7122\n\
7123Encode S using the codec registered for encoding. encoding defaults\n\
7124to the default encoding. errors may be given to set a different error\n\
7125handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7126a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7127'xmlcharrefreplace' as well as any other name registered with\n\
7128codecs.register_error that can handle UnicodeEncodeErrors.");
7129
7130static PyObject *
7131unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7132{
7133    static char *kwlist[] = {"encoding", "errors", 0};
7134    char *encoding = NULL;
7135    char *errors = NULL;
7136    PyObject *v;
7137
7138    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7139                                     kwlist, &encoding, &errors))
7140        return NULL;
7141    v = PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7142    if (v == NULL)
7143        goto onError;
7144    if (!PyBytes_Check(v)) {
7145        PyErr_Format(PyExc_TypeError,
7146                     "encoder did not return a bytes object "
7147                     "(type=%.400s)",
7148                     Py_TYPE(v)->tp_name);
7149        Py_DECREF(v);
7150        return NULL;
7151    }
7152    return v;
7153
7154  onError:
7155    return NULL;
7156}
7157
7158PyDoc_STRVAR(expandtabs__doc__,
7159             "S.expandtabs([tabsize]) -> str\n\
7160\n\
7161Return a copy of S where all tab characters are expanded using spaces.\n\
7162If tabsize is not given, a tab size of 8 characters is assumed.");
7163
7164static PyObject*
7165unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7166{
7167    Py_UNICODE *e;
7168    Py_UNICODE *p;
7169    Py_UNICODE *q;
7170    Py_UNICODE *qe;
7171    Py_ssize_t i, j, incr;
7172    PyUnicodeObject *u;
7173    int tabsize = 8;
7174
7175    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7176        return NULL;
7177
7178    /* First pass: determine size of output string */
7179    i = 0; /* chars up to and including most recent \n or \r */
7180    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7181    e = self->str + self->length; /* end of input */
7182    for (p = self->str; p < e; p++)
7183        if (*p == '\t') {
7184            if (tabsize > 0) {
7185                incr = tabsize - (j % tabsize); /* cannot overflow */
7186                if (j > PY_SSIZE_T_MAX - incr)
7187                    goto overflow1;
7188                j += incr;
7189            }
7190        }
7191        else {
7192            if (j > PY_SSIZE_T_MAX - 1)
7193                goto overflow1;
7194            j++;
7195            if (*p == '\n' || *p == '\r') {
7196                if (i > PY_SSIZE_T_MAX - j)
7197                    goto overflow1;
7198                i += j;
7199                j = 0;
7200            }
7201        }
7202
7203    if (i > PY_SSIZE_T_MAX - j)
7204        goto overflow1;
7205
7206    /* Second pass: create output string and fill it */
7207    u = _PyUnicode_New(i + j);
7208    if (!u)
7209        return NULL;
7210
7211    j = 0; /* same as in first pass */
7212    q = u->str; /* next output char */
7213    qe = u->str + u->length; /* end of output */
7214
7215    for (p = self->str; p < e; p++)
7216        if (*p == '\t') {
7217            if (tabsize > 0) {
7218                i = tabsize - (j % tabsize);
7219                j += i;
7220                while (i--) {
7221                    if (q >= qe)
7222                        goto overflow2;
7223                    *q++ = ' ';
7224                }
7225            }
7226        }
7227        else {
7228            if (q >= qe)
7229                goto overflow2;
7230            *q++ = *p;
7231            j++;
7232            if (*p == '\n' || *p == '\r')
7233                j = 0;
7234        }
7235
7236    return (PyObject*) u;
7237
7238  overflow2:
7239    Py_DECREF(u);
7240  overflow1:
7241    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7242    return NULL;
7243}
7244
7245PyDoc_STRVAR(find__doc__,
7246             "S.find(sub[, start[, end]]) -> int\n\
7247\n\
7248Return the lowest index in S where substring sub is found,\n\
7249such that sub is contained within s[start:end].  Optional\n\
7250arguments start and end are interpreted as in slice notation.\n\
7251\n\
7252Return -1 on failure.");
7253
7254static PyObject *
7255unicode_find(PyUnicodeObject *self, PyObject *args)
7256{
7257    PyObject *substring;
7258    Py_ssize_t start;
7259    Py_ssize_t end;
7260    Py_ssize_t result;
7261
7262    if (!_ParseTupleFinds(args, &substring, &start, &end))
7263        return NULL;
7264
7265    result = stringlib_find_slice(
7266        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7267        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7268        start, end
7269        );
7270
7271    Py_DECREF(substring);
7272
7273    return PyLong_FromSsize_t(result);
7274}
7275
7276static PyObject *
7277unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7278{
7279    if (index < 0 || index >= self->length) {
7280        PyErr_SetString(PyExc_IndexError, "string index out of range");
7281        return NULL;
7282    }
7283
7284    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7285}
7286
7287/* Believe it or not, this produces the same value for ASCII strings
7288   as string_hash(). */
7289static long
7290unicode_hash(PyUnicodeObject *self)
7291{
7292    Py_ssize_t len;
7293    Py_UNICODE *p;
7294    long x;
7295
7296    if (self->hash != -1)
7297        return self->hash;
7298    len = Py_SIZE(self);
7299    p = self->str;
7300    x = *p << 7;
7301    while (--len >= 0)
7302        x = (1000003*x) ^ *p++;
7303    x ^= Py_SIZE(self);
7304    if (x == -1)
7305        x = -2;
7306    self->hash = x;
7307    return x;
7308}
7309
7310PyDoc_STRVAR(index__doc__,
7311             "S.index(sub[, start[, end]]) -> int\n\
7312\n\
7313Like S.find() but raise ValueError when the substring is not found.");
7314
7315static PyObject *
7316unicode_index(PyUnicodeObject *self, PyObject *args)
7317{
7318    Py_ssize_t result;
7319    PyObject *substring;
7320    Py_ssize_t start;
7321    Py_ssize_t end;
7322
7323    if (!_ParseTupleFinds(args, &substring, &start, &end))
7324        return NULL;
7325
7326    result = stringlib_find_slice(
7327        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7328        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7329        start, end
7330        );
7331
7332    Py_DECREF(substring);
7333
7334    if (result < 0) {
7335        PyErr_SetString(PyExc_ValueError, "substring not found");
7336        return NULL;
7337    }
7338
7339    return PyLong_FromSsize_t(result);
7340}
7341
7342PyDoc_STRVAR(islower__doc__,
7343             "S.islower() -> bool\n\
7344\n\
7345Return True if all cased characters in S are lowercase and there is\n\
7346at least one cased character in S, False otherwise.");
7347
7348static PyObject*
7349unicode_islower(PyUnicodeObject *self)
7350{
7351    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7352    register const Py_UNICODE *e;
7353    int cased;
7354
7355    /* Shortcut for single character strings */
7356    if (PyUnicode_GET_SIZE(self) == 1)
7357        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7358
7359    /* Special case for empty strings */
7360    if (PyUnicode_GET_SIZE(self) == 0)
7361        return PyBool_FromLong(0);
7362
7363    e = p + PyUnicode_GET_SIZE(self);
7364    cased = 0;
7365    for (; p < e; p++) {
7366        register const Py_UNICODE ch = *p;
7367
7368        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7369            return PyBool_FromLong(0);
7370        else if (!cased && Py_UNICODE_ISLOWER(ch))
7371            cased = 1;
7372    }
7373    return PyBool_FromLong(cased);
7374}
7375
7376PyDoc_STRVAR(isupper__doc__,
7377             "S.isupper() -> bool\n\
7378\n\
7379Return True if all cased characters in S are uppercase and there is\n\
7380at least one cased character in S, False otherwise.");
7381
7382static PyObject*
7383unicode_isupper(PyUnicodeObject *self)
7384{
7385    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7386    register const Py_UNICODE *e;
7387    int cased;
7388
7389    /* Shortcut for single character strings */
7390    if (PyUnicode_GET_SIZE(self) == 1)
7391        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7392
7393    /* Special case for empty strings */
7394    if (PyUnicode_GET_SIZE(self) == 0)
7395        return PyBool_FromLong(0);
7396
7397    e = p + PyUnicode_GET_SIZE(self);
7398    cased = 0;
7399    for (; p < e; p++) {
7400        register const Py_UNICODE ch = *p;
7401
7402        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7403            return PyBool_FromLong(0);
7404        else if (!cased && Py_UNICODE_ISUPPER(ch))
7405            cased = 1;
7406    }
7407    return PyBool_FromLong(cased);
7408}
7409
7410PyDoc_STRVAR(istitle__doc__,
7411             "S.istitle() -> bool\n\
7412\n\
7413Return True if S is a titlecased string and there is at least one\n\
7414character in S, i.e. upper- and titlecase characters may only\n\
7415follow uncased characters and lowercase characters only cased ones.\n\
7416Return False otherwise.");
7417
7418static PyObject*
7419unicode_istitle(PyUnicodeObject *self)
7420{
7421    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7422    register const Py_UNICODE *e;
7423    int cased, previous_is_cased;
7424
7425    /* Shortcut for single character strings */
7426    if (PyUnicode_GET_SIZE(self) == 1)
7427        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7428                               (Py_UNICODE_ISUPPER(*p) != 0));
7429
7430    /* Special case for empty strings */
7431    if (PyUnicode_GET_SIZE(self) == 0)
7432        return PyBool_FromLong(0);
7433
7434    e = p + PyUnicode_GET_SIZE(self);
7435    cased = 0;
7436    previous_is_cased = 0;
7437    for (; p < e; p++) {
7438        register const Py_UNICODE ch = *p;
7439
7440        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7441            if (previous_is_cased)
7442                return PyBool_FromLong(0);
7443            previous_is_cased = 1;
7444            cased = 1;
7445        }
7446        else if (Py_UNICODE_ISLOWER(ch)) {
7447            if (!previous_is_cased)
7448                return PyBool_FromLong(0);
7449            previous_is_cased = 1;
7450            cased = 1;
7451        }
7452        else
7453            previous_is_cased = 0;
7454    }
7455    return PyBool_FromLong(cased);
7456}
7457
7458PyDoc_STRVAR(isspace__doc__,
7459             "S.isspace() -> bool\n\
7460\n\
7461Return True if all characters in S are whitespace\n\
7462and there is at least one character in S, False otherwise.");
7463
7464static PyObject*
7465unicode_isspace(PyUnicodeObject *self)
7466{
7467    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7468    register const Py_UNICODE *e;
7469
7470    /* Shortcut for single character strings */
7471    if (PyUnicode_GET_SIZE(self) == 1 &&
7472        Py_UNICODE_ISSPACE(*p))
7473        return PyBool_FromLong(1);
7474
7475    /* Special case for empty strings */
7476    if (PyUnicode_GET_SIZE(self) == 0)
7477        return PyBool_FromLong(0);
7478
7479    e = p + PyUnicode_GET_SIZE(self);
7480    for (; p < e; p++) {
7481        if (!Py_UNICODE_ISSPACE(*p))
7482            return PyBool_FromLong(0);
7483    }
7484    return PyBool_FromLong(1);
7485}
7486
7487PyDoc_STRVAR(isalpha__doc__,
7488             "S.isalpha() -> bool\n\
7489\n\
7490Return True if all characters in S are alphabetic\n\
7491and there is at least one character in S, False otherwise.");
7492
7493static PyObject*
7494unicode_isalpha(PyUnicodeObject *self)
7495{
7496    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7497    register const Py_UNICODE *e;
7498
7499    /* Shortcut for single character strings */
7500    if (PyUnicode_GET_SIZE(self) == 1 &&
7501        Py_UNICODE_ISALPHA(*p))
7502        return PyBool_FromLong(1);
7503
7504    /* Special case for empty strings */
7505    if (PyUnicode_GET_SIZE(self) == 0)
7506        return PyBool_FromLong(0);
7507
7508    e = p + PyUnicode_GET_SIZE(self);
7509    for (; p < e; p++) {
7510        if (!Py_UNICODE_ISALPHA(*p))
7511            return PyBool_FromLong(0);
7512    }
7513    return PyBool_FromLong(1);
7514}
7515
7516PyDoc_STRVAR(isalnum__doc__,
7517             "S.isalnum() -> bool\n\
7518\n\
7519Return True if all characters in S are alphanumeric\n\
7520and there is at least one character in S, False otherwise.");
7521
7522static PyObject*
7523unicode_isalnum(PyUnicodeObject *self)
7524{
7525    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7526    register const Py_UNICODE *e;
7527
7528    /* Shortcut for single character strings */
7529    if (PyUnicode_GET_SIZE(self) == 1 &&
7530        Py_UNICODE_ISALNUM(*p))
7531        return PyBool_FromLong(1);
7532
7533    /* Special case for empty strings */
7534    if (PyUnicode_GET_SIZE(self) == 0)
7535        return PyBool_FromLong(0);
7536
7537    e = p + PyUnicode_GET_SIZE(self);
7538    for (; p < e; p++) {
7539        if (!Py_UNICODE_ISALNUM(*p))
7540            return PyBool_FromLong(0);
7541    }
7542    return PyBool_FromLong(1);
7543}
7544
7545PyDoc_STRVAR(isdecimal__doc__,
7546             "S.isdecimal() -> bool\n\
7547\n\
7548Return True if there are only decimal characters in S,\n\
7549False otherwise.");
7550
7551static PyObject*
7552unicode_isdecimal(PyUnicodeObject *self)
7553{
7554    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7555    register const Py_UNICODE *e;
7556
7557    /* Shortcut for single character strings */
7558    if (PyUnicode_GET_SIZE(self) == 1 &&
7559        Py_UNICODE_ISDECIMAL(*p))
7560        return PyBool_FromLong(1);
7561
7562    /* Special case for empty strings */
7563    if (PyUnicode_GET_SIZE(self) == 0)
7564        return PyBool_FromLong(0);
7565
7566    e = p + PyUnicode_GET_SIZE(self);
7567    for (; p < e; p++) {
7568        if (!Py_UNICODE_ISDECIMAL(*p))
7569            return PyBool_FromLong(0);
7570    }
7571    return PyBool_FromLong(1);
7572}
7573
7574PyDoc_STRVAR(isdigit__doc__,
7575             "S.isdigit() -> bool\n\
7576\n\
7577Return True if all characters in S are digits\n\
7578and there is at least one character in S, False otherwise.");
7579
7580static PyObject*
7581unicode_isdigit(PyUnicodeObject *self)
7582{
7583    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7584    register const Py_UNICODE *e;
7585
7586    /* Shortcut for single character strings */
7587    if (PyUnicode_GET_SIZE(self) == 1 &&
7588        Py_UNICODE_ISDIGIT(*p))
7589        return PyBool_FromLong(1);
7590
7591    /* Special case for empty strings */
7592    if (PyUnicode_GET_SIZE(self) == 0)
7593        return PyBool_FromLong(0);
7594
7595    e = p + PyUnicode_GET_SIZE(self);
7596    for (; p < e; p++) {
7597        if (!Py_UNICODE_ISDIGIT(*p))
7598            return PyBool_FromLong(0);
7599    }
7600    return PyBool_FromLong(1);
7601}
7602
7603PyDoc_STRVAR(isnumeric__doc__,
7604             "S.isnumeric() -> bool\n\
7605\n\
7606Return True if there are only numeric characters in S,\n\
7607False otherwise.");
7608
7609static PyObject*
7610unicode_isnumeric(PyUnicodeObject *self)
7611{
7612    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7613    register const Py_UNICODE *e;
7614
7615    /* Shortcut for single character strings */
7616    if (PyUnicode_GET_SIZE(self) == 1 &&
7617        Py_UNICODE_ISNUMERIC(*p))
7618        return PyBool_FromLong(1);
7619
7620    /* Special case for empty strings */
7621    if (PyUnicode_GET_SIZE(self) == 0)
7622        return PyBool_FromLong(0);
7623
7624    e = p + PyUnicode_GET_SIZE(self);
7625    for (; p < e; p++) {
7626        if (!Py_UNICODE_ISNUMERIC(*p))
7627            return PyBool_FromLong(0);
7628    }
7629    return PyBool_FromLong(1);
7630}
7631
7632int
7633PyUnicode_IsIdentifier(PyObject *self)
7634{
7635    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7636    register const Py_UNICODE *e;
7637
7638    /* Special case for empty strings */
7639    if (PyUnicode_GET_SIZE(self) == 0)
7640        return 0;
7641
7642    /* PEP 3131 says that the first character must be in
7643       XID_Start and subsequent characters in XID_Continue,
7644       and for the ASCII range, the 2.x rules apply (i.e
7645       start with letters and underscore, continue with
7646       letters, digits, underscore). However, given the current
7647       definition of XID_Start and XID_Continue, it is sufficient
7648       to check just for these, except that _ must be allowed
7649       as starting an identifier.  */
7650    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7651        return 0;
7652
7653    e = p + PyUnicode_GET_SIZE(self);
7654    for (p++; p < e; p++) {
7655        if (!_PyUnicode_IsXidContinue(*p))
7656            return 0;
7657    }
7658    return 1;
7659}
7660
7661PyDoc_STRVAR(isidentifier__doc__,
7662             "S.isidentifier() -> bool\n\
7663\n\
7664Return True if S is a valid identifier according\n\
7665to the language definition.");
7666
7667static PyObject*
7668unicode_isidentifier(PyObject *self)
7669{
7670    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
7671}
7672
7673PyDoc_STRVAR(isprintable__doc__,
7674             "S.isprintable() -> bool\n\
7675\n\
7676Return True if all characters in S are considered\n\
7677printable in repr() or S is empty, False otherwise.");
7678
7679static PyObject*
7680unicode_isprintable(PyObject *self)
7681{
7682    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7683    register const Py_UNICODE *e;
7684
7685    /* Shortcut for single character strings */
7686    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
7687        Py_RETURN_TRUE;
7688    }
7689
7690    e = p + PyUnicode_GET_SIZE(self);
7691    for (; p < e; p++) {
7692        if (!Py_UNICODE_ISPRINTABLE(*p)) {
7693            Py_RETURN_FALSE;
7694        }
7695    }
7696    Py_RETURN_TRUE;
7697}
7698
7699PyDoc_STRVAR(join__doc__,
7700             "S.join(iterable) -> str\n\
7701\n\
7702Return a string which is the concatenation of the strings in the\n\
7703iterable.  The separator between elements is S.");
7704
7705static PyObject*
7706unicode_join(PyObject *self, PyObject *data)
7707{
7708    return PyUnicode_Join(self, data);
7709}
7710
7711static Py_ssize_t
7712unicode_length(PyUnicodeObject *self)
7713{
7714    return self->length;
7715}
7716
7717PyDoc_STRVAR(ljust__doc__,
7718             "S.ljust(width[, fillchar]) -> str\n\
7719\n\
7720Return S left-justified in a Unicode string of length width. Padding is\n\
7721done using the specified fill character (default is a space).");
7722
7723static PyObject *
7724unicode_ljust(PyUnicodeObject *self, PyObject *args)
7725{
7726    Py_ssize_t width;
7727    Py_UNICODE fillchar = ' ';
7728
7729    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
7730        return NULL;
7731
7732    if (self->length >= width && PyUnicode_CheckExact(self)) {
7733        Py_INCREF(self);
7734        return (PyObject*) self;
7735    }
7736
7737    return (PyObject*) pad(self, 0, width - self->length, fillchar);
7738}
7739
7740PyDoc_STRVAR(lower__doc__,
7741             "S.lower() -> str\n\
7742\n\
7743Return a copy of the string S converted to lowercase.");
7744
7745static PyObject*
7746unicode_lower(PyUnicodeObject *self)
7747{
7748    return fixup(self, fixlower);
7749}
7750
7751#define LEFTSTRIP 0
7752#define RIGHTSTRIP 1
7753#define BOTHSTRIP 2
7754
7755/* Arrays indexed by above */
7756static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};
7757
7758#define STRIPNAME(i) (stripformat[i]+3)
7759
7760/* externally visible for str.strip(unicode) */
7761PyObject *
7762_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
7763{
7764    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7765    Py_ssize_t len = PyUnicode_GET_SIZE(self);
7766    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
7767    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
7768    Py_ssize_t i, j;
7769
7770    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
7771
7772    i = 0;
7773    if (striptype != RIGHTSTRIP) {
7774        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
7775            i++;
7776        }
7777    }
7778
7779    j = len;
7780    if (striptype != LEFTSTRIP) {
7781        do {
7782            j--;
7783        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
7784        j++;
7785    }
7786
7787    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7788        Py_INCREF(self);
7789        return (PyObject*)self;
7790    }
7791    else
7792        return PyUnicode_FromUnicode(s+i, j-i);
7793}
7794
7795
7796static PyObject *
7797do_strip(PyUnicodeObject *self, int striptype)
7798{
7799    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
7800    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
7801
7802    i = 0;
7803    if (striptype != RIGHTSTRIP) {
7804        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
7805            i++;
7806        }
7807    }
7808
7809    j = len;
7810    if (striptype != LEFTSTRIP) {
7811        do {
7812            j--;
7813        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
7814        j++;
7815    }
7816
7817    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
7818        Py_INCREF(self);
7819        return (PyObject*)self;
7820    }
7821    else
7822        return PyUnicode_FromUnicode(s+i, j-i);
7823}
7824
7825
7826static PyObject *
7827do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
7828{
7829    PyObject *sep = NULL;
7830
7831    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
7832        return NULL;
7833
7834    if (sep != NULL && sep != Py_None) {
7835        if (PyUnicode_Check(sep))
7836            return _PyUnicode_XStrip(self, striptype, sep);
7837        else {
7838            PyErr_Format(PyExc_TypeError,
7839                         "%s arg must be None or str",
7840                         STRIPNAME(striptype));
7841            return NULL;
7842        }
7843    }
7844
7845    return do_strip(self, striptype);
7846}
7847
7848
7849PyDoc_STRVAR(strip__doc__,
7850             "S.strip([chars]) -> str\n\
7851\n\
7852Return a copy of the string S with leading and trailing\n\
7853whitespace removed.\n\
7854If chars is given and not None, remove characters in chars instead.");
7855
7856static PyObject *
7857unicode_strip(PyUnicodeObject *self, PyObject *args)
7858{
7859    if (PyTuple_GET_SIZE(args) == 0)
7860        return do_strip(self, BOTHSTRIP); /* Common case */
7861    else
7862        return do_argstrip(self, BOTHSTRIP, args);
7863}
7864
7865
7866PyDoc_STRVAR(lstrip__doc__,
7867             "S.lstrip([chars]) -> str\n\
7868\n\
7869Return a copy of the string S with leading whitespace removed.\n\
7870If chars is given and not None, remove characters in chars instead.");
7871
7872static PyObject *
7873unicode_lstrip(PyUnicodeObject *self, PyObject *args)
7874{
7875    if (PyTuple_GET_SIZE(args) == 0)
7876        return do_strip(self, LEFTSTRIP); /* Common case */
7877    else
7878        return do_argstrip(self, LEFTSTRIP, args);
7879}
7880
7881
7882PyDoc_STRVAR(rstrip__doc__,
7883             "S.rstrip([chars]) -> str\n\
7884\n\
7885Return a copy of the string S with trailing whitespace removed.\n\
7886If chars is given and not None, remove characters in chars instead.");
7887
7888static PyObject *
7889unicode_rstrip(PyUnicodeObject *self, PyObject *args)
7890{
7891    if (PyTuple_GET_SIZE(args) == 0)
7892        return do_strip(self, RIGHTSTRIP); /* Common case */
7893    else
7894        return do_argstrip(self, RIGHTSTRIP, args);
7895}
7896
7897
7898static PyObject*
7899unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
7900{
7901    PyUnicodeObject *u;
7902    Py_UNICODE *p;
7903    Py_ssize_t nchars;
7904    size_t nbytes;
7905
7906    if (len < 1) {
7907        Py_INCREF(unicode_empty);
7908        return (PyObject *)unicode_empty;
7909    }
7910
7911    if (len == 1 && PyUnicode_CheckExact(str)) {
7912        /* no repeat, return original string */
7913        Py_INCREF(str);
7914        return (PyObject*) str;
7915    }
7916
7917    /* ensure # of chars needed doesn't overflow int and # of bytes
7918     * needed doesn't overflow size_t
7919     */
7920    nchars = len * str->length;
7921    if (nchars / len != str->length) {
7922        PyErr_SetString(PyExc_OverflowError,
7923                        "repeated string is too long");
7924        return NULL;
7925    }
7926    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
7927    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
7928        PyErr_SetString(PyExc_OverflowError,
7929                        "repeated string is too long");
7930        return NULL;
7931    }
7932    u = _PyUnicode_New(nchars);
7933    if (!u)
7934        return NULL;
7935
7936    p = u->str;
7937
7938    if (str->length == 1) {
7939        Py_UNICODE_FILL(p, str->str[0], len);
7940    } else {
7941        Py_ssize_t done = str->length; /* number of characters copied this far */
7942        Py_UNICODE_COPY(p, str->str, str->length);
7943        while (done < nchars) {
7944            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
7945            Py_UNICODE_COPY(p+done, p, n);
7946            done += n;
7947        }
7948    }
7949
7950    return (PyObject*) u;
7951}
7952
7953PyObject *PyUnicode_Replace(PyObject *obj,
7954                            PyObject *subobj,
7955                            PyObject *replobj,
7956                            Py_ssize_t maxcount)
7957{
7958    PyObject *self;
7959    PyObject *str1;
7960    PyObject *str2;
7961    PyObject *result;
7962
7963    self = PyUnicode_FromObject(obj);
7964    if (self == NULL)
7965        return NULL;
7966    str1 = PyUnicode_FromObject(subobj);
7967    if (str1 == NULL) {
7968        Py_DECREF(self);
7969        return NULL;
7970    }
7971    str2 = PyUnicode_FromObject(replobj);
7972    if (str2 == NULL) {
7973        Py_DECREF(self);
7974        Py_DECREF(str1);
7975        return NULL;
7976    }
7977    result = replace((PyUnicodeObject *)self,
7978                     (PyUnicodeObject *)str1,
7979                     (PyUnicodeObject *)str2,
7980                     maxcount);
7981    Py_DECREF(self);
7982    Py_DECREF(str1);
7983    Py_DECREF(str2);
7984    return result;
7985}
7986
7987PyDoc_STRVAR(replace__doc__,
7988             "S.replace(old, new[, count]) -> str\n\
7989\n\
7990Return a copy of S with all occurrences of substring\n\
7991old replaced by new.  If the optional argument count is\n\
7992given, only the first count occurrences are replaced.");
7993
7994static PyObject*
7995unicode_replace(PyUnicodeObject *self, PyObject *args)
7996{
7997    PyUnicodeObject *str1;
7998    PyUnicodeObject *str2;
7999    Py_ssize_t maxcount = -1;
8000    PyObject *result;
8001
8002    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8003        return NULL;
8004    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8005    if (str1 == NULL)
8006        return NULL;
8007    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8008    if (str2 == NULL) {
8009        Py_DECREF(str1);
8010        return NULL;
8011    }
8012
8013    result = replace(self, str1, str2, maxcount);
8014
8015    Py_DECREF(str1);
8016    Py_DECREF(str2);
8017    return result;
8018}
8019
8020static
8021PyObject *unicode_repr(PyObject *unicode)
8022{
8023    PyObject *repr;
8024    Py_UNICODE *p;
8025    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8026    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8027
8028    /* XXX(nnorwitz): rather than over-allocating, it would be
8029       better to choose a different scheme.  Perhaps scan the
8030       first N-chars of the string and allocate based on that size.
8031    */
8032    /* Initial allocation is based on the longest-possible unichr
8033       escape.
8034
8035       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8036       unichr, so in this case it's the longest unichr escape. In
8037       narrow (UTF-16) builds this is five chars per source unichr
8038       since there are two unichrs in the surrogate pair, so in narrow
8039       (UTF-16) builds it's not the longest unichr escape.
8040
8041       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8042       so in the narrow (UTF-16) build case it's the longest unichr
8043       escape.
8044    */
8045
8046    repr = PyUnicode_FromUnicode(NULL,
8047                                 2 /* quotes */
8048#ifdef Py_UNICODE_WIDE
8049                                 + 10*size
8050#else
8051                                 + 6*size
8052#endif
8053                                 + 1);
8054    if (repr == NULL)
8055        return NULL;
8056
8057    p = PyUnicode_AS_UNICODE(repr);
8058
8059    /* Add quote */
8060    *p++ = (findchar(s, size, '\'') &&
8061            !findchar(s, size, '"')) ? '"' : '\'';
8062    while (size-- > 0) {
8063        Py_UNICODE ch = *s++;
8064
8065        /* Escape quotes and backslashes */
8066        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8067            *p++ = '\\';
8068            *p++ = ch;
8069            continue;
8070        }
8071
8072        /* Map special whitespace to '\t', \n', '\r' */
8073        if (ch == '\t') {
8074            *p++ = '\\';
8075            *p++ = 't';
8076        }
8077        else if (ch == '\n') {
8078            *p++ = '\\';
8079            *p++ = 'n';
8080        }
8081        else if (ch == '\r') {
8082            *p++ = '\\';
8083            *p++ = 'r';
8084        }
8085
8086        /* Map non-printable US ASCII to '\xhh' */
8087        else if (ch < ' ' || ch == 0x7F) {
8088            *p++ = '\\';
8089            *p++ = 'x';
8090            *p++ = hexdigits[(ch >> 4) & 0x000F];
8091            *p++ = hexdigits[ch & 0x000F];
8092        }
8093
8094        /* Copy ASCII characters as-is */
8095        else if (ch < 0x7F) {
8096            *p++ = ch;
8097        }
8098
8099        /* Non-ASCII characters */
8100        else {
8101            Py_UCS4 ucs = ch;
8102
8103#ifndef Py_UNICODE_WIDE
8104            Py_UNICODE ch2 = 0;
8105            /* Get code point from surrogate pair */
8106            if (size > 0) {
8107                ch2 = *s;
8108                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8109                    && ch2 <= 0xDFFF) {
8110                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8111                        + 0x00010000;
8112                    s++;
8113                    size--;
8114                }
8115            }
8116#endif
8117            /* Map Unicode whitespace and control characters
8118               (categories Z* and C* except ASCII space)
8119            */
8120            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8121                /* Map 8-bit characters to '\xhh' */
8122                if (ucs <= 0xff) {
8123                    *p++ = '\\';
8124                    *p++ = 'x';
8125                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8126                    *p++ = hexdigits[ch & 0x000F];
8127                }
8128                /* Map 21-bit characters to '\U00xxxxxx' */
8129                else if (ucs >= 0x10000) {
8130                    *p++ = '\\';
8131                    *p++ = 'U';
8132                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8133                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8134                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8135                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8136                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8137                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8138                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8139                    *p++ = hexdigits[ucs & 0x0000000F];
8140                }
8141                /* Map 16-bit characters to '\uxxxx' */
8142                else {
8143                    *p++ = '\\';
8144                    *p++ = 'u';
8145                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8146                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8147                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8148                    *p++ = hexdigits[ucs & 0x000F];
8149                }
8150            }
8151            /* Copy characters as-is */
8152            else {
8153                *p++ = ch;
8154#ifndef Py_UNICODE_WIDE
8155                if (ucs >= 0x10000)
8156                    *p++ = ch2;
8157#endif
8158            }
8159        }
8160    }
8161    /* Add quote */
8162    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8163
8164    *p = '\0';
8165    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8166    return repr;
8167}
8168
8169PyDoc_STRVAR(rfind__doc__,
8170             "S.rfind(sub[, start[, end]]) -> int\n\
8171\n\
8172Return the highest index in S where substring sub is found,\n\
8173such that sub is contained within s[start:end].  Optional\n\
8174arguments start and end are interpreted as in slice notation.\n\
8175\n\
8176Return -1 on failure.");
8177
8178static PyObject *
8179unicode_rfind(PyUnicodeObject *self, PyObject *args)
8180{
8181    PyObject *substring;
8182    Py_ssize_t start;
8183    Py_ssize_t end;
8184    Py_ssize_t result;
8185
8186    if (!_ParseTupleFinds(args, &substring, &start, &end))
8187        return NULL;
8188
8189    result = stringlib_rfind_slice(
8190        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8191        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8192        start, end
8193        );
8194
8195    Py_DECREF(substring);
8196
8197    return PyLong_FromSsize_t(result);
8198}
8199
8200PyDoc_STRVAR(rindex__doc__,
8201             "S.rindex(sub[, start[, end]]) -> int\n\
8202\n\
8203Like S.rfind() but raise ValueError when the substring is not found.");
8204
8205static PyObject *
8206unicode_rindex(PyUnicodeObject *self, PyObject *args)
8207{
8208    PyObject *substring;
8209    Py_ssize_t start;
8210    Py_ssize_t end;
8211    Py_ssize_t result;
8212
8213    if (!_ParseTupleFinds(args, &substring, &start, &end))
8214        return NULL;
8215
8216    result = stringlib_rfind_slice(
8217        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8218        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8219        start, end
8220        );
8221
8222    Py_DECREF(substring);
8223
8224    if (result < 0) {
8225        PyErr_SetString(PyExc_ValueError, "substring not found");
8226        return NULL;
8227    }
8228    return PyLong_FromSsize_t(result);
8229}
8230
8231PyDoc_STRVAR(rjust__doc__,
8232             "S.rjust(width[, fillchar]) -> str\n\
8233\n\
8234Return S right-justified in a string of length width. Padding is\n\
8235done using the specified fill character (default is a space).");
8236
8237static PyObject *
8238unicode_rjust(PyUnicodeObject *self, PyObject *args)
8239{
8240    Py_ssize_t width;
8241    Py_UNICODE fillchar = ' ';
8242
8243    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8244        return NULL;
8245
8246    if (self->length >= width && PyUnicode_CheckExact(self)) {
8247        Py_INCREF(self);
8248        return (PyObject*) self;
8249    }
8250
8251    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8252}
8253
8254PyObject *PyUnicode_Split(PyObject *s,
8255                          PyObject *sep,
8256                          Py_ssize_t maxsplit)
8257{
8258    PyObject *result;
8259
8260    s = PyUnicode_FromObject(s);
8261    if (s == NULL)
8262        return NULL;
8263    if (sep != NULL) {
8264        sep = PyUnicode_FromObject(sep);
8265        if (sep == NULL) {
8266            Py_DECREF(s);
8267            return NULL;
8268        }
8269    }
8270
8271    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8272
8273    Py_DECREF(s);
8274    Py_XDECREF(sep);
8275    return result;
8276}
8277
8278PyDoc_STRVAR(split__doc__,
8279             "S.split([sep[, maxsplit]]) -> list of strings\n\
8280\n\
8281Return a list of the words in S, using sep as the\n\
8282delimiter string.  If maxsplit is given, at most maxsplit\n\
8283splits are done. If sep is not specified or is None, any\n\
8284whitespace string is a separator and empty strings are\n\
8285removed from the result.");
8286
8287static PyObject*
8288unicode_split(PyUnicodeObject *self, PyObject *args)
8289{
8290    PyObject *substring = Py_None;
8291    Py_ssize_t maxcount = -1;
8292
8293    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8294        return NULL;
8295
8296    if (substring == Py_None)
8297        return split(self, NULL, maxcount);
8298    else if (PyUnicode_Check(substring))
8299        return split(self, (PyUnicodeObject *)substring, maxcount);
8300    else
8301        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8302}
8303
8304PyObject *
8305PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8306{
8307    PyObject* str_obj;
8308    PyObject* sep_obj;
8309    PyObject* out;
8310
8311    str_obj = PyUnicode_FromObject(str_in);
8312    if (!str_obj)
8313        return NULL;
8314    sep_obj = PyUnicode_FromObject(sep_in);
8315    if (!sep_obj) {
8316        Py_DECREF(str_obj);
8317        return NULL;
8318    }
8319
8320    out = stringlib_partition(
8321        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8322        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8323        );
8324
8325    Py_DECREF(sep_obj);
8326    Py_DECREF(str_obj);
8327
8328    return out;
8329}
8330
8331
8332PyObject *
8333PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8334{
8335    PyObject* str_obj;
8336    PyObject* sep_obj;
8337    PyObject* out;
8338
8339    str_obj = PyUnicode_FromObject(str_in);
8340    if (!str_obj)
8341        return NULL;
8342    sep_obj = PyUnicode_FromObject(sep_in);
8343    if (!sep_obj) {
8344        Py_DECREF(str_obj);
8345        return NULL;
8346    }
8347
8348    out = stringlib_rpartition(
8349        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8350        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8351        );
8352
8353    Py_DECREF(sep_obj);
8354    Py_DECREF(str_obj);
8355
8356    return out;
8357}
8358
8359PyDoc_STRVAR(partition__doc__,
8360             "S.partition(sep) -> (head, sep, tail)\n\
8361\n\
8362Search for the separator sep in S, and return the part before it,\n\
8363the separator itself, and the part after it.  If the separator is not\n\
8364found, return S and two empty strings.");
8365
8366static PyObject*
8367unicode_partition(PyUnicodeObject *self, PyObject *separator)
8368{
8369    return PyUnicode_Partition((PyObject *)self, separator);
8370}
8371
8372PyDoc_STRVAR(rpartition__doc__,
8373             "S.rpartition(sep) -> (head, sep, tail)\n\
8374\n\
8375Search for the separator sep in S, starting at the end of S, and return\n\
8376the part before it, the separator itself, and the part after it.  If the\n\
8377separator is not found, return two empty strings and S.");
8378
8379static PyObject*
8380unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8381{
8382    return PyUnicode_RPartition((PyObject *)self, separator);
8383}
8384
8385PyObject *PyUnicode_RSplit(PyObject *s,
8386                           PyObject *sep,
8387                           Py_ssize_t maxsplit)
8388{
8389    PyObject *result;
8390
8391    s = PyUnicode_FromObject(s);
8392    if (s == NULL)
8393        return NULL;
8394    if (sep != NULL) {
8395        sep = PyUnicode_FromObject(sep);
8396        if (sep == NULL) {
8397            Py_DECREF(s);
8398            return NULL;
8399        }
8400    }
8401
8402    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8403
8404    Py_DECREF(s);
8405    Py_XDECREF(sep);
8406    return result;
8407}
8408
8409PyDoc_STRVAR(rsplit__doc__,
8410             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8411\n\
8412Return a list of the words in S, using sep as the\n\
8413delimiter string, starting at the end of the string and\n\
8414working to the front.  If maxsplit is given, at most maxsplit\n\
8415splits are done. If sep is not specified, any whitespace string\n\
8416is a separator.");
8417
8418static PyObject*
8419unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8420{
8421    PyObject *substring = Py_None;
8422    Py_ssize_t maxcount = -1;
8423
8424    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8425        return NULL;
8426
8427    if (substring == Py_None)
8428        return rsplit(self, NULL, maxcount);
8429    else if (PyUnicode_Check(substring))
8430        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8431    else
8432        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8433}
8434
8435PyDoc_STRVAR(splitlines__doc__,
8436             "S.splitlines([keepends]) -> list of strings\n\
8437\n\
8438Return a list of the lines in S, breaking at line boundaries.\n\
8439Line breaks are not included in the resulting list unless keepends\n\
8440is given and true.");
8441
8442static PyObject*
8443unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8444{
8445    int keepends = 0;
8446
8447    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8448        return NULL;
8449
8450    return PyUnicode_Splitlines((PyObject *)self, keepends);
8451}
8452
8453static
8454PyObject *unicode_str(PyObject *self)
8455{
8456    if (PyUnicode_CheckExact(self)) {
8457        Py_INCREF(self);
8458        return self;
8459    } else
8460        /* Subtype -- return genuine unicode string with the same value. */
8461        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8462                                     PyUnicode_GET_SIZE(self));
8463}
8464
8465PyDoc_STRVAR(swapcase__doc__,
8466             "S.swapcase() -> str\n\
8467\n\
8468Return a copy of S with uppercase characters converted to lowercase\n\
8469and vice versa.");
8470
8471static PyObject*
8472unicode_swapcase(PyUnicodeObject *self)
8473{
8474    return fixup(self, fixswapcase);
8475}
8476
8477PyDoc_STRVAR(maketrans__doc__,
8478             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8479\n\
8480Return a translation table usable for str.translate().\n\
8481If there is only one argument, it must be a dictionary mapping Unicode\n\
8482ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8483Character keys will be then converted to ordinals.\n\
8484If there are two arguments, they must be strings of equal length, and\n\
8485in the resulting dictionary, each character in x will be mapped to the\n\
8486character at the same position in y. If there is a third argument, it\n\
8487must be a string, whose characters will be mapped to None in the result.");
8488
8489static PyObject*
8490unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8491{
8492    PyObject *x, *y = NULL, *z = NULL;
8493    PyObject *new = NULL, *key, *value;
8494    Py_ssize_t i = 0;
8495    int res;
8496
8497    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8498        return NULL;
8499    new = PyDict_New();
8500    if (!new)
8501        return NULL;
8502    if (y != NULL) {
8503        /* x must be a string too, of equal length */
8504        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8505        if (!PyUnicode_Check(x)) {
8506            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8507                            "be a string if there is a second argument");
8508            goto err;
8509        }
8510        if (PyUnicode_GET_SIZE(x) != ylen) {
8511            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8512                            "arguments must have equal length");
8513            goto err;
8514        }
8515        /* create entries for translating chars in x to those in y */
8516        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8517            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8518            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8519            if (!key || !value)
8520                goto err;
8521            res = PyDict_SetItem(new, key, value);
8522            Py_DECREF(key);
8523            Py_DECREF(value);
8524            if (res < 0)
8525                goto err;
8526        }
8527        /* create entries for deleting chars in z */
8528        if (z != NULL) {
8529            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8530                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8531                if (!key)
8532                    goto err;
8533                res = PyDict_SetItem(new, key, Py_None);
8534                Py_DECREF(key);
8535                if (res < 0)
8536                    goto err;
8537            }
8538        }
8539    } else {
8540        /* x must be a dict */
8541        if (!PyDict_CheckExact(x)) {
8542            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8543                            "to maketrans it must be a dict");
8544            goto err;
8545        }
8546        /* copy entries into the new dict, converting string keys to int keys */
8547        while (PyDict_Next(x, &i, &key, &value)) {
8548            if (PyUnicode_Check(key)) {
8549                /* convert string keys to integer keys */
8550                PyObject *newkey;
8551                if (PyUnicode_GET_SIZE(key) != 1) {
8552                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8553                                    "table must be of length 1");
8554                    goto err;
8555                }
8556                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8557                if (!newkey)
8558                    goto err;
8559                res = PyDict_SetItem(new, newkey, value);
8560                Py_DECREF(newkey);
8561                if (res < 0)
8562                    goto err;
8563            } else if (PyLong_Check(key)) {
8564                /* just keep integer keys */
8565                if (PyDict_SetItem(new, key, value) < 0)
8566                    goto err;
8567            } else {
8568                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8569                                "be strings or integers");
8570                goto err;
8571            }
8572        }
8573    }
8574    return new;
8575  err:
8576    Py_DECREF(new);
8577    return NULL;
8578}
8579
8580PyDoc_STRVAR(translate__doc__,
8581             "S.translate(table) -> str\n\
8582\n\
8583Return a copy of the string S, where all characters have been mapped\n\
8584through the given translation table, which must be a mapping of\n\
8585Unicode ordinals to Unicode ordinals, strings, or None.\n\
8586Unmapped characters are left untouched. Characters mapped to None\n\
8587are deleted.");
8588
8589static PyObject*
8590unicode_translate(PyUnicodeObject *self, PyObject *table)
8591{
8592    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8593}
8594
8595PyDoc_STRVAR(upper__doc__,
8596             "S.upper() -> str\n\
8597\n\
8598Return a copy of S converted to uppercase.");
8599
8600static PyObject*
8601unicode_upper(PyUnicodeObject *self)
8602{
8603    return fixup(self, fixupper);
8604}
8605
8606PyDoc_STRVAR(zfill__doc__,
8607             "S.zfill(width) -> str\n\
8608\n\
8609Pad a numeric string S with zeros on the left, to fill a field\n\
8610of the specified width. The string S is never truncated.");
8611
8612static PyObject *
8613unicode_zfill(PyUnicodeObject *self, PyObject *args)
8614{
8615    Py_ssize_t fill;
8616    PyUnicodeObject *u;
8617
8618    Py_ssize_t width;
8619    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8620        return NULL;
8621
8622    if (self->length >= width) {
8623        if (PyUnicode_CheckExact(self)) {
8624            Py_INCREF(self);
8625            return (PyObject*) self;
8626        }
8627        else
8628            return PyUnicode_FromUnicode(
8629                PyUnicode_AS_UNICODE(self),
8630                PyUnicode_GET_SIZE(self)
8631                );
8632    }
8633
8634    fill = width - self->length;
8635
8636    u = pad(self, fill, 0, '0');
8637
8638    if (u == NULL)
8639        return NULL;
8640
8641    if (u->str[fill] == '+' || u->str[fill] == '-') {
8642        /* move sign to beginning of string */
8643        u->str[0] = u->str[fill];
8644        u->str[fill] = '0';
8645    }
8646
8647    return (PyObject*) u;
8648}
8649
#if 0
/* Disabled debugging helper: reports the value of numfree as a Python
   int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    PyObject *count = PyLong_FromLong(numfree);

    return count;
}
#endif
8657
8658PyDoc_STRVAR(startswith__doc__,
8659             "S.startswith(prefix[, start[, end]]) -> bool\n\
8660\n\
8661Return True if S starts with the specified prefix, False otherwise.\n\
8662With optional start, test S beginning at that position.\n\
8663With optional end, stop comparing S at that position.\n\
8664prefix can also be a tuple of strings to try.");
8665
8666static PyObject *
8667unicode_startswith(PyUnicodeObject *self,
8668                   PyObject *args)
8669{
8670    PyObject *subobj;
8671    PyUnicodeObject *substring;
8672    Py_ssize_t start = 0;
8673    Py_ssize_t end = PY_SSIZE_T_MAX;
8674    int result;
8675
8676    if (!PyArg_ParseTuple(args, "O|O&O&:startswith", &subobj,
8677                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8678        return NULL;
8679    if (PyTuple_Check(subobj)) {
8680        Py_ssize_t i;
8681        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8682            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8683                PyTuple_GET_ITEM(subobj, i));
8684            if (substring == NULL)
8685                return NULL;
8686            result = tailmatch(self, substring, start, end, -1);
8687            Py_DECREF(substring);
8688            if (result) {
8689                Py_RETURN_TRUE;
8690            }
8691        }
8692        /* nothing matched */
8693        Py_RETURN_FALSE;
8694    }
8695    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8696    if (substring == NULL)
8697        return NULL;
8698    result = tailmatch(self, substring, start, end, -1);
8699    Py_DECREF(substring);
8700    return PyBool_FromLong(result);
8701}
8702
8703
8704PyDoc_STRVAR(endswith__doc__,
8705             "S.endswith(suffix[, start[, end]]) -> bool\n\
8706\n\
8707Return True if S ends with the specified suffix, False otherwise.\n\
8708With optional start, test S beginning at that position.\n\
8709With optional end, stop comparing S at that position.\n\
8710suffix can also be a tuple of strings to try.");
8711
8712static PyObject *
8713unicode_endswith(PyUnicodeObject *self,
8714                 PyObject *args)
8715{
8716    PyObject *subobj;
8717    PyUnicodeObject *substring;
8718    Py_ssize_t start = 0;
8719    Py_ssize_t end = PY_SSIZE_T_MAX;
8720    int result;
8721
8722    if (!PyArg_ParseTuple(args, "O|O&O&:endswith", &subobj,
8723                          _PyEval_SliceIndex, &start, _PyEval_SliceIndex, &end))
8724        return NULL;
8725    if (PyTuple_Check(subobj)) {
8726        Py_ssize_t i;
8727        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
8728            substring = (PyUnicodeObject *)PyUnicode_FromObject(
8729                PyTuple_GET_ITEM(subobj, i));
8730            if (substring == NULL)
8731                return NULL;
8732            result = tailmatch(self, substring, start, end, +1);
8733            Py_DECREF(substring);
8734            if (result) {
8735                Py_RETURN_TRUE;
8736            }
8737        }
8738        Py_RETURN_FALSE;
8739    }
8740    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
8741    if (substring == NULL)
8742        return NULL;
8743
8744    result = tailmatch(self, substring, start, end, +1);
8745    Py_DECREF(substring);
8746    return PyBool_FromLong(result);
8747}
8748
8749#include "stringlib/string_format.h"
8750
8751PyDoc_STRVAR(format__doc__,
8752             "S.format(*args, **kwargs) -> str\n\
8753\n\
8754");
8755
8756static PyObject *
8757unicode__format__(PyObject* self, PyObject* args)
8758{
8759    PyObject *format_spec;
8760
8761    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
8762        return NULL;
8763
8764    return _PyUnicode_FormatAdvanced(self,
8765                                     PyUnicode_AS_UNICODE(format_spec),
8766                                     PyUnicode_GET_SIZE(format_spec));
8767}
8768
8769PyDoc_STRVAR(p_format__doc__,
8770             "S.__format__(format_spec) -> str\n\
8771\n\
8772");
8773
8774static PyObject *
8775unicode__sizeof__(PyUnicodeObject *v)
8776{
8777    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
8778                              sizeof(Py_UNICODE) * (v->length + 1));
8779}
8780
8781PyDoc_STRVAR(sizeof__doc__,
8782             "S.__sizeof__() -> size of S in memory, in bytes");
8783
8784static PyObject *
8785unicode_getnewargs(PyUnicodeObject *v)
8786{
8787    return Py_BuildValue("(u#)", v->str, v->length);
8788}
8789
8790
8791static PyMethodDef unicode_methods[] = {
8792
8793    /* Order is according to common usage: often used methods should
8794       appear first, since lookup is done sequentially. */
8795
8796    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
8797    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
8798    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
8799    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
8800    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
8801    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
8802    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
8803    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
8804    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
8805    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
8806    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
8807    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
8808    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
8809    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
8810    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
8811    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
8812    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
8813    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
8814    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
8815    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
8816    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
8817    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
8818    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
8819    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
8820    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
8821    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
8822    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
8823    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
8824    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
8825    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
8826    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
8827    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
8828    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
8829    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
8830    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
8831    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
8832    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
8833    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
8834    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
8835    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
8836    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
8837    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
8838    {"_formatter_field_name_split", (PyCFunction) formatter_field_name_split, METH_NOARGS},
8839    {"_formatter_parser", (PyCFunction) formatter_parser, METH_NOARGS},
8840    {"maketrans", (PyCFunction) unicode_maketrans,
8841     METH_VARARGS | METH_STATIC, maketrans__doc__},
8842    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
8843#if 0
8844    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
8845#endif
8846
8847#if 0
8848    /* This one is just used for debugging the implementation. */
8849    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
8850#endif
8851
8852    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
8853    {NULL, NULL}
8854};
8855
8856static PyObject *
8857unicode_mod(PyObject *v, PyObject *w)
8858{
8859    if (!PyUnicode_Check(v)) {
8860        Py_INCREF(Py_NotImplemented);
8861        return Py_NotImplemented;
8862    }
8863    return PyUnicode_Format(v, w);
8864}
8865
8866static PyNumberMethods unicode_as_number = {
8867    0,              /*nb_add*/
8868    0,              /*nb_subtract*/
8869    0,              /*nb_multiply*/
8870    unicode_mod,            /*nb_remainder*/
8871};
8872
8873static PySequenceMethods unicode_as_sequence = {
8874    (lenfunc) unicode_length,       /* sq_length */
8875    PyUnicode_Concat,           /* sq_concat */
8876    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
8877    (ssizeargfunc) unicode_getitem,     /* sq_item */
8878    0,                  /* sq_slice */
8879    0,                  /* sq_ass_item */
8880    0,                  /* sq_ass_slice */
8881    PyUnicode_Contains,         /* sq_contains */
8882};
8883
8884static PyObject*
8885unicode_subscript(PyUnicodeObject* self, PyObject* item)
8886{
8887    if (PyIndex_Check(item)) {
8888        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
8889        if (i == -1 && PyErr_Occurred())
8890            return NULL;
8891        if (i < 0)
8892            i += PyUnicode_GET_SIZE(self);
8893        return unicode_getitem(self, i);
8894    } else if (PySlice_Check(item)) {
8895        Py_ssize_t start, stop, step, slicelength, cur, i;
8896        Py_UNICODE* source_buf;
8897        Py_UNICODE* result_buf;
8898        PyObject* result;
8899
8900        if (PySlice_GetIndicesEx((PySliceObject*)item, PyUnicode_GET_SIZE(self),
8901                                 &start, &stop, &step, &slicelength) < 0) {
8902            return NULL;
8903        }
8904
8905        if (slicelength <= 0) {
8906            return PyUnicode_FromUnicode(NULL, 0);
8907        } else if (start == 0 && step == 1 && slicelength == self->length &&
8908                   PyUnicode_CheckExact(self)) {
8909            Py_INCREF(self);
8910            return (PyObject *)self;
8911        } else if (step == 1) {
8912            return PyUnicode_FromUnicode(self->str + start, slicelength);
8913        } else {
8914            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
8915            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
8916                                                       sizeof(Py_UNICODE));
8917
8918            if (result_buf == NULL)
8919                return PyErr_NoMemory();
8920
8921            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
8922                result_buf[i] = source_buf[cur];
8923            }
8924
8925            result = PyUnicode_FromUnicode(result_buf, slicelength);
8926            PyObject_FREE(result_buf);
8927            return result;
8928        }
8929    } else {
8930        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
8931        return NULL;
8932    }
8933}
8934
8935static PyMappingMethods unicode_as_mapping = {
8936    (lenfunc)unicode_length,        /* mp_length */
8937    (binaryfunc)unicode_subscript,  /* mp_subscript */
8938    (objobjargproc)0,           /* mp_ass_subscript */
8939};
8940
8941
8942/* Helpers for PyUnicode_Format() */
8943
8944static PyObject *
8945getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
8946{
8947    Py_ssize_t argidx = *p_argidx;
8948    if (argidx < arglen) {
8949        (*p_argidx)++;
8950        if (arglen < 0)
8951            return args;
8952        else
8953            return PyTuple_GetItem(args, argidx);
8954    }
8955    PyErr_SetString(PyExc_TypeError,
8956                    "not enough arguments for format string");
8957    return NULL;
8958}
8959
8960/* Returns a new reference to a PyUnicode object, or NULL on failure. */
8961
8962static PyObject *
8963formatfloat(PyObject *v, int flags, int prec, int type)
8964{
8965    char *p;
8966    PyObject *result;
8967    double x;
8968
8969    x = PyFloat_AsDouble(v);
8970    if (x == -1.0 && PyErr_Occurred())
8971        return NULL;
8972
8973    if (prec < 0)
8974        prec = 6;
8975
8976    p = PyOS_double_to_string(x, type, prec,
8977                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
8978    if (p == NULL)
8979        return NULL;
8980    result = PyUnicode_FromStringAndSize(p, strlen(p));
8981    PyMem_Free(p);
8982    return result;
8983}
8984
8985static PyObject*
8986formatlong(PyObject *val, int flags, int prec, int type)
8987{
8988    char *buf;
8989    int len;
8990    PyObject *str; /* temporary string object. */
8991    PyObject *result;
8992
8993    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
8994    if (!str)
8995        return NULL;
8996    result = PyUnicode_FromStringAndSize(buf, len);
8997    Py_DECREF(str);
8998    return result;
8999}
9000
9001static int
9002formatchar(Py_UNICODE *buf,
9003           size_t buflen,
9004           PyObject *v)
9005{
9006    /* presume that the buffer is at least 3 characters long */
9007    if (PyUnicode_Check(v)) {
9008        if (PyUnicode_GET_SIZE(v) == 1) {
9009            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9010            buf[1] = '\0';
9011            return 1;
9012        }
9013#ifndef Py_UNICODE_WIDE
9014        if (PyUnicode_GET_SIZE(v) == 2) {
9015            /* Decode a valid surrogate pair */
9016            int c0 = PyUnicode_AS_UNICODE(v)[0];
9017            int c1 = PyUnicode_AS_UNICODE(v)[1];
9018            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9019                0xDC00 <= c1 && c1 <= 0xDFFF) {
9020                buf[0] = c0;
9021                buf[1] = c1;
9022                buf[2] = '\0';
9023                return 2;
9024            }
9025        }
9026#endif
9027        goto onError;
9028    }
9029    else {
9030        /* Integer input truncated to a character */
9031        long x;
9032        x = PyLong_AsLong(v);
9033        if (x == -1 && PyErr_Occurred())
9034            goto onError;
9035
9036        if (x < 0 || x > 0x10ffff) {
9037            PyErr_SetString(PyExc_OverflowError,
9038                            "%c arg not in range(0x110000)");
9039            return -1;
9040        }
9041
9042#ifndef Py_UNICODE_WIDE
9043        if (x > 0xffff) {
9044            x -= 0x10000;
9045            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9046            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9047            return 2;
9048        }
9049#endif
9050        buf[0] = (Py_UNICODE) x;
9051        buf[1] = '\0';
9052        return 1;
9053    }
9054
9055  onError:
9056    PyErr_SetString(PyExc_TypeError,
9057                    "%c requires int or char");
9058    return -1;
9059}
9060
9061/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9062   FORMATBUFLEN is the length of the buffer in which chars are formatted.
9063*/
#define FORMATBUFLEN ((size_t)10)
9065
9066PyObject *PyUnicode_Format(PyObject *format,
9067                           PyObject *args)
9068{
9069    Py_UNICODE *fmt, *res;
9070    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9071    int args_owned = 0;
9072    PyUnicodeObject *result = NULL;
9073    PyObject *dict = NULL;
9074    PyObject *uformat;
9075
9076    if (format == NULL || args == NULL) {
9077        PyErr_BadInternalCall();
9078        return NULL;
9079    }
9080    uformat = PyUnicode_FromObject(format);
9081    if (uformat == NULL)
9082        return NULL;
9083    fmt = PyUnicode_AS_UNICODE(uformat);
9084    fmtcnt = PyUnicode_GET_SIZE(uformat);
9085
9086    reslen = rescnt = fmtcnt + 100;
9087    result = _PyUnicode_New(reslen);
9088    if (result == NULL)
9089        goto onError;
9090    res = PyUnicode_AS_UNICODE(result);
9091
9092    if (PyTuple_Check(args)) {
9093        arglen = PyTuple_Size(args);
9094        argidx = 0;
9095    }
9096    else {
9097        arglen = -1;
9098        argidx = -2;
9099    }
9100    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9101        !PyUnicode_Check(args))
9102        dict = args;
9103
9104    while (--fmtcnt >= 0) {
9105        if (*fmt != '%') {
9106            if (--rescnt < 0) {
9107                rescnt = fmtcnt + 100;
9108                reslen += rescnt;
9109                if (_PyUnicode_Resize(&result, reslen) < 0)
9110                    goto onError;
9111                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9112                --rescnt;
9113            }
9114            *res++ = *fmt++;
9115        }
9116        else {
9117            /* Got a format specifier */
9118            int flags = 0;
9119            Py_ssize_t width = -1;
9120            int prec = -1;
9121            Py_UNICODE c = '\0';
9122            Py_UNICODE fill;
9123            int isnumok;
9124            PyObject *v = NULL;
9125            PyObject *temp = NULL;
9126            Py_UNICODE *pbuf;
9127            Py_UNICODE sign;
9128            Py_ssize_t len;
9129            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9130
9131            fmt++;
9132            if (*fmt == '(') {
9133                Py_UNICODE *keystart;
9134                Py_ssize_t keylen;
9135                PyObject *key;
9136                int pcount = 1;
9137
9138                if (dict == NULL) {
9139                    PyErr_SetString(PyExc_TypeError,
9140                                    "format requires a mapping");
9141                    goto onError;
9142                }
9143                ++fmt;
9144                --fmtcnt;
9145                keystart = fmt;
9146                /* Skip over balanced parentheses */
9147                while (pcount > 0 && --fmtcnt >= 0) {
9148                    if (*fmt == ')')
9149                        --pcount;
9150                    else if (*fmt == '(')
9151                        ++pcount;
9152                    fmt++;
9153                }
9154                keylen = fmt - keystart - 1;
9155                if (fmtcnt < 0 || pcount > 0) {
9156                    PyErr_SetString(PyExc_ValueError,
9157                                    "incomplete format key");
9158                    goto onError;
9159                }
9160#if 0
9161                /* keys are converted to strings using UTF-8 and
9162                   then looked up since Python uses strings to hold
9163                   variables names etc. in its namespaces and we
9164                   wouldn't want to break common idioms. */
9165                key = PyUnicode_EncodeUTF8(keystart,
9166                                           keylen,
9167                                           NULL);
9168#else
9169                key = PyUnicode_FromUnicode(keystart, keylen);
9170#endif
9171                if (key == NULL)
9172                    goto onError;
9173                if (args_owned) {
9174                    Py_DECREF(args);
9175                    args_owned = 0;
9176                }
9177                args = PyObject_GetItem(dict, key);
9178                Py_DECREF(key);
9179                if (args == NULL) {
9180                    goto onError;
9181                }
9182                args_owned = 1;
9183                arglen = -1;
9184                argidx = -2;
9185            }
9186            while (--fmtcnt >= 0) {
9187                switch (c = *fmt++) {
9188                case '-': flags |= F_LJUST; continue;
9189                case '+': flags |= F_SIGN; continue;
9190                case ' ': flags |= F_BLANK; continue;
9191                case '#': flags |= F_ALT; continue;
9192                case '0': flags |= F_ZERO; continue;
9193                }
9194                break;
9195            }
9196            if (c == '*') {
9197                v = getnextarg(args, arglen, &argidx);
9198                if (v == NULL)
9199                    goto onError;
9200                if (!PyLong_Check(v)) {
9201                    PyErr_SetString(PyExc_TypeError,
9202                                    "* wants int");
9203                    goto onError;
9204                }
9205                width = PyLong_AsLong(v);
9206                if (width == -1 && PyErr_Occurred())
9207                    goto onError;
9208                if (width < 0) {
9209                    flags |= F_LJUST;
9210                    width = -width;
9211                }
9212                if (--fmtcnt >= 0)
9213                    c = *fmt++;
9214            }
9215            else if (c >= '0' && c <= '9') {
9216                width = c - '0';
9217                while (--fmtcnt >= 0) {
9218                    c = *fmt++;
9219                    if (c < '0' || c > '9')
9220                        break;
9221                    if ((width*10) / 10 != width) {
9222                        PyErr_SetString(PyExc_ValueError,
9223                                        "width too big");
9224                        goto onError;
9225                    }
9226                    width = width*10 + (c - '0');
9227                }
9228            }
9229            if (c == '.') {
9230                prec = 0;
9231                if (--fmtcnt >= 0)
9232                    c = *fmt++;
9233                if (c == '*') {
9234                    v = getnextarg(args, arglen, &argidx);
9235                    if (v == NULL)
9236                        goto onError;
9237                    if (!PyLong_Check(v)) {
9238                        PyErr_SetString(PyExc_TypeError,
9239                                        "* wants int");
9240                        goto onError;
9241                    }
9242                    prec = PyLong_AsLong(v);
9243                    if (prec == -1 && PyErr_Occurred())
9244                        goto onError;
9245                    if (prec < 0)
9246                        prec = 0;
9247                    if (--fmtcnt >= 0)
9248                        c = *fmt++;
9249                }
9250                else if (c >= '0' && c <= '9') {
9251                    prec = c - '0';
9252                    while (--fmtcnt >= 0) {
9253                        c = *fmt++;
9254                        if (c < '0' || c > '9')
9255                            break;
9256                        if ((prec*10) / 10 != prec) {
9257                            PyErr_SetString(PyExc_ValueError,
9258                                            "prec too big");
9259                            goto onError;
9260                        }
9261                        prec = prec*10 + (c - '0');
9262                    }
9263                }
9264            } /* prec */
9265            if (fmtcnt >= 0) {
9266                if (c == 'h' || c == 'l' || c == 'L') {
9267                    if (--fmtcnt >= 0)
9268                        c = *fmt++;
9269                }
9270            }
9271            if (fmtcnt < 0) {
9272                PyErr_SetString(PyExc_ValueError,
9273                                "incomplete format");
9274                goto onError;
9275            }
9276            if (c != '%') {
9277                v = getnextarg(args, arglen, &argidx);
9278                if (v == NULL)
9279                    goto onError;
9280            }
9281            sign = 0;
9282            fill = ' ';
9283            switch (c) {
9284
9285            case '%':
9286                pbuf = formatbuf;
9287                /* presume that buffer length is at least 1 */
9288                pbuf[0] = '%';
9289                len = 1;
9290                break;
9291
9292            case 's':
9293            case 'r':
9294            case 'a':
9295                if (PyUnicode_CheckExact(v) && c == 's') {
9296                    temp = v;
9297                    Py_INCREF(temp);
9298                }
9299                else {
9300                    if (c == 's')
9301                        temp = PyObject_Str(v);
9302                    else if (c == 'r')
9303                        temp = PyObject_Repr(v);
9304                    else
9305                        temp = PyObject_ASCII(v);
9306                    if (temp == NULL)
9307                        goto onError;
9308                    if (PyUnicode_Check(temp))
9309                        /* nothing to do */;
9310                    else {
9311                        Py_DECREF(temp);
9312                        PyErr_SetString(PyExc_TypeError,
9313                                        "%s argument has non-string str()");
9314                        goto onError;
9315                    }
9316                }
9317                pbuf = PyUnicode_AS_UNICODE(temp);
9318                len = PyUnicode_GET_SIZE(temp);
9319                if (prec >= 0 && len > prec)
9320                    len = prec;
9321                break;
9322
9323            case 'i':
9324            case 'd':
9325            case 'u':
9326            case 'o':
9327            case 'x':
9328            case 'X':
9329                if (c == 'i')
9330                    c = 'd';
9331                isnumok = 0;
9332                if (PyNumber_Check(v)) {
9333                    PyObject *iobj=NULL;
9334
9335                    if (PyLong_Check(v)) {
9336                        iobj = v;
9337                        Py_INCREF(iobj);
9338                    }
9339                    else {
9340                        iobj = PyNumber_Long(v);
9341                    }
9342                    if (iobj!=NULL) {
9343                        if (PyLong_Check(iobj)) {
9344                            isnumok = 1;
9345                            temp = formatlong(iobj, flags, prec, c);
9346                            Py_DECREF(iobj);
9347                            if (!temp)
9348                                goto onError;
9349                            pbuf = PyUnicode_AS_UNICODE(temp);
9350                            len = PyUnicode_GET_SIZE(temp);
9351                            sign = 1;
9352                        }
9353                        else {
9354                            Py_DECREF(iobj);
9355                        }
9356                    }
9357                }
9358                if (!isnumok) {
9359                    PyErr_Format(PyExc_TypeError,
9360                                 "%%%c format: a number is required, "
9361                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9362                    goto onError;
9363                }
9364                if (flags & F_ZERO)
9365                    fill = '0';
9366                break;
9367
9368            case 'e':
9369            case 'E':
9370            case 'f':
9371            case 'F':
9372            case 'g':
9373            case 'G':
9374                temp = formatfloat(v, flags, prec, c);
9375                if (!temp)
9376                    goto onError;
9377                pbuf = PyUnicode_AS_UNICODE(temp);
9378                len = PyUnicode_GET_SIZE(temp);
9379                sign = 1;
9380                if (flags & F_ZERO)
9381                    fill = '0';
9382                break;
9383
9384            case 'c':
9385                pbuf = formatbuf;
9386                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9387                if (len < 0)
9388                    goto onError;
9389                break;
9390
9391            default:
9392                PyErr_Format(PyExc_ValueError,
9393                             "unsupported format character '%c' (0x%x) "
9394                             "at index %zd",
9395                             (31<=c && c<=126) ? (char)c : '?',
9396                             (int)c,
9397                             (Py_ssize_t)(fmt - 1 -
9398                                          PyUnicode_AS_UNICODE(uformat)));
9399                goto onError;
9400            }
9401            if (sign) {
9402                if (*pbuf == '-' || *pbuf == '+') {
9403                    sign = *pbuf++;
9404                    len--;
9405                }
9406                else if (flags & F_SIGN)
9407                    sign = '+';
9408                else if (flags & F_BLANK)
9409                    sign = ' ';
9410                else
9411                    sign = 0;
9412            }
9413            if (width < len)
9414                width = len;
9415            if (rescnt - (sign != 0) < width) {
9416                reslen -= rescnt;
9417                rescnt = width + fmtcnt + 100;
9418                reslen += rescnt;
9419                if (reslen < 0) {
9420                    Py_XDECREF(temp);
9421                    PyErr_NoMemory();
9422                    goto onError;
9423                }
9424                if (_PyUnicode_Resize(&result, reslen) < 0) {
9425                    Py_XDECREF(temp);
9426                    goto onError;
9427                }
9428                res = PyUnicode_AS_UNICODE(result)
9429                    + reslen - rescnt;
9430            }
9431            if (sign) {
9432                if (fill != ' ')
9433                    *res++ = sign;
9434                rescnt--;
9435                if (width > len)
9436                    width--;
9437            }
9438            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9439                assert(pbuf[0] == '0');
9440                assert(pbuf[1] == c);
9441                if (fill != ' ') {
9442                    *res++ = *pbuf++;
9443                    *res++ = *pbuf++;
9444                }
9445                rescnt -= 2;
9446                width -= 2;
9447                if (width < 0)
9448                    width = 0;
9449                len -= 2;
9450            }
9451            if (width > len && !(flags & F_LJUST)) {
9452                do {
9453                    --rescnt;
9454                    *res++ = fill;
9455                } while (--width > len);
9456            }
9457            if (fill == ' ') {
9458                if (sign)
9459                    *res++ = sign;
9460                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9461                    assert(pbuf[0] == '0');
9462                    assert(pbuf[1] == c);
9463                    *res++ = *pbuf++;
9464                    *res++ = *pbuf++;
9465                }
9466            }
9467            Py_UNICODE_COPY(res, pbuf, len);
9468            res += len;
9469            rescnt -= len;
9470            while (--width >= len) {
9471                --rescnt;
9472                *res++ = ' ';
9473            }
9474            if (dict && (argidx < arglen) && c != '%') {
9475                PyErr_SetString(PyExc_TypeError,
9476                                "not all arguments converted during string formatting");
9477                Py_XDECREF(temp);
9478                goto onError;
9479            }
9480            Py_XDECREF(temp);
9481        } /* '%' */
9482    } /* until end */
9483    if (argidx < arglen && !dict) {
9484        PyErr_SetString(PyExc_TypeError,
9485                        "not all arguments converted during string formatting");
9486        goto onError;
9487    }
9488
9489    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9490        goto onError;
9491    if (args_owned) {
9492        Py_DECREF(args);
9493    }
9494    Py_DECREF(uformat);
9495    return (PyObject *)result;
9496
9497  onError:
9498    Py_XDECREF(result);
9499    Py_DECREF(uformat);
9500    if (args_owned) {
9501        Py_DECREF(args);
9502    }
9503    return NULL;
9504}
9505
9506static PyObject *
9507unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9508
9509static PyObject *
9510unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9511{
9512    PyObject *x = NULL;
9513    static char *kwlist[] = {"object", "encoding", "errors", 0};
9514    char *encoding = NULL;
9515    char *errors = NULL;
9516
9517    if (type != &PyUnicode_Type)
9518        return unicode_subtype_new(type, args, kwds);
9519    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9520                                     kwlist, &x, &encoding, &errors))
9521        return NULL;
9522    if (x == NULL)
9523        return (PyObject *)_PyUnicode_New(0);
9524    if (encoding == NULL && errors == NULL)
9525        return PyObject_Str(x);
9526    else
9527        return PyUnicode_FromEncodedObject(x, encoding, errors);
9528}
9529
9530static PyObject *
9531unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9532{
9533    PyUnicodeObject *tmp, *pnew;
9534    Py_ssize_t n;
9535
9536    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9537    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9538    if (tmp == NULL)
9539        return NULL;
9540    assert(PyUnicode_Check(tmp));
9541    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9542    if (pnew == NULL) {
9543        Py_DECREF(tmp);
9544        return NULL;
9545    }
9546    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9547    if (pnew->str == NULL) {
9548        _Py_ForgetReference((PyObject *)pnew);
9549        PyObject_Del(pnew);
9550        Py_DECREF(tmp);
9551        return PyErr_NoMemory();
9552    }
9553    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9554    pnew->length = n;
9555    pnew->hash = tmp->hash;
9556    Py_DECREF(tmp);
9557    return (PyObject *)pnew;
9558}
9559
/* Docstring text for the str type; it is installed in the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9566
/* Forward declaration: unicode_iter() is defined further down in this file
   and is referenced by the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Every initializer is annotated with the slot it
   fills; a 0 means the slot is not set explicitly here. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash */
    0,                  /* tp_call */
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9612
9613/* Initialize the Unicode implementation */
9614
9615void _PyUnicode_Init(void)
9616{
9617    int i;
9618
9619    /* XXX - move this array to unicodectype.c ? */
9620    Py_UNICODE linebreak[] = {
9621        0x000A, /* LINE FEED */
9622        0x000D, /* CARRIAGE RETURN */
9623        0x001C, /* FILE SEPARATOR */
9624        0x001D, /* GROUP SEPARATOR */
9625        0x001E, /* RECORD SEPARATOR */
9626        0x0085, /* NEXT LINE */
9627        0x2028, /* LINE SEPARATOR */
9628        0x2029, /* PARAGRAPH SEPARATOR */
9629    };
9630
9631    /* Init the implementation */
9632    free_list = NULL;
9633    numfree = 0;
9634    unicode_empty = _PyUnicode_New(0);
9635    if (!unicode_empty)
9636        return;
9637
9638    for (i = 0; i < 256; i++)
9639        unicode_latin1[i] = NULL;
9640    if (PyType_Ready(&PyUnicode_Type) < 0)
9641        Py_FatalError("Can't initialize 'unicode'");
9642
9643    /* initialize the linebreak bloom filter */
9644    bloom_linebreak = make_bloom_mask(
9645        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
9646        );
9647
9648    PyType_Ready(&EncodingMapType);
9649}
9650
9651/* Finalize the Unicode implementation */
9652
9653int
9654PyUnicode_ClearFreeList(void)
9655{
9656    int freelist_size = numfree;
9657    PyUnicodeObject *u;
9658
9659    for (u = free_list; u != NULL;) {
9660        PyUnicodeObject *v = u;
9661        u = *(PyUnicodeObject **)u;
9662        if (v->str)
9663            PyObject_DEL(v->str);
9664        Py_XDECREF(v->defenc);
9665        PyObject_Del(v);
9666        numfree--;
9667    }
9668    free_list = NULL;
9669    assert(numfree == 0);
9670    return freelist_size;
9671}
9672
9673void
9674_PyUnicode_Fini(void)
9675{
9676    int i;
9677
9678    Py_XDECREF(unicode_empty);
9679    unicode_empty = NULL;
9680
9681    for (i = 0; i < 256; i++) {
9682        if (unicode_latin1[i]) {
9683            Py_DECREF(unicode_latin1[i]);
9684            unicode_latin1[i] = NULL;
9685        }
9686    }
9687    (void)PyUnicode_ClearFreeList();
9688}
9689
/* Intern the string *p in place.

   If an equal string is already in the `interned` dictionary, *p is
   replaced by that object (the reference previously held in *p is
   released and a new one to the existing object is taken).  Otherwise *p
   itself is inserted into the dictionary and marked
   SSTATE_INTERNED_MORTAL.

   Nothing is done for instances of str subclasses or for strings that are
   already interned.  Passing NULL or a non-string is a fatal error.
   Failures to create or update the dictionary are swallowed: the pending
   exception is cleared and *p is left unchanged. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The interned dictionary is created lazily on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
        t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

        /* An equal string is already interned: hand that one back. */
        if (t) {
            Py_INCREF(t);
            Py_DECREF(*p);
            *p = t;
            return;
        }

    /* Insert s as both key and value; recursion_critical is set only for
       the duration of the insertion and reset on both outcomes. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
9737
9738void
9739PyUnicode_InternImmortal(PyObject **p)
9740{
9741    PyUnicode_InternInPlace(p);
9742    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
9743        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
9744        Py_INCREF(*p);
9745    }
9746}
9747
9748PyObject *
9749PyUnicode_InternFromString(const char *cp)
9750{
9751    PyObject *s = PyUnicode_FromString(cp);
9752    if (s == NULL)
9753        return NULL;
9754    PyUnicode_InternInPlace(&s);
9755    return s;
9756}
9757
/* Undo interning for every string in the `interned` dictionary.

   Each key gets back the references that interning left uncounted (one
   for an immortal string, two for a mortal one) and is reset to
   SSTATE_NOT_INTERNED; afterwards the dictionary is cleared and released.
   A count of released strings and the summed lengths of mortal/immortal
   strings are written to stderr.  Does nothing when no dictionary exists;
   if the key list cannot be obtained the error is cleared and the
   function returns. */
void _Py_ReleaseInternedUnicodeStrings(void)
{
    PyObject *keys;
    PyUnicodeObject *s;
    Py_ssize_t i, n;
    Py_ssize_t immortal_size = 0, mortal_size = 0;

    if (interned == NULL || !PyDict_Check(interned))
        return;
    keys = PyDict_Keys(interned);
    if (keys == NULL || !PyList_Check(keys)) {
        PyErr_Clear();
        return;
    }

    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
       detector, interned unicode strings are not forcibly deallocated;
       rather, we give them their stolen references back, and then clear
       and DECREF the interned dict. */

    n = PyList_GET_SIZE(keys);
    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
            n);
    for (i = 0; i < n; i++) {
        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
        switch (s->state) {
        case SSTATE_NOT_INTERNED:
            /* XXX Shouldn't happen */
            break;
        case SSTATE_INTERNED_IMMORTAL:
            /* Restore one reference for an immortal string. */
            Py_REFCNT(s) += 1;
            immortal_size += s->length;
            break;
        case SSTATE_INTERNED_MORTAL:
            /* Restore the two references PyUnicode_InternInPlace()
               subtracted for the dictionary's key and value. */
            Py_REFCNT(s) += 2;
            mortal_size += s->length;
            break;
        default:
            Py_FatalError("Inconsistent interned string state.");
        }
        s->state = SSTATE_NOT_INTERNED;
    }
    fprintf(stderr, "total size of all interned strings: "
            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
            "mortal/immortal\n", mortal_size, immortal_size);
    Py_DECREF(keys);
    PyDict_Clear(interned);
    Py_DECREF(interned);
    interned = NULL;
}
9808
9809
9810/********************* Unicode Iterator **************************/
9811
/* State of an iterator over a str object. */
typedef struct {
    PyObject_HEAD
    /* Index of the next element unicodeiter_next() will return. */
    Py_ssize_t it_index;
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
9817
/* tp_dealloc for str iterators: untrack the iterator from the GC, release
   the reference to the underlying string (it_seq may already be NULL once
   the iterator is exhausted, hence Py_XDECREF), then free the iterator. */
static void
unicodeiter_dealloc(unicodeiterobject *it)
{
    _PyObject_GC_UNTRACK(it);
    Py_XDECREF(it->it_seq);
    PyObject_GC_Del(it);
}
9825
/* tp_traverse for str iterators: the only object reference the iterator
   holds is it_seq.  Always returns 0 unless Py_VISIT returns early. */
static int
unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
{
    /* Py_VISIT uses the `visit` and `arg` parameters by name. */
    Py_VISIT(it->it_seq);
    return 0;
}
9832
9833static PyObject *
9834unicodeiter_next(unicodeiterobject *it)
9835{
9836    PyUnicodeObject *seq;
9837    PyObject *item;
9838
9839    assert(it != NULL);
9840    seq = it->it_seq;
9841    if (seq == NULL)
9842        return NULL;
9843    assert(PyUnicode_Check(seq));
9844
9845    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
9846        item = PyUnicode_FromUnicode(
9847            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
9848        if (item != NULL)
9849            ++it->it_index;
9850        return item;
9851    }
9852
9853    Py_DECREF(seq);
9854    it->it_seq = NULL;
9855    return NULL;
9856}
9857
9858static PyObject *
9859unicodeiter_len(unicodeiterobject *it)
9860{
9861    Py_ssize_t len = 0;
9862    if (it->it_seq)
9863        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
9864    return PyLong_FromSsize_t(len);
9865}
9866
/* Docstring for the iterator's __length_hint__ method. */
PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");

/* Method table of the str iterator type; its only entry exposes
   unicodeiter_len() as __length_hint__, taking no arguments. */
static PyMethodDef unicodeiter_methods[] = {
    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
     length_hint_doc},
    {NULL,      NULL}       /* sentinel */
};
9874
/* Type object for the iterator returned by unicode_iter().  The type is
   GC-enabled (Py_TPFLAGS_HAVE_GC) and provides traverse, dealloc, iter and
   iternext slots plus the unicodeiter_methods table. */
PyTypeObject PyUnicodeIter_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str_iterator",         /* tp_name */
    sizeof(unicodeiterobject),      /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* methods */
    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    0,                  /* tp_repr */
    0,                  /* tp_as_number */
    0,                  /* tp_as_sequence */
    0,                  /* tp_as_mapping */
    0,                  /* tp_hash */
    0,                  /* tp_call */
    0,                  /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
    0,                  /* tp_doc */
    (traverseproc)unicodeiter_traverse, /* tp_traverse */
    0,                  /* tp_clear */
    0,                  /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    PyObject_SelfIter,          /* tp_iter */
    (iternextfunc)unicodeiter_next,     /* tp_iternext */
    unicodeiter_methods,            /* tp_methods */
    0,                  /* tp_members */
};
9907
9908static PyObject *
9909unicode_iter(PyObject *seq)
9910{
9911    unicodeiterobject *it;
9912
9913    if (!PyUnicode_Check(seq)) {
9914        PyErr_BadInternalCall();
9915        return NULL;
9916    }
9917    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
9918    if (it == NULL)
9919        return NULL;
9920    it->it_index = 0;
9921    Py_INCREF(seq);
9922    it->it_seq = (PyUnicodeObject *)seq;
9923    _PyObject_GC_TRACK(it);
9924    return (PyObject *)it;
9925}
9926
9927size_t
9928Py_UNICODE_strlen(const Py_UNICODE *u)
9929{
9930    int res = 0;
9931    while(*u++)
9932        res++;
9933    return res;
9934}
9935
9936Py_UNICODE*
9937Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
9938{
9939    Py_UNICODE *u = s1;
9940    while ((*u++ = *s2++));
9941    return s1;
9942}
9943
9944Py_UNICODE*
9945Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9946{
9947    Py_UNICODE *u = s1;
9948    while ((*u++ = *s2++))
9949        if (n-- == 0)
9950            break;
9951    return s1;
9952}
9953
9954Py_UNICODE*
9955Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
9956{
9957    Py_UNICODE *u1 = s1;
9958    u1 += Py_UNICODE_strlen(u1);
9959    Py_UNICODE_strcpy(u1, s2);
9960    return s1;
9961}
9962
9963int
9964Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
9965{
9966    while (*s1 && *s2 && *s1 == *s2)
9967        s1++, s2++;
9968    if (*s1 && *s2)
9969        return (*s1 < *s2) ? -1 : +1;
9970    if (*s1)
9971        return 1;
9972    if (*s2)
9973        return -1;
9974    return 0;
9975}
9976
9977int
9978Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
9979{
9980    register Py_UNICODE u1, u2;
9981    for (; n != 0; n--) {
9982        u1 = *s1;
9983        u2 = *s2;
9984        if (u1 != u2)
9985            return (u1 < u2) ? -1 : +1;
9986        if (u1 == '\0')
9987            return 0;
9988        s1++;
9989        s2++;
9990    }
9991    return 0;
9992}
9993
9994Py_UNICODE*
9995Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
9996{
9997    const Py_UNICODE *p;
9998    for (p = s; *p; p++)
9999        if (*p == c)
10000            return (Py_UNICODE*)p;
10001    return NULL;
10002}
10003
10004Py_UNICODE*
10005Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10006{
10007    const Py_UNICODE *p;
10008    p = s + Py_UNICODE_strlen(s);
10009    while (p != s) {
10010        p--;
10011        if (*p == c)
10012            return (Py_UNICODE*)p;
10013    }
10014    return NULL;
10015}
10016
10017Py_UNICODE*
10018PyUnicode_AsUnicodeCopy(PyObject *object)
10019{
10020    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10021    Py_UNICODE *copy;
10022    Py_ssize_t size;
10023
10024    /* Ensure we won't overflow the size. */
10025    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10026        PyErr_NoMemory();
10027        return NULL;
10028    }
10029    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10030    size *= sizeof(Py_UNICODE);
10031    copy = PyMem_Malloc(size);
10032    if (copy == NULL) {
10033        PyErr_NoMemory();
10034        return NULL;
10035    }
10036    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10037    return copy;
10038}
10039
10040#ifdef __cplusplus
10041}
10042#endif
10043