unicodeobject.c revision 53516a82df8db500a968451daa54fc72eaed7056
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "ucnhash.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* Limit for the Unicode object free list */
51
52#define PyUnicode_MAXFREELIST       1024
53
54/* Limit for the Unicode object free list stay alive optimization.
55
56   The implementation will keep allocated Unicode memory intact for
57   all objects on the free list having a size less than this
58   limit. This reduces malloc() overhead for small Unicode objects.
59
60   At worst this will result in PyUnicode_MAXFREELIST *
61   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
62   malloc()-overhead) bytes of unused garbage.
63
64   Setting the limit to 0 effectively turns the feature off.
65
66   Note: This is an experimental feature ! If you get core dumps when
67   using Unicode objects, turn this feature off.
68
69*/
70
71#define KEEPALIVE_SIZE_LIMIT       9
72
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
81/* --- Globals ------------------------------------------------------------
82
83   The globals are initialized by the _PyUnicode_Init() API and should
84   not be used before calling that API.
85
86*/
87
88
89#ifdef __cplusplus
90extern "C" {
91#endif
92
/* This dictionary holds all interned unicode strings.  Note that references
   to strings in this dictionary are *not* counted in the string's ob_refcnt.
   When the interned string reaches a refcnt of 0 the string deallocation
   function will delete the reference from this dictionary.

   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
*/
101static PyObject *interned;
102
103/* Free list for Unicode objects */
104static PyUnicodeObject *free_list;
105static int numfree;
106
107/* The empty Unicode object is shared to improve performance. */
108static PyUnicodeObject *unicode_empty;
109
110/* Single character Unicode strings in the Latin-1 range are being
111   shared as well. */
112static PyUnicodeObject *unicode_latin1[256];
113
/* Lookup table for fast detection of the most frequent whitespace
   characters.  It is indexed by code point in the ASCII range (0..127);
   an entry is 1 for a whitespace character and 0 otherwise. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00 - 0x0F:
     *   0x0009 CHARACTER TABULATION, 0x000A LINE FEED,
     *   0x000B LINE TABULATION,      0x000C FORM FEED,
     *   0x000D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10 - 0x1F:
     *   0x001C FILE SEPARATOR,   0x001D GROUP SEPARATOR,
     *   0x001E RECORD SEPARATOR, 0x001F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20 - 0x2F:
     *   0x0020 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30 - 0x3F */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,

    /* 0x40 - 0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,  0, 0, 0, 0, 0, 0, 0, 0
};
144
145static PyObject *unicode_encode_call_errorhandler(const char *errors,
146       PyObject **errorHandler,const char *encoding, const char *reason,
147       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
148       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);
149
150static void raise_encode_exception(PyObject **exceptionObject,
151                                   const char *encoding,
152                                   const Py_UNICODE *unicode, Py_ssize_t size,
153                                   Py_ssize_t startpos, Py_ssize_t endpos,
154                                   const char *reason);
155
/* Lookup table for fast detection of line break characters.  It is
   indexed by code point in the ASCII range (0..127); an entry is 1 for a
   line break character and 0 otherwise.  The table is only ever read, so
   it is declared const. */
static const unsigned char ascii_linebreak[] = {
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x000A, * LINE FEED */
/*         0x000B, * LINE TABULATION */
/*         0x000C, * FORM FEED */
/*         0x000D, * CARRIAGE RETURN */
    0, 0, 1, 1, 1, 1, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
/*         0x001C, * FILE SEPARATOR */
/*         0x001D, * GROUP SEPARATOR */
/*         0x001E, * RECORD SEPARATOR */
    0, 0, 0, 0, 1, 1, 1, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,

    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0
};
183
184
185Py_UNICODE
186PyUnicode_GetMax(void)
187{
188#ifdef Py_UNICODE_WIDE
189    return 0x10FFFF;
190#else
191    /* This is actually an illegal character, so it should
192       not be passed to unichr. */
193    return 0xFFFF;
194#endif
195}
196
197/* --- Bloom Filters ----------------------------------------------------- */
198
/* stuff to implement simple "bloom filters" for Unicode characters.
   to keep things simple, we use a single bitmask, using the lowest
   log2(BLOOM_WIDTH) bits of each Unicode character as the bit index. */
202
203/* the linebreak mask is set up by Unicode_Init below */
204
205#if LONG_BIT >= 128
206#define BLOOM_WIDTH 128
207#elif LONG_BIT >= 64
208#define BLOOM_WIDTH 64
209#elif LONG_BIT >= 32
210#define BLOOM_WIDTH 32
211#else
212#error "LONG_BIT is smaller than 32"
213#endif
214
215#define BLOOM_MASK unsigned long
216
217static BLOOM_MASK bloom_linebreak;
218
219#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
220#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
221
222#define BLOOM_LINEBREAK(ch)                                             \
223    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
224     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
225
226Py_LOCAL_INLINE(BLOOM_MASK) make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
227{
228    /* calculate simple bloom-style bitmask for a given unicode string */
229
230    BLOOM_MASK mask;
231    Py_ssize_t i;
232
233    mask = 0;
234    for (i = 0; i < len; i++)
235        BLOOM_ADD(mask, ptr[i]);
236
237    return mask;
238}
239
240Py_LOCAL_INLINE(int) unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
241{
242    Py_ssize_t i;
243
244    for (i = 0; i < setlen; i++)
245        if (set[i] == chr)
246            return 1;
247
248    return 0;
249}
250
/* True if chr is a member of set: the bloom mask is tested first as a cheap
   filter, and only on a hit is the exact linear search performed.  The
   whole expansion is parenthesized so the macro can safely be combined
   with other operators (e.g. !BLOOM_MEMBER(...)).  Note: chr is evaluated
   more than once, so it must not have side effects. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM(mask, chr) && unicode_member(chr, set, setlen))
253
254/* --- Unicode Object ----------------------------------------------------- */
255
256static
257int unicode_resize(register PyUnicodeObject *unicode,
258                   Py_ssize_t length)
259{
260    void *oldstr;
261
262    /* Shortcut if there's nothing much to do. */
263    if (unicode->length == length)
264        goto reset;
265
266    /* Resizing shared object (unicode_empty or single character
267       objects) in-place is not allowed. Use PyUnicode_Resize()
268       instead ! */
269
270    if (unicode == unicode_empty ||
271        (unicode->length == 1 &&
272         unicode->str[0] < 256U &&
273         unicode_latin1[unicode->str[0]] == unicode)) {
274        PyErr_SetString(PyExc_SystemError,
275                        "can't resize shared str objects");
276        return -1;
277    }
278
279    /* We allocate one more byte to make sure the string is Ux0000 terminated.
280       The overallocation is also used by fastsearch, which assumes that it's
281       safe to look at str[length] (without making any assumptions about what
282       it contains). */
283
284    oldstr = unicode->str;
285    unicode->str = PyObject_REALLOC(unicode->str,
286                                    sizeof(Py_UNICODE) * (length + 1));
287    if (!unicode->str) {
288        unicode->str = (Py_UNICODE *)oldstr;
289        PyErr_NoMemory();
290        return -1;
291    }
292    unicode->str[length] = 0;
293    unicode->length = length;
294
295  reset:
296    /* Reset the object caches */
297    if (unicode->defenc) {
298        Py_CLEAR(unicode->defenc);
299    }
300    unicode->hash = -1;
301
302    return 0;
303}
304
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.

   XXX This allocator could further be enhanced by assuring that the
   free list never reduces its size below 1.

*/
313
314static
315PyUnicodeObject *_PyUnicode_New(Py_ssize_t length)
316{
317    register PyUnicodeObject *unicode;
318
319    /* Optimization for empty strings */
320    if (length == 0 && unicode_empty != NULL) {
321        Py_INCREF(unicode_empty);
322        return unicode_empty;
323    }
324
325    /* Ensure we won't overflow the size. */
326    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
327        return (PyUnicodeObject *)PyErr_NoMemory();
328    }
329
330    /* Unicode freelist & memory allocation */
331    if (free_list) {
332        unicode = free_list;
333        free_list = *(PyUnicodeObject **)unicode;
334        numfree--;
335        if (unicode->str) {
336            /* Keep-Alive optimization: we only upsize the buffer,
337               never downsize it. */
338            if ((unicode->length < length) &&
339                unicode_resize(unicode, length) < 0) {
340                PyObject_DEL(unicode->str);
341                unicode->str = NULL;
342            }
343        }
344        else {
345            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
346            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
347        }
348        PyObject_INIT(unicode, &PyUnicode_Type);
349    }
350    else {
351        size_t new_size;
352        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
353        if (unicode == NULL)
354            return NULL;
355        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
356        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
357    }
358
359    if (!unicode->str) {
360        PyErr_NoMemory();
361        goto onError;
362    }
363    /* Initialize the first element to guard against cases where
364     * the caller fails before initializing str -- unicode_resize()
365     * reads str[0], and the Keep-Alive optimization can keep memory
366     * allocated for str alive across a call to unicode_dealloc(unicode).
367     * We don't want unicode_resize to read uninitialized memory in
368     * that case.
369     */
370    unicode->str[0] = 0;
371    unicode->str[length] = 0;
372    unicode->length = length;
373    unicode->hash = -1;
374    unicode->state = 0;
375    unicode->defenc = NULL;
376    return unicode;
377
378  onError:
379    /* XXX UNREF/NEWREF interface should be more symmetrical */
380    _Py_DEC_REFTOTAL;
381    _Py_ForgetReference((PyObject *)unicode);
382    PyObject_Del(unicode);
383    return NULL;
384}
385
386static
387void unicode_dealloc(register PyUnicodeObject *unicode)
388{
389    switch (PyUnicode_CHECK_INTERNED(unicode)) {
390    case SSTATE_NOT_INTERNED:
391        break;
392
393    case SSTATE_INTERNED_MORTAL:
394        /* revive dead object temporarily for DelItem */
395        Py_REFCNT(unicode) = 3;
396        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
397            Py_FatalError(
398                "deletion of interned string failed");
399        break;
400
401    case SSTATE_INTERNED_IMMORTAL:
402        Py_FatalError("Immortal interned string died.");
403
404    default:
405        Py_FatalError("Inconsistent interned string state.");
406    }
407
408    if (PyUnicode_CheckExact(unicode) &&
409        numfree < PyUnicode_MAXFREELIST) {
410        /* Keep-Alive optimization */
411        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
412            PyObject_DEL(unicode->str);
413            unicode->str = NULL;
414            unicode->length = 0;
415        }
416        if (unicode->defenc) {
417            Py_CLEAR(unicode->defenc);
418        }
419        /* Add to free list */
420        *(PyUnicodeObject **)unicode = free_list;
421        free_list = unicode;
422        numfree++;
423    }
424    else {
425        PyObject_DEL(unicode->str);
426        Py_XDECREF(unicode->defenc);
427        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
428    }
429}
430
431static
432int _PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
433{
434    register PyUnicodeObject *v;
435
436    /* Argument checks */
437    if (unicode == NULL) {
438        PyErr_BadInternalCall();
439        return -1;
440    }
441    v = *unicode;
442    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
443        PyErr_BadInternalCall();
444        return -1;
445    }
446
447    /* Resizing unicode_empty and single character objects is not
448       possible since these are being shared. We simply return a fresh
449       copy with the same Unicode content. */
450    if (v->length != length &&
451        (v == unicode_empty || v->length == 1)) {
452        PyUnicodeObject *w = _PyUnicode_New(length);
453        if (w == NULL)
454            return -1;
455        Py_UNICODE_COPY(w->str, v->str,
456                        length < v->length ? length : v->length);
457        Py_DECREF(*unicode);
458        *unicode = w;
459        return 0;
460    }
461
462    /* Note that we don't have to modify *unicode for unshared Unicode
463       objects, since we can modify them in-place. */
464    return unicode_resize(v, length);
465}
466
467int PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
468{
469    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
470}
471
472PyObject *PyUnicode_FromUnicode(const Py_UNICODE *u,
473                                Py_ssize_t size)
474{
475    PyUnicodeObject *unicode;
476
477    /* If the Unicode data is known at construction time, we can apply
478       some optimizations which share commonly used objects. */
479    if (u != NULL) {
480
481        /* Optimization for empty strings */
482        if (size == 0 && unicode_empty != NULL) {
483            Py_INCREF(unicode_empty);
484            return (PyObject *)unicode_empty;
485        }
486
487        /* Single character Unicode objects in the Latin-1 range are
488           shared when using this constructor */
489        if (size == 1 && *u < 256) {
490            unicode = unicode_latin1[*u];
491            if (!unicode) {
492                unicode = _PyUnicode_New(1);
493                if (!unicode)
494                    return NULL;
495                unicode->str[0] = *u;
496                unicode_latin1[*u] = unicode;
497            }
498            Py_INCREF(unicode);
499            return (PyObject *)unicode;
500        }
501    }
502
503    unicode = _PyUnicode_New(size);
504    if (!unicode)
505        return NULL;
506
507    /* Copy the Unicode data into the new object */
508    if (u != NULL)
509        Py_UNICODE_COPY(unicode->str, u, size);
510
511    return (PyObject *)unicode;
512}
513
514PyObject *PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
515{
516    PyUnicodeObject *unicode;
517
518    if (size < 0) {
519        PyErr_SetString(PyExc_SystemError,
520                        "Negative size passed to PyUnicode_FromStringAndSize");
521        return NULL;
522    }
523
524    /* If the Unicode data is known at construction time, we can apply
525       some optimizations which share commonly used objects.
526       Also, this means the input must be UTF-8, so fall back to the
527       UTF-8 decoder at the end. */
528    if (u != NULL) {
529
530        /* Optimization for empty strings */
531        if (size == 0 && unicode_empty != NULL) {
532            Py_INCREF(unicode_empty);
533            return (PyObject *)unicode_empty;
534        }
535
536        /* Single characters are shared when using this constructor.
537           Restrict to ASCII, since the input must be UTF-8. */
538        if (size == 1 && Py_CHARMASK(*u) < 128) {
539            unicode = unicode_latin1[Py_CHARMASK(*u)];
540            if (!unicode) {
541                unicode = _PyUnicode_New(1);
542                if (!unicode)
543                    return NULL;
544                unicode->str[0] = Py_CHARMASK(*u);
545                unicode_latin1[Py_CHARMASK(*u)] = unicode;
546            }
547            Py_INCREF(unicode);
548            return (PyObject *)unicode;
549        }
550
551        return PyUnicode_DecodeUTF8(u, size, NULL);
552    }
553
554    unicode = _PyUnicode_New(size);
555    if (!unicode)
556        return NULL;
557
558    return (PyObject *)unicode;
559}
560
561PyObject *PyUnicode_FromString(const char *u)
562{
563    size_t size = strlen(u);
564    if (size > PY_SSIZE_T_MAX) {
565        PyErr_SetString(PyExc_OverflowError, "input too long");
566        return NULL;
567    }
568
569    return PyUnicode_FromStringAndSize(u, size);
570}
571
572#ifdef HAVE_WCHAR_H
573
574#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
575# define CONVERT_WCHAR_TO_SURROGATES
576#endif
577
578#ifdef CONVERT_WCHAR_TO_SURROGATES
579
580/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
581   to convert from UTF32 to UTF16. */
582
583PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
584                                 Py_ssize_t size)
585{
586    PyUnicodeObject *unicode;
587    register Py_ssize_t i;
588    Py_ssize_t alloc;
589    const wchar_t *orig_w;
590
591    if (w == NULL) {
592        if (size == 0)
593            return PyUnicode_FromStringAndSize(NULL, 0);
594        PyErr_BadInternalCall();
595        return NULL;
596    }
597
598    if (size == -1) {
599        size = wcslen(w);
600    }
601
602    alloc = size;
603    orig_w = w;
604    for (i = size; i > 0; i--) {
605        if (*w > 0xFFFF)
606            alloc++;
607        w++;
608    }
609    w = orig_w;
610    unicode = _PyUnicode_New(alloc);
611    if (!unicode)
612        return NULL;
613
614    /* Copy the wchar_t data into the new object */
615    {
616        register Py_UNICODE *u;
617        u = PyUnicode_AS_UNICODE(unicode);
618        for (i = size; i > 0; i--) {
619            if (*w > 0xFFFF) {
620                wchar_t ordinal = *w++;
621                ordinal -= 0x10000;
622                *u++ = 0xD800 | (ordinal >> 10);
623                *u++ = 0xDC00 | (ordinal & 0x3FF);
624            }
625            else
626                *u++ = *w++;
627        }
628    }
629    return (PyObject *)unicode;
630}
631
632#else
633
634PyObject *PyUnicode_FromWideChar(register const wchar_t *w,
635                                 Py_ssize_t size)
636{
637    PyUnicodeObject *unicode;
638
639    if (w == NULL) {
640        if (size == 0)
641            return PyUnicode_FromStringAndSize(NULL, 0);
642        PyErr_BadInternalCall();
643        return NULL;
644    }
645
646    if (size == -1) {
647        size = wcslen(w);
648    }
649
650    unicode = _PyUnicode_New(size);
651    if (!unicode)
652        return NULL;
653
654    /* Copy the wchar_t data into the new object */
655#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
656    memcpy(unicode->str, w, size * sizeof(wchar_t));
657#else
658    {
659        register Py_UNICODE *u;
660        register Py_ssize_t i;
661        u = PyUnicode_AS_UNICODE(unicode);
662        for (i = size; i > 0; i--)
663            *u++ = *w++;
664    }
665#endif
666
667    return (PyObject *)unicode;
668}
669
670#endif /* CONVERT_WCHAR_TO_SURROGATES */
671
672#undef CONVERT_WCHAR_TO_SURROGATES
673
674static void
675makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
676        int zeropad, int width, int precision, char c)
677{
678    *fmt++ = '%';
679    if (width) {
680        if (zeropad)
681            *fmt++ = '0';
682        fmt += sprintf(fmt, "%d", width);
683    }
684    if (precision)
685        fmt += sprintf(fmt, ".%d", precision);
686    if (longflag)
687        *fmt++ = 'l';
688    else if (longlongflag) {
689        /* longlongflag should only ever be nonzero on machines with
690           HAVE_LONG_LONG defined */
691#ifdef HAVE_LONG_LONG
692        char *f = PY_FORMAT_LONG_LONG;
693        while (*f)
694            *fmt++ = *f++;
695#else
696        /* we shouldn't ever get here */
697        assert(0);
698        *fmt++ = 'l';
699#endif
700    }
701    else if (size_tflag) {
702        char *f = PY_FORMAT_SIZE_T;
703        while (*f)
704            *fmt++ = *f++;
705    }
706    *fmt++ = c;
707    *fmt = '\0';
708}
709
710#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}
711
712/* size of fixed-size buffer for formatting single arguments */
713#define ITEM_BUFFER_LEN 21
714/* maximum number of characters required for output of %ld.  21 characters
715   allows for 64-bit integers (in decimal) and an optional sign. */
716#define MAX_LONG_CHARS 21
717/* maximum number of characters required for output of %lld.
718   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
719   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
720#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
721
722PyObject *
723PyUnicode_FromFormatV(const char *format, va_list vargs)
724{
725    va_list count;
726    Py_ssize_t callcount = 0;
727    PyObject **callresults = NULL;
728    PyObject **callresult = NULL;
729    Py_ssize_t n = 0;
730    int width = 0;
731    int precision = 0;
732    int zeropad;
733    const char* f;
734    Py_UNICODE *s;
735    PyObject *string;
736    /* used by sprintf */
737    char buffer[ITEM_BUFFER_LEN+1];
738    /* use abuffer instead of buffer, if we need more space
739     * (which can happen if there's a format specifier with width). */
740    char *abuffer = NULL;
741    char *realbuffer;
742    Py_ssize_t abuffersize = 0;
743    char fmt[61]; /* should be enough for %0width.precisionlld */
744    const char *copy;
745
746    Py_VA_COPY(count, vargs);
747    /* step 1: count the number of %S/%R/%A/%s format specifications
748     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
749     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
750     * result in an array) */
751    for (f = format; *f; f++) {
752         if (*f == '%') {
753             if (*(f+1)=='%')
754                 continue;
755             if (*(f+1)=='S' || *(f+1)=='R' || *(f+1)=='A' || *(f+1) == 'V')
756                 ++callcount;
757             while (Py_ISDIGIT((unsigned)*f))
758                 width = (width*10) + *f++ - '0';
759             while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
760                 ;
761             if (*f == 's')
762                 ++callcount;
763         }
764         else if (128 <= (unsigned char)*f) {
765             PyErr_Format(PyExc_ValueError,
766                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
767                "string, got a non-ASCII byte: 0x%02x",
768                (unsigned char)*f);
769             return NULL;
770         }
771    }
772    /* step 2: allocate memory for the results of
773     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
774    if (callcount) {
775        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
776        if (!callresults) {
777            PyErr_NoMemory();
778            return NULL;
779        }
780        callresult = callresults;
781    }
782    /* step 3: figure out how large a buffer we need */
783    for (f = format; *f; f++) {
784        if (*f == '%') {
785#ifdef HAVE_LONG_LONG
786            int longlongflag = 0;
787#endif
788            const char* p = f;
789            width = 0;
790            while (Py_ISDIGIT((unsigned)*f))
791                width = (width*10) + *f++ - '0';
792            while (*++f && *f != '%' && !Py_ISALPHA((unsigned)*f))
793                ;
794
795            /* skip the 'l' or 'z' in {%ld, %zd, %lu, %zu} since
796             * they don't affect the amount of space we reserve.
797             */
798            if (*f == 'l') {
799                if (f[1] == 'd' || f[1] == 'u') {
800                    ++f;
801                }
802#ifdef HAVE_LONG_LONG
803                else if (f[1] == 'l' &&
804                         (f[2] == 'd' || f[2] == 'u')) {
805                    longlongflag = 1;
806                    f += 2;
807                }
808#endif
809            }
810            else if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
811                ++f;
812            }
813
814            switch (*f) {
815            case 'c':
816            {
817#ifndef Py_UNICODE_WIDE
818                int ordinal = va_arg(count, int);
819                if (ordinal > 0xffff)
820                    n += 2;
821                else
822                    n++;
823#else
824                (void)va_arg(count, int);
825                n++;
826#endif
827                break;
828            }
829            case '%':
830                n++;
831                break;
832            case 'd': case 'u': case 'i': case 'x':
833                (void) va_arg(count, int);
834#ifdef HAVE_LONG_LONG
835                if (longlongflag) {
836                    if (width < MAX_LONG_LONG_CHARS)
837                        width = MAX_LONG_LONG_CHARS;
838                }
839                else
840#endif
841                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
842                       including sign.  Decimal takes the most space.  This
843                       isn't enough for octal.  If a width is specified we
844                       need more (which we allocate later). */
845                    if (width < MAX_LONG_CHARS)
846                        width = MAX_LONG_CHARS;
847                n += width;
848                /* XXX should allow for large precision here too. */
849                if (abuffersize < width)
850                    abuffersize = width;
851                break;
852            case 's':
853            {
854                /* UTF-8 */
855                const char *s = va_arg(count, const char*);
856                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
857                if (!str)
858                    goto fail;
859                n += PyUnicode_GET_SIZE(str);
860                /* Remember the str and switch to the next slot */
861                *callresult++ = str;
862                break;
863            }
864            case 'U':
865            {
866                PyObject *obj = va_arg(count, PyObject *);
867                assert(obj && PyUnicode_Check(obj));
868                n += PyUnicode_GET_SIZE(obj);
869                break;
870            }
871            case 'V':
872            {
873                PyObject *obj = va_arg(count, PyObject *);
874                const char *str = va_arg(count, const char *);
875                PyObject *str_obj;
876                assert(obj || str);
877                assert(!obj || PyUnicode_Check(obj));
878                if (obj) {
879                    n += PyUnicode_GET_SIZE(obj);
880                    *callresult++ = NULL;
881                }
882                else {
883                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
884                    if (!str_obj)
885                        goto fail;
886                    n += PyUnicode_GET_SIZE(str_obj);
887                    *callresult++ = str_obj;
888                }
889                break;
890            }
891            case 'S':
892            {
893                PyObject *obj = va_arg(count, PyObject *);
894                PyObject *str;
895                assert(obj);
896                str = PyObject_Str(obj);
897                if (!str)
898                    goto fail;
899                n += PyUnicode_GET_SIZE(str);
900                /* Remember the str and switch to the next slot */
901                *callresult++ = str;
902                break;
903            }
904            case 'R':
905            {
906                PyObject *obj = va_arg(count, PyObject *);
907                PyObject *repr;
908                assert(obj);
909                repr = PyObject_Repr(obj);
910                if (!repr)
911                    goto fail;
912                n += PyUnicode_GET_SIZE(repr);
913                /* Remember the repr and switch to the next slot */
914                *callresult++ = repr;
915                break;
916            }
917            case 'A':
918            {
919                PyObject *obj = va_arg(count, PyObject *);
920                PyObject *ascii;
921                assert(obj);
922                ascii = PyObject_ASCII(obj);
923                if (!ascii)
924                    goto fail;
925                n += PyUnicode_GET_SIZE(ascii);
926                /* Remember the repr and switch to the next slot */
927                *callresult++ = ascii;
928                break;
929            }
930            case 'p':
931                (void) va_arg(count, int);
932                /* maximum 64-bit pointer representation:
933                 * 0xffffffffffffffff
934                 * so 19 characters is enough.
935                 * XXX I count 18 -- what's the extra for?
936                 */
937                n += 19;
938                break;
939            default:
940                /* if we stumble upon an unknown
941                   formatting code, copy the rest of
942                   the format string to the output
943                   string. (we cannot just skip the
944                   code, since there's no way to know
945                   what's in the argument list) */
946                n += strlen(p);
947                goto expand;
948            }
949        } else
950            n++;
951    }
952  expand:
953    if (abuffersize > ITEM_BUFFER_LEN) {
954        /* add 1 for sprintf's trailing null byte */
955        abuffer = PyObject_Malloc(abuffersize + 1);
956        if (!abuffer) {
957            PyErr_NoMemory();
958            goto fail;
959        }
960        realbuffer = abuffer;
961    }
962    else
963        realbuffer = buffer;
964    /* step 4: fill the buffer */
965    /* Since we've analyzed how much space we need for the worst case,
966       we don't have to resize the string.
967       There can be no errors beyond this point. */
968    string = PyUnicode_FromUnicode(NULL, n);
969    if (!string)
970        goto fail;
971
972    s = PyUnicode_AS_UNICODE(string);
973    callresult = callresults;
974
975    for (f = format; *f; f++) {
976        if (*f == '%') {
977            const char* p = f++;
978            int longflag = 0;
979            int longlongflag = 0;
980            int size_tflag = 0;
981            zeropad = (*f == '0');
982            /* parse the width.precision part */
983            width = 0;
984            while (Py_ISDIGIT((unsigned)*f))
985                width = (width*10) + *f++ - '0';
986            precision = 0;
987            if (*f == '.') {
988                f++;
989                while (Py_ISDIGIT((unsigned)*f))
990                    precision = (precision*10) + *f++ - '0';
991            }
992            /* Handle %ld, %lu, %lld and %llu. */
993            if (*f == 'l') {
994                if (f[1] == 'd' || f[1] == 'u') {
995                    longflag = 1;
996                    ++f;
997                }
998#ifdef HAVE_LONG_LONG
999                else if (f[1] == 'l' &&
1000                         (f[2] == 'd' || f[2] == 'u')) {
1001                    longlongflag = 1;
1002                    f += 2;
1003                }
1004#endif
1005            }
1006            /* handle the size_t flag. */
1007            if (*f == 'z' && (f[1] == 'd' || f[1] == 'u')) {
1008                size_tflag = 1;
1009                ++f;
1010            }
1011
1012            switch (*f) {
1013            case 'c':
1014            {
1015                int ordinal = va_arg(vargs, int);
1016#ifndef Py_UNICODE_WIDE
1017                if (ordinal > 0xffff) {
1018                    ordinal -= 0x10000;
1019                    *s++ = 0xD800 | (ordinal >> 10);
1020                    *s++ = 0xDC00 | (ordinal & 0x3FF);
1021                } else
1022#endif
1023                *s++ = ordinal;
1024                break;
1025            }
1026            case 'd':
1027                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1028                        width, precision, 'd');
1029                if (longflag)
1030                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1031#ifdef HAVE_LONG_LONG
1032                else if (longlongflag)
1033                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1034#endif
1035                else if (size_tflag)
1036                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1037                else
1038                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1039                appendstring(realbuffer);
1040                break;
1041            case 'u':
1042                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1043                        width, precision, 'u');
1044                if (longflag)
1045                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1046#ifdef HAVE_LONG_LONG
1047                else if (longlongflag)
1048                    sprintf(realbuffer, fmt, va_arg(vargs,
1049                                                    unsigned PY_LONG_LONG));
1050#endif
1051                else if (size_tflag)
1052                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1053                else
1054                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1055                appendstring(realbuffer);
1056                break;
1057            case 'i':
1058                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'i');
1059                sprintf(realbuffer, fmt, va_arg(vargs, int));
1060                appendstring(realbuffer);
1061                break;
1062            case 'x':
1063                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1064                sprintf(realbuffer, fmt, va_arg(vargs, int));
1065                appendstring(realbuffer);
1066                break;
1067            case 's':
1068            {
1069                /* unused, since we already have the result */
1070                (void) va_arg(vargs, char *);
1071                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1072                                PyUnicode_GET_SIZE(*callresult));
1073                s += PyUnicode_GET_SIZE(*callresult);
1074                /* We're done with the unicode()/repr() => forget it */
1075                Py_DECREF(*callresult);
1076                /* switch to next unicode()/repr() result */
1077                ++callresult;
1078                break;
1079            }
1080            case 'U':
1081            {
1082                PyObject *obj = va_arg(vargs, PyObject *);
1083                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1084                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1085                s += size;
1086                break;
1087            }
1088            case 'V':
1089            {
1090                PyObject *obj = va_arg(vargs, PyObject *);
1091                va_arg(vargs, const char *);
1092                if (obj) {
1093                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1094                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1095                    s += size;
1096                } else {
1097                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1098                                    PyUnicode_GET_SIZE(*callresult));
1099                    s += PyUnicode_GET_SIZE(*callresult);
1100                    Py_DECREF(*callresult);
1101                }
1102                ++callresult;
1103                break;
1104            }
1105            case 'S':
1106            case 'R':
1107            case 'A':
1108            {
1109                Py_UNICODE *ucopy;
1110                Py_ssize_t usize;
1111                Py_ssize_t upos;
1112                /* unused, since we already have the result */
1113                (void) va_arg(vargs, PyObject *);
1114                ucopy = PyUnicode_AS_UNICODE(*callresult);
1115                usize = PyUnicode_GET_SIZE(*callresult);
1116                for (upos = 0; upos<usize;)
1117                    *s++ = ucopy[upos++];
1118                /* We're done with the unicode()/repr() => forget it */
1119                Py_DECREF(*callresult);
1120                /* switch to next unicode()/repr() result */
1121                ++callresult;
1122                break;
1123            }
1124            case 'p':
1125                sprintf(buffer, "%p", va_arg(vargs, void*));
1126                /* %p is ill-defined:  ensure leading 0x. */
1127                if (buffer[1] == 'X')
1128                    buffer[1] = 'x';
1129                else if (buffer[1] != 'x') {
1130                    memmove(buffer+2, buffer, strlen(buffer)+1);
1131                    buffer[0] = '0';
1132                    buffer[1] = 'x';
1133                }
1134                appendstring(buffer);
1135                break;
1136            case '%':
1137                *s++ = '%';
1138                break;
1139            default:
1140                appendstring(p);
1141                goto end;
1142            }
1143        }
1144        else
1145            *s++ = *f;
1146    }
1147
1148  end:
1149    if (callresults)
1150        PyObject_Free(callresults);
1151    if (abuffer)
1152        PyObject_Free(abuffer);
1153    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1154    return string;
1155  fail:
1156    if (callresults) {
1157        PyObject **callresult2 = callresults;
1158        while (callresult2 < callresult) {
1159            Py_XDECREF(*callresult2);
1160            ++callresult2;
1161        }
1162        PyObject_Free(callresults);
1163    }
1164    if (abuffer)
1165        PyObject_Free(abuffer);
1166    return NULL;
1167}
1168
1169#undef appendstring
1170
1171PyObject *
1172PyUnicode_FromFormat(const char *format, ...)
1173{
1174    PyObject* ret;
1175    va_list vargs;
1176
1177#ifdef HAVE_STDARG_PROTOTYPES
1178    va_start(vargs, format);
1179#else
1180    va_start(vargs);
1181#endif
1182    ret = PyUnicode_FromFormatV(format, vargs);
1183    va_end(vargs);
1184    return ret;
1185}
1186
1187/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1188   convert a Unicode object to a wide character string.
1189
1190   - If w is NULL: return the number of wide characters (including the nul
1191     character) required to convert the unicode object. Ignore size argument.
1192
1193   - Otherwise: return the number of wide characters (excluding the nul
1194     character) written into w. Write at most size wide characters (including
1195     the nul character). */
1196static Py_ssize_t
1197unicode_aswidechar(PyUnicodeObject *unicode,
1198                   wchar_t *w,
1199                   Py_ssize_t size)
1200{
1201#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1202    Py_ssize_t res;
1203    if (w != NULL) {
1204        res = PyUnicode_GET_SIZE(unicode);
1205        if (size > res)
1206            size = res + 1;
1207        else
1208            res = size;
1209        memcpy(w, unicode->str, size * sizeof(wchar_t));
1210        return res;
1211    }
1212    else
1213        return PyUnicode_GET_SIZE(unicode) + 1;
1214#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1215    register const Py_UNICODE *u;
1216    const Py_UNICODE *uend;
1217    const wchar_t *worig, *wend;
1218    Py_ssize_t nchar;
1219
1220    u = PyUnicode_AS_UNICODE(unicode);
1221    uend = u + PyUnicode_GET_SIZE(unicode);
1222    if (w != NULL) {
1223        worig = w;
1224        wend = w + size;
1225        while (u != uend && w != wend) {
1226            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1227                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1228            {
1229                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1230                u += 2;
1231            }
1232            else {
1233                *w = *u;
1234                u++;
1235            }
1236            w++;
1237        }
1238        if (w != wend)
1239            *w = L'\0';
1240        return w - worig;
1241    }
1242    else {
1243        nchar = 1; /* nul character at the end */
1244        while (u != uend) {
1245            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1246                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1247                u += 2;
1248            else
1249                u++;
1250            nchar++;
1251        }
1252    }
1253    return nchar;
1254#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1255    register Py_UNICODE *u, *uend, ordinal;
1256    register Py_ssize_t i;
1257    wchar_t *worig, *wend;
1258    Py_ssize_t nchar;
1259
1260    u = PyUnicode_AS_UNICODE(unicode);
1261    uend = u + PyUnicode_GET_SIZE(u);
1262    if (w != NULL) {
1263        worig = w;
1264        wend = w + size;
1265        while (u != uend && w != wend) {
1266            ordinal = *u;
1267            if (ordinal > 0xffff) {
1268                ordinal -= 0x10000;
1269                *w++ = 0xD800 | (ordinal >> 10);
1270                *w++ = 0xDC00 | (ordinal & 0x3FF);
1271            }
1272            else
1273                *w++ = ordinal;
1274            u++;
1275        }
1276        if (w != wend)
1277            *w = 0;
1278        return w - worig;
1279    }
1280    else {
1281        nchar = 1; /* nul character */
1282        while (u != uend) {
1283            if (*u > 0xffff)
1284                nchar += 2;
1285            else
1286                nchar++;
1287            u++;
1288        }
1289        return nchar;
1290    }
1291#else
1292#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1293#endif
1294}
1295
1296Py_ssize_t
1297PyUnicode_AsWideChar(PyObject *unicode,
1298                     wchar_t *w,
1299                     Py_ssize_t size)
1300{
1301    if (unicode == NULL) {
1302        PyErr_BadInternalCall();
1303        return -1;
1304    }
1305    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
1306}
1307
1308wchar_t*
1309PyUnicode_AsWideCharString(PyObject *unicode,
1310                           Py_ssize_t *size)
1311{
1312    wchar_t* buffer;
1313    Py_ssize_t buflen;
1314
1315    if (unicode == NULL) {
1316        PyErr_BadInternalCall();
1317        return NULL;
1318    }
1319
1320    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1321    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1322        PyErr_NoMemory();
1323        return NULL;
1324    }
1325
1326    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1327    if (buffer == NULL) {
1328        PyErr_NoMemory();
1329        return NULL;
1330    }
1331    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1332    if (size != NULL)
1333        *size = buflen;
1334    return buffer;
1335}
1336
1337#endif
1338
1339PyObject *PyUnicode_FromOrdinal(int ordinal)
1340{
1341    Py_UNICODE s[2];
1342
1343    if (ordinal < 0 || ordinal > 0x10ffff) {
1344        PyErr_SetString(PyExc_ValueError,
1345                        "chr() arg not in range(0x110000)");
1346        return NULL;
1347    }
1348
1349#ifndef Py_UNICODE_WIDE
1350    if (ordinal > 0xffff) {
1351        ordinal -= 0x10000;
1352        s[0] = 0xD800 | (ordinal >> 10);
1353        s[1] = 0xDC00 | (ordinal & 0x3FF);
1354        return PyUnicode_FromUnicode(s, 2);
1355    }
1356#endif
1357
1358    s[0] = (Py_UNICODE)ordinal;
1359    return PyUnicode_FromUnicode(s, 1);
1360}
1361
1362PyObject *PyUnicode_FromObject(register PyObject *obj)
1363{
1364    /* XXX Perhaps we should make this API an alias of
1365       PyObject_Str() instead ?! */
1366    if (PyUnicode_CheckExact(obj)) {
1367        Py_INCREF(obj);
1368        return obj;
1369    }
1370    if (PyUnicode_Check(obj)) {
1371        /* For a Unicode subtype that's not a Unicode object,
1372           return a true Unicode object with the same data. */
1373        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1374                                     PyUnicode_GET_SIZE(obj));
1375    }
1376    PyErr_Format(PyExc_TypeError,
1377                 "Can't convert '%.100s' object to str implicitly",
1378                 Py_TYPE(obj)->tp_name);
1379    return NULL;
1380}
1381
1382PyObject *PyUnicode_FromEncodedObject(register PyObject *obj,
1383                                      const char *encoding,
1384                                      const char *errors)
1385{
1386    Py_buffer buffer;
1387    PyObject *v;
1388
1389    if (obj == NULL) {
1390        PyErr_BadInternalCall();
1391        return NULL;
1392    }
1393
1394    /* Decoding bytes objects is the most common case and should be fast */
1395    if (PyBytes_Check(obj)) {
1396        if (PyBytes_GET_SIZE(obj) == 0) {
1397            Py_INCREF(unicode_empty);
1398            v = (PyObject *) unicode_empty;
1399        }
1400        else {
1401            v = PyUnicode_Decode(
1402                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1403                    encoding, errors);
1404        }
1405        return v;
1406    }
1407
1408    if (PyUnicode_Check(obj)) {
1409        PyErr_SetString(PyExc_TypeError,
1410                        "decoding str is not supported");
1411        return NULL;
1412    }
1413
1414    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1415    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1416        PyErr_Format(PyExc_TypeError,
1417                     "coercing to str: need bytes, bytearray "
1418                     "or buffer-like object, %.80s found",
1419                     Py_TYPE(obj)->tp_name);
1420        return NULL;
1421    }
1422
1423    if (buffer.len == 0) {
1424        Py_INCREF(unicode_empty);
1425        v = (PyObject *) unicode_empty;
1426    }
1427    else
1428        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1429
1430    PyBuffer_Release(&buffer);
1431    return v;
1432}
1433
/* Convert encoding to lower case and replace '_' with '-' in order to
   catch e.g. UTF_8.

   encoding  - nul-terminated encoding name to normalize
   lower     - output buffer receiving the nul-terminated normalized name
   lower_len - total size of the lower buffer, in bytes

   Return 0 on error (encoding is longer than lower_len-1, or lower_len is
   zero so that not even the terminating nul fits), 1 on success.  On error
   the content of lower is not nul-terminated and must not be used. */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const char *e;
    char *l;
    char *l_end;

    /* With an empty buffer, lower_len - 1 would wrap around and the bound
       check below would be meaningless. */
    if (lower_len == 0)
        return 0;

    e = encoding;
    l = lower;
    /* last slot of the buffer, reserved for the terminating nul */
    l_end = &lower[lower_len - 1];
    while (*e) {
        if (l == l_end)
            return 0;
        if (Py_ISUPPER(*e)) {
            *l++ = Py_TOLOWER(*e++);
        }
        else if (*e == '_') {
            *l++ = '-';
            e++;
        }
        else {
            *l++ = *e++;
        }
    }
    *l = '\0';
    return 1;
}
1466
1467PyObject *PyUnicode_Decode(const char *s,
1468                           Py_ssize_t size,
1469                           const char *encoding,
1470                           const char *errors)
1471{
1472    PyObject *buffer = NULL, *unicode;
1473    Py_buffer info;
1474    char lower[11];  /* Enough for any encoding shortcut */
1475
1476    if (encoding == NULL)
1477        encoding = PyUnicode_GetDefaultEncoding();
1478
1479    /* Shortcuts for common default encodings */
1480    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1481        if (strcmp(lower, "utf-8") == 0)
1482            return PyUnicode_DecodeUTF8(s, size, errors);
1483        else if ((strcmp(lower, "latin-1") == 0) ||
1484                 (strcmp(lower, "iso-8859-1") == 0))
1485            return PyUnicode_DecodeLatin1(s, size, errors);
1486#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1487        else if (strcmp(lower, "mbcs") == 0)
1488            return PyUnicode_DecodeMBCS(s, size, errors);
1489#endif
1490        else if (strcmp(lower, "ascii") == 0)
1491            return PyUnicode_DecodeASCII(s, size, errors);
1492        else if (strcmp(lower, "utf-16") == 0)
1493            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1494        else if (strcmp(lower, "utf-32") == 0)
1495            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1496    }
1497
1498    /* Decode via the codec registry */
1499    buffer = NULL;
1500    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1501        goto onError;
1502    buffer = PyMemoryView_FromBuffer(&info);
1503    if (buffer == NULL)
1504        goto onError;
1505    unicode = PyCodec_Decode(buffer, encoding, errors);
1506    if (unicode == NULL)
1507        goto onError;
1508    if (!PyUnicode_Check(unicode)) {
1509        PyErr_Format(PyExc_TypeError,
1510                     "decoder did not return a str object (type=%.400s)",
1511                     Py_TYPE(unicode)->tp_name);
1512        Py_DECREF(unicode);
1513        goto onError;
1514    }
1515    Py_DECREF(buffer);
1516    return unicode;
1517
1518  onError:
1519    Py_XDECREF(buffer);
1520    return NULL;
1521}
1522
1523PyObject *PyUnicode_AsDecodedObject(PyObject *unicode,
1524                                    const char *encoding,
1525                                    const char *errors)
1526{
1527    PyObject *v;
1528
1529    if (!PyUnicode_Check(unicode)) {
1530        PyErr_BadArgument();
1531        goto onError;
1532    }
1533
1534    if (encoding == NULL)
1535        encoding = PyUnicode_GetDefaultEncoding();
1536
1537    /* Decode via the codec registry */
1538    v = PyCodec_Decode(unicode, encoding, errors);
1539    if (v == NULL)
1540        goto onError;
1541    return v;
1542
1543  onError:
1544    return NULL;
1545}
1546
1547PyObject *PyUnicode_AsDecodedUnicode(PyObject *unicode,
1548                                     const char *encoding,
1549                                     const char *errors)
1550{
1551    PyObject *v;
1552
1553    if (!PyUnicode_Check(unicode)) {
1554        PyErr_BadArgument();
1555        goto onError;
1556    }
1557
1558    if (encoding == NULL)
1559        encoding = PyUnicode_GetDefaultEncoding();
1560
1561    /* Decode via the codec registry */
1562    v = PyCodec_Decode(unicode, encoding, errors);
1563    if (v == NULL)
1564        goto onError;
1565    if (!PyUnicode_Check(v)) {
1566        PyErr_Format(PyExc_TypeError,
1567                     "decoder did not return a str object (type=%.400s)",
1568                     Py_TYPE(v)->tp_name);
1569        Py_DECREF(v);
1570        goto onError;
1571    }
1572    return v;
1573
1574  onError:
1575    return NULL;
1576}
1577
1578PyObject *PyUnicode_Encode(const Py_UNICODE *s,
1579                           Py_ssize_t size,
1580                           const char *encoding,
1581                           const char *errors)
1582{
1583    PyObject *v, *unicode;
1584
1585    unicode = PyUnicode_FromUnicode(s, size);
1586    if (unicode == NULL)
1587        return NULL;
1588    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1589    Py_DECREF(unicode);
1590    return v;
1591}
1592
1593PyObject *PyUnicode_AsEncodedObject(PyObject *unicode,
1594                                    const char *encoding,
1595                                    const char *errors)
1596{
1597    PyObject *v;
1598
1599    if (!PyUnicode_Check(unicode)) {
1600        PyErr_BadArgument();
1601        goto onError;
1602    }
1603
1604    if (encoding == NULL)
1605        encoding = PyUnicode_GetDefaultEncoding();
1606
1607    /* Encode via the codec registry */
1608    v = PyCodec_Encode(unicode, encoding, errors);
1609    if (v == NULL)
1610        goto onError;
1611    return v;
1612
1613  onError:
1614    return NULL;
1615}
1616
1617PyObject *
1618PyUnicode_EncodeFSDefault(PyObject *unicode)
1619{
1620#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1621    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1622                                PyUnicode_GET_SIZE(unicode),
1623                                NULL);
1624#elif defined(__APPLE__)
1625    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1626                                PyUnicode_GET_SIZE(unicode),
1627                                "surrogateescape");
1628#else
1629    PyInterpreterState *interp = PyThreadState_GET()->interp;
1630    /* Bootstrap check: if the filesystem codec is implemented in Python, we
1631       cannot use it to encode and decode filenames before it is loaded. Load
1632       the Python codec requires to encode at least its own filename. Use the C
1633       version of the locale codec until the codec registry is initialized and
1634       the Python codec is loaded.
1635
1636       Py_FileSystemDefaultEncoding is shared between all interpreters, we
1637       cannot only rely on it: check also interp->fscodec_initialized for
1638       subinterpreters. */
1639    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
1640        return PyUnicode_AsEncodedString(unicode,
1641                                         Py_FileSystemDefaultEncoding,
1642                                         "surrogateescape");
1643    }
1644    else {
1645        /* locale encoding with surrogateescape */
1646        wchar_t *wchar;
1647        char *bytes;
1648        PyObject *bytes_obj;
1649        size_t error_pos;
1650
1651        wchar = PyUnicode_AsWideCharString(unicode, NULL);
1652        if (wchar == NULL)
1653            return NULL;
1654        bytes = _Py_wchar2char(wchar, &error_pos);
1655        if (bytes == NULL) {
1656            if (error_pos != (size_t)-1) {
1657                char *errmsg = strerror(errno);
1658                PyObject *exc = NULL;
1659                if (errmsg == NULL)
1660                    errmsg = "Py_wchar2char() failed";
1661                raise_encode_exception(&exc,
1662                    "filesystemencoding",
1663                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
1664                    error_pos, error_pos+1,
1665                    errmsg);
1666                Py_XDECREF(exc);
1667            }
1668            else
1669                PyErr_NoMemory();
1670            PyMem_Free(wchar);
1671            return NULL;
1672        }
1673        PyMem_Free(wchar);
1674
1675        bytes_obj = PyBytes_FromString(bytes);
1676        PyMem_Free(bytes);
1677        return bytes_obj;
1678    }
1679#endif
1680}
1681
1682PyObject *PyUnicode_AsEncodedString(PyObject *unicode,
1683                                    const char *encoding,
1684                                    const char *errors)
1685{
1686    PyObject *v;
1687    char lower[11];  /* Enough for any encoding shortcut */
1688
1689    if (!PyUnicode_Check(unicode)) {
1690        PyErr_BadArgument();
1691        return NULL;
1692    }
1693
1694    if (encoding == NULL)
1695        encoding = PyUnicode_GetDefaultEncoding();
1696
1697    /* Shortcuts for common default encodings */
1698    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1699        if (strcmp(lower, "utf-8") == 0)
1700            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1701                                        PyUnicode_GET_SIZE(unicode),
1702                                        errors);
1703        else if ((strcmp(lower, "latin-1") == 0) ||
1704                 (strcmp(lower, "iso-8859-1") == 0))
1705            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1706                                          PyUnicode_GET_SIZE(unicode),
1707                                          errors);
1708#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1709        else if (strcmp(lower, "mbcs") == 0)
1710            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1711                                        PyUnicode_GET_SIZE(unicode),
1712                                        errors);
1713#endif
1714        else if (strcmp(lower, "ascii") == 0)
1715            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1716                                         PyUnicode_GET_SIZE(unicode),
1717                                         errors);
1718    }
1719    /* During bootstrap, we may need to find the encodings
1720       package, to load the file system encoding, and require the
1721       file system encoding in order to load the encodings
1722       package.
1723
1724       Break out of this dependency by assuming that the path to
1725       the encodings module is ASCII-only.  XXX could try wcstombs
1726       instead, if the file system encoding is the locale's
1727       encoding. */
1728    if (Py_FileSystemDefaultEncoding &&
1729             strcmp(encoding, Py_FileSystemDefaultEncoding) == 0 &&
1730             !PyThreadState_GET()->interp->codecs_initialized)
1731        return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1732                                     PyUnicode_GET_SIZE(unicode),
1733                                     errors);
1734
1735    /* Encode via the codec registry */
1736    v = PyCodec_Encode(unicode, encoding, errors);
1737    if (v == NULL)
1738        return NULL;
1739
1740    /* The normal path */
1741    if (PyBytes_Check(v))
1742        return v;
1743
1744    /* If the codec returns a buffer, raise a warning and convert to bytes */
1745    if (PyByteArray_Check(v)) {
1746        int error;
1747        PyObject *b;
1748
1749        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1750            "encoder %s returned bytearray instead of bytes",
1751            encoding);
1752        if (error) {
1753            Py_DECREF(v);
1754            return NULL;
1755        }
1756
1757        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1758        Py_DECREF(v);
1759        return b;
1760    }
1761
1762    PyErr_Format(PyExc_TypeError,
1763                 "encoder did not return a bytes object (type=%.400s)",
1764                 Py_TYPE(v)->tp_name);
1765    Py_DECREF(v);
1766    return NULL;
1767}
1768
1769PyObject *PyUnicode_AsEncodedUnicode(PyObject *unicode,
1770                                     const char *encoding,
1771                                     const char *errors)
1772{
1773    PyObject *v;
1774
1775    if (!PyUnicode_Check(unicode)) {
1776        PyErr_BadArgument();
1777        goto onError;
1778    }
1779
1780    if (encoding == NULL)
1781        encoding = PyUnicode_GetDefaultEncoding();
1782
1783    /* Encode via the codec registry */
1784    v = PyCodec_Encode(unicode, encoding, errors);
1785    if (v == NULL)
1786        goto onError;
1787    if (!PyUnicode_Check(v)) {
1788        PyErr_Format(PyExc_TypeError,
1789                     "encoder did not return an str object (type=%.400s)",
1790                     Py_TYPE(v)->tp_name);
1791        Py_DECREF(v);
1792        goto onError;
1793    }
1794    return v;
1795
1796  onError:
1797    return NULL;
1798}
1799
1800PyObject *_PyUnicode_AsDefaultEncodedString(PyObject *unicode,
1801                                            const char *errors)
1802{
1803    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1804    if (v)
1805        return v;
1806    if (errors != NULL)
1807        Py_FatalError("non-NULL encoding in _PyUnicode_AsDefaultEncodedString");
1808    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1809                             PyUnicode_GET_SIZE(unicode),
1810                             NULL);
1811    if (!v)
1812        return NULL;
1813    ((PyUnicodeObject *)unicode)->defenc = v;
1814    return v;
1815}
1816
1817PyObject*
1818PyUnicode_DecodeFSDefault(const char *s) {
1819    Py_ssize_t size = (Py_ssize_t)strlen(s);
1820    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1821}
1822
1823PyObject*
1824PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1825{
1826#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
1827    return PyUnicode_DecodeMBCS(s, size, NULL);
1828#elif defined(__APPLE__)
1829    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1830#else
1831    PyInterpreterState *interp = PyThreadState_GET()->interp;
1832    /* Bootstrap check: if the filesystem codec is implemented in Python, we
1833       cannot use it to encode and decode filenames before it is loaded. Load
1834       the Python codec requires to encode at least its own filename. Use the C
1835       version of the locale codec until the codec registry is initialized and
1836       the Python codec is loaded.
1837
1838       Py_FileSystemDefaultEncoding is shared between all interpreters, we
1839       cannot only rely on it: check also interp->fscodec_initialized for
1840       subinterpreters. */
1841    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
1842        return PyUnicode_Decode(s, size,
1843                                Py_FileSystemDefaultEncoding,
1844                                "surrogateescape");
1845    }
1846    else {
1847        /* locale encoding with surrogateescape */
1848        wchar_t *wchar;
1849        PyObject *unicode;
1850        size_t len;
1851
1852        if (s[size] != '\0' || size != strlen(s)) {
1853            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1854            return NULL;
1855        }
1856
1857        wchar = _Py_char2wchar(s, &len);
1858        if (wchar == NULL)
1859            return PyErr_NoMemory();
1860
1861        unicode = PyUnicode_FromWideChar(wchar, len);
1862        PyMem_Free(wchar);
1863        return unicode;
1864    }
1865#endif
1866}
1867
1868
1869int
1870PyUnicode_FSConverter(PyObject* arg, void* addr)
1871{
1872    PyObject *output = NULL;
1873    Py_ssize_t size;
1874    void *data;
1875    if (arg == NULL) {
1876        Py_DECREF(*(PyObject**)addr);
1877        return 1;
1878    }
1879    if (PyBytes_Check(arg)) {
1880        output = arg;
1881        Py_INCREF(output);
1882    }
1883    else {
1884        arg = PyUnicode_FromObject(arg);
1885        if (!arg)
1886            return 0;
1887        output = PyUnicode_EncodeFSDefault(arg);
1888        Py_DECREF(arg);
1889        if (!output)
1890            return 0;
1891        if (!PyBytes_Check(output)) {
1892            Py_DECREF(output);
1893            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1894            return 0;
1895        }
1896    }
1897    size = PyBytes_GET_SIZE(output);
1898    data = PyBytes_AS_STRING(output);
1899    if (size != strlen(data)) {
1900        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1901        Py_DECREF(output);
1902        return 0;
1903    }
1904    *(PyObject**)addr = output;
1905    return Py_CLEANUP_SUPPORTED;
1906}
1907
1908
1909int
1910PyUnicode_FSDecoder(PyObject* arg, void* addr)
1911{
1912    PyObject *output = NULL;
1913    Py_ssize_t size;
1914    void *data;
1915    if (arg == NULL) {
1916        Py_DECREF(*(PyObject**)addr);
1917        return 1;
1918    }
1919    if (PyUnicode_Check(arg)) {
1920        output = arg;
1921        Py_INCREF(output);
1922    }
1923    else {
1924        arg = PyBytes_FromObject(arg);
1925        if (!arg)
1926            return 0;
1927        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1928                                                  PyBytes_GET_SIZE(arg));
1929        Py_DECREF(arg);
1930        if (!output)
1931            return 0;
1932        if (!PyUnicode_Check(output)) {
1933            Py_DECREF(output);
1934            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1935            return 0;
1936        }
1937    }
1938    size = PyUnicode_GET_SIZE(output);
1939    data = PyUnicode_AS_UNICODE(output);
1940    if (size != Py_UNICODE_strlen(data)) {
1941        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1942        Py_DECREF(output);
1943        return 0;
1944    }
1945    *(PyObject**)addr = output;
1946    return Py_CLEANUP_SUPPORTED;
1947}
1948
1949
1950char*
1951_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1952{
1953    PyObject *bytes;
1954    if (!PyUnicode_Check(unicode)) {
1955        PyErr_BadArgument();
1956        return NULL;
1957    }
1958    bytes = _PyUnicode_AsDefaultEncodedString(unicode, NULL);
1959    if (bytes == NULL)
1960        return NULL;
1961    if (psize != NULL)
1962        *psize = PyBytes_GET_SIZE(bytes);
1963    return PyBytes_AS_STRING(bytes);
1964}
1965
1966char*
1967_PyUnicode_AsString(PyObject *unicode)
1968{
1969    return _PyUnicode_AsStringAndSize(unicode, NULL);
1970}
1971
1972Py_UNICODE *PyUnicode_AsUnicode(PyObject *unicode)
1973{
1974    if (!PyUnicode_Check(unicode)) {
1975        PyErr_BadArgument();
1976        goto onError;
1977    }
1978    return PyUnicode_AS_UNICODE(unicode);
1979
1980  onError:
1981    return NULL;
1982}
1983
1984Py_ssize_t PyUnicode_GetSize(PyObject *unicode)
1985{
1986    if (!PyUnicode_Check(unicode)) {
1987        PyErr_BadArgument();
1988        goto onError;
1989    }
1990    return PyUnicode_GET_SIZE(unicode);
1991
1992  onError:
1993    return -1;
1994}
1995
/* Return the name of the default encoding; it is always "utf-8". */
const char *PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2000
2001/* create or adjust a UnicodeDecodeError */
2002static void
2003make_decode_exception(PyObject **exceptionObject,
2004                      const char *encoding,
2005                      const char *input, Py_ssize_t length,
2006                      Py_ssize_t startpos, Py_ssize_t endpos,
2007                      const char *reason)
2008{
2009    if (*exceptionObject == NULL) {
2010        *exceptionObject = PyUnicodeDecodeError_Create(
2011            encoding, input, length, startpos, endpos, reason);
2012    }
2013    else {
2014        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2015            goto onError;
2016        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2017            goto onError;
2018        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2019            goto onError;
2020    }
2021    return;
2022
2023onError:
2024    Py_DECREF(*exceptionObject);
2025    *exceptionObject = NULL;
2026}
2027
2028/* error handling callback helper:
2029   build arguments, call the callback and check the arguments,
2030   if no exception occurred, copy the replacement to the output
2031   and adjust various state variables.
2032   return 0 on success, -1 on error
2033*/
2034
2035static
2036int unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2037                                     const char *encoding, const char *reason,
2038                                     const char **input, const char **inend, Py_ssize_t *startinpos,
2039                                     Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2040                                     PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
2041{
2042    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
2043
2044    PyObject *restuple = NULL;
2045    PyObject *repunicode = NULL;
2046    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
2047    Py_ssize_t insize;
2048    Py_ssize_t requiredsize;
2049    Py_ssize_t newpos;
2050    Py_UNICODE *repptr;
2051    PyObject *inputobj = NULL;
2052    Py_ssize_t repsize;
2053    int res = -1;
2054
2055    if (*errorHandler == NULL) {
2056        *errorHandler = PyCodec_LookupError(errors);
2057        if (*errorHandler == NULL)
2058            goto onError;
2059    }
2060
2061    make_decode_exception(exceptionObject,
2062        encoding,
2063        *input, *inend - *input,
2064        *startinpos, *endinpos,
2065        reason);
2066    if (*exceptionObject == NULL)
2067        goto onError;
2068
2069    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2070    if (restuple == NULL)
2071        goto onError;
2072    if (!PyTuple_Check(restuple)) {
2073        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2074        goto onError;
2075    }
2076    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2077        goto onError;
2078
2079    /* Copy back the bytes variables, which might have been modified by the
2080       callback */
2081    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2082    if (!inputobj)
2083        goto onError;
2084    if (!PyBytes_Check(inputobj)) {
2085        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2086    }
2087    *input = PyBytes_AS_STRING(inputobj);
2088    insize = PyBytes_GET_SIZE(inputobj);
2089    *inend = *input + insize;
2090    /* we can DECREF safely, as the exception has another reference,
2091       so the object won't go away. */
2092    Py_DECREF(inputobj);
2093
2094    if (newpos<0)
2095        newpos = insize+newpos;
2096    if (newpos<0 || newpos>insize) {
2097        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2098        goto onError;
2099    }
2100
2101    /* need more space? (at least enough for what we
2102       have+the replacement+the rest of the string (starting
2103       at the new input position), so we won't have to check space
2104       when there are no errors in the rest of the string) */
2105    repptr = PyUnicode_AS_UNICODE(repunicode);
2106    repsize = PyUnicode_GET_SIZE(repunicode);
2107    requiredsize = *outpos + repsize + insize-newpos;
2108    if (requiredsize > outsize) {
2109        if (requiredsize<2*outsize)
2110            requiredsize = 2*outsize;
2111        if (_PyUnicode_Resize(output, requiredsize) < 0)
2112            goto onError;
2113        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2114    }
2115    *endinpos = newpos;
2116    *inptr = *input + newpos;
2117    Py_UNICODE_COPY(*outptr, repptr, repsize);
2118    *outptr += repsize;
2119    *outpos += repsize;
2120
2121    /* we made it! */
2122    res = 0;
2123
2124  onError:
2125    Py_XDECREF(restuple);
2126    return res;
2127}
2128
2129/* --- UTF-7 Codec -------------------------------------------------------- */
2130
2131/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2132
/* Helper macros for the modified base-64 alphabet used by UTF-7.
   Arguments are evaluated more than once, so pass side-effect free
   expressions only. */

/* True if c belongs to the base-64 alphabet (A-Z, a-z, 0-9, '+', '/'). */

#define IS_BASE64(c)                        \
    ((c) == '+' || (c) == '/' ||            \
     ('0' <= (c) && (c) <= '9') ||          \
     ('A' <= (c) && (c) <= 'Z') ||          \
     ('a' <= (c) && (c) <= 'z'))

/* Base-64 value (0..63) of c; only meaningful when IS_BASE64(c) holds. */

#define FROM_BASE64(c)                                      \
    (('A' <= (c) && (c) <= 'Z') ? (c) - 'A' :               \
     ('a' <= (c) && (c) <= 'z') ? (c) - 'a' + 26 :          \
     ('0' <= (c) && (c) <= '9') ? (c) - '0' + 52 :          \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n. */

#define TO_BASE64(n)                        \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZ"           \
     "abcdefghijklmnopqrstuvwxyz"           \
     "0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if a byte found outside a shift sequence decodes to
 * itself.  Decoding is permissive: every ASCII byte qualifies except '+',
 * which opens a base-64 section. */

#define DECODE_DIRECT(c)                    \
    ((c) != '+' && (c) <= 127)
/* Classification of the 128 ASCII characters for the UTF-7 encoder
 * (see RFC2152).  Each entry is one of:
 *
 *   0  "Set D":      alphanumerics and  ' ( ) , - . / : ?
 *   1  "Set O":      ! " # $ % & * ; < = > @ [ ] ^ _ ` { | }
 *   2  whitespace:   ht nl cr sp
 *   3  special:      everything else, i.e. + \ ~ and the non-printing
 *                    codes 0-8, 11-12, 14-31 and 127; these must always be
 *                    base-64 encoded
 */

static char utf7_category[128] = {
    /* 0x00-0x0F: nul soh stx etx eot enq ack bel bs ht nl vt np cr so si */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 2, 2, 3, 3, 2, 3, 3,
    /* 0x10-0x1F: dle dc1 dc2 dc3 dc4 nak syn etb can em sub esc fs gs rs us */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    /* 0x20-0x2F: sp ! " # $ % & ' ( ) * + , - . / */
    2, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 3, 0, 0, 0, 0,
    /* 0x30-0x3F: 0 1 2 3 4 5 6 7 8 9 : ; < = > ? */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0,
    /* 0x40-0x4F: @ A B C D E F G H I J K L M N O */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x50-0x5F: P Q R S T U V W X Y Z [ \ ] ^ _ */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 3, 1, 1, 1,
    /* 0x60-0x6F: ` a b c d e f g h i j k l m n o */
    1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x70-0x7F: p q r s t u v w x y z { | } ~ del */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 3, 3,
};
2197
/* ENCODE_DIRECT: true if character c may be written to the output as
 * itself.  Only non-NUL ASCII characters qualify: Set D always, Set O only
 * when directO is true, and whitespace only when directWS is true.
 * RFC2152 leaves the treatment of Set O and whitespace to the application,
 * hence the two flags.  The range test comes first so that utf7_category
 * is never indexed out of bounds. */

#define ENCODE_DIRECT(c, directO, directWS)                     \
    ((c) > 0 && (c) < 128 &&                                    \
     ((utf7_category[(c)] == 0) ||                              \
      ((utf7_category[(c)] == 1) && (directO)) ||               \
      ((utf7_category[(c)] == 2) && (directWS))))
2209
2210PyObject *PyUnicode_DecodeUTF7(const char *s,
2211                               Py_ssize_t size,
2212                               const char *errors)
2213{
2214    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2215}
2216
2217/* The decoder.  The only state we preserve is our read position,
2218 * i.e. how many characters we have consumed.  So if we end in the
2219 * middle of a shift sequence we have to back off the read position
2220 * and the output to the beginning of the sequence, otherwise we lose
2221 * all the shift state (seen bits, number of bits seen, high
2222 * surrogate). */
2223
2224PyObject *PyUnicode_DecodeUTF7Stateful(const char *s,
2225                                       Py_ssize_t size,
2226                                       const char *errors,
2227                                       Py_ssize_t *consumed)
2228{
2229    const char *starts = s;
2230    Py_ssize_t startinpos;
2231    Py_ssize_t endinpos;
2232    Py_ssize_t outpos;
2233    const char *e;
2234    PyUnicodeObject *unicode;
2235    Py_UNICODE *p;
2236    const char *errmsg = "";
2237    int inShift = 0;
2238    Py_UNICODE *shiftOutStart;
2239    unsigned int base64bits = 0;
2240    unsigned long base64buffer = 0;
2241    Py_UNICODE surrogate = 0;
2242    PyObject *errorHandler = NULL;
2243    PyObject *exc = NULL;
2244
2245    unicode = _PyUnicode_New(size);
2246    if (!unicode)
2247        return NULL;
2248    if (size == 0) {
2249        if (consumed)
2250            *consumed = 0;
2251        return (PyObject *)unicode;
2252    }
2253
2254    p = unicode->str;
2255    shiftOutStart = p;
2256    e = s + size;
2257
2258    while (s < e) {
2259        Py_UNICODE ch;
2260      restart:
2261        ch = (unsigned char) *s;
2262
2263        if (inShift) { /* in a base-64 section */
2264            if (IS_BASE64(ch)) { /* consume a base-64 character */
2265                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2266                base64bits += 6;
2267                s++;
2268                if (base64bits >= 16) {
2269                    /* we have enough bits for a UTF-16 value */
2270                    Py_UNICODE outCh = (Py_UNICODE)
2271                                       (base64buffer >> (base64bits-16));
2272                    base64bits -= 16;
2273                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2274                    if (surrogate) {
2275                        /* expecting a second surrogate */
2276                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2277#ifdef Py_UNICODE_WIDE
2278                            *p++ = (((surrogate & 0x3FF)<<10)
2279                                    | (outCh & 0x3FF)) + 0x10000;
2280#else
2281                            *p++ = surrogate;
2282                            *p++ = outCh;
2283#endif
2284                            surrogate = 0;
2285                        }
2286                        else {
2287                            surrogate = 0;
2288                            errmsg = "second surrogate missing";
2289                            goto utf7Error;
2290                        }
2291                    }
2292                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2293                        /* first surrogate */
2294                        surrogate = outCh;
2295                    }
2296                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2297                        errmsg = "unexpected second surrogate";
2298                        goto utf7Error;
2299                    }
2300                    else {
2301                        *p++ = outCh;
2302                    }
2303                }
2304            }
2305            else { /* now leaving a base-64 section */
2306                inShift = 0;
2307                s++;
2308                if (surrogate) {
2309                    errmsg = "second surrogate missing at end of shift sequence";
2310                    goto utf7Error;
2311                }
2312                if (base64bits > 0) { /* left-over bits */
2313                    if (base64bits >= 6) {
2314                        /* We've seen at least one base-64 character */
2315                        errmsg = "partial character in shift sequence";
2316                        goto utf7Error;
2317                    }
2318                    else {
2319                        /* Some bits remain; they should be zero */
2320                        if (base64buffer != 0) {
2321                            errmsg = "non-zero padding bits in shift sequence";
2322                            goto utf7Error;
2323                        }
2324                    }
2325                }
2326                if (ch != '-') {
2327                    /* '-' is absorbed; other terminating
2328                       characters are preserved */
2329                    *p++ = ch;
2330                }
2331            }
2332        }
2333        else if ( ch == '+' ) {
2334            startinpos = s-starts;
2335            s++; /* consume '+' */
2336            if (s < e && *s == '-') { /* '+-' encodes '+' */
2337                s++;
2338                *p++ = '+';
2339            }
2340            else { /* begin base64-encoded section */
2341                inShift = 1;
2342                shiftOutStart = p;
2343                base64bits = 0;
2344            }
2345        }
2346        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
2347            *p++ = ch;
2348            s++;
2349        }
2350        else {
2351            startinpos = s-starts;
2352            s++;
2353            errmsg = "unexpected special character";
2354            goto utf7Error;
2355        }
2356        continue;
2357utf7Error:
2358        outpos = p-PyUnicode_AS_UNICODE(unicode);
2359        endinpos = s-starts;
2360        if (unicode_decode_call_errorhandler(
2361                errors, &errorHandler,
2362                "utf7", errmsg,
2363                &starts, &e, &startinpos, &endinpos, &exc, &s,
2364                &unicode, &outpos, &p))
2365            goto onError;
2366    }
2367
2368    /* end of string */
2369
2370    if (inShift && !consumed) { /* in shift sequence, no more to follow */
2371        /* if we're in an inconsistent state, that's an error */
2372        if (surrogate ||
2373                (base64bits >= 6) ||
2374                (base64bits > 0 && base64buffer != 0)) {
2375            outpos = p-PyUnicode_AS_UNICODE(unicode);
2376            endinpos = size;
2377            if (unicode_decode_call_errorhandler(
2378                    errors, &errorHandler,
2379                    "utf7", "unterminated shift sequence",
2380                    &starts, &e, &startinpos, &endinpos, &exc, &s,
2381                    &unicode, &outpos, &p))
2382                goto onError;
2383            if (s < e)
2384                goto restart;
2385        }
2386    }
2387
2388    /* return state */
2389    if (consumed) {
2390        if (inShift) {
2391            p = shiftOutStart; /* back off output */
2392            *consumed = startinpos;
2393        }
2394        else {
2395            *consumed = s-starts;
2396        }
2397    }
2398
2399    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2400        goto onError;
2401
2402    Py_XDECREF(errorHandler);
2403    Py_XDECREF(exc);
2404    return (PyObject *)unicode;
2405
2406  onError:
2407    Py_XDECREF(errorHandler);
2408    Py_XDECREF(exc);
2409    Py_DECREF(unicode);
2410    return NULL;
2411}
2412
2413
2414PyObject *PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2415                               Py_ssize_t size,
2416                               int base64SetO,
2417                               int base64WhiteSpace,
2418                               const char *errors)
2419{
2420    PyObject *v;
2421    /* It might be possible to tighten this worst case */
2422    Py_ssize_t allocated = 8 * size;
2423    int inShift = 0;
2424    Py_ssize_t i = 0;
2425    unsigned int base64bits = 0;
2426    unsigned long base64buffer = 0;
2427    char * out;
2428    char * start;
2429
2430    if (size == 0)
2431        return PyBytes_FromStringAndSize(NULL, 0);
2432
2433    if (allocated / 8 != size)
2434        return PyErr_NoMemory();
2435
2436    v = PyBytes_FromStringAndSize(NULL, allocated);
2437    if (v == NULL)
2438        return NULL;
2439
2440    start = out = PyBytes_AS_STRING(v);
2441    for (;i < size; ++i) {
2442        Py_UNICODE ch = s[i];
2443
2444        if (inShift) {
2445            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2446                /* shifting out */
2447                if (base64bits) { /* output remaining bits */
2448                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2449                    base64buffer = 0;
2450                    base64bits = 0;
2451                }
2452                inShift = 0;
2453                /* Characters not in the BASE64 set implicitly unshift the sequence
2454                   so no '-' is required, except if the character is itself a '-' */
2455                if (IS_BASE64(ch) || ch == '-') {
2456                    *out++ = '-';
2457                }
2458                *out++ = (char) ch;
2459            }
2460            else {
2461                goto encode_char;
2462            }
2463        }
2464        else { /* not in a shift sequence */
2465            if (ch == '+') {
2466                *out++ = '+';
2467                        *out++ = '-';
2468            }
2469            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2470                *out++ = (char) ch;
2471            }
2472            else {
2473                *out++ = '+';
2474                inShift = 1;
2475                goto encode_char;
2476            }
2477        }
2478        continue;
2479encode_char:
2480#ifdef Py_UNICODE_WIDE
2481        if (ch >= 0x10000) {
2482            /* code first surrogate */
2483            base64bits += 16;
2484            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2485            while (base64bits >= 6) {
2486                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2487                base64bits -= 6;
2488            }
2489            /* prepare second surrogate */
2490            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2491        }
2492#endif
2493        base64bits += 16;
2494        base64buffer = (base64buffer << 16) | ch;
2495        while (base64bits >= 6) {
2496            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2497            base64bits -= 6;
2498        }
2499    }
2500    if (base64bits)
2501        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2502    if (inShift)
2503        *out++ = '-';
2504    if (_PyBytes_Resize(&v, out - start) < 0)
2505        return NULL;
2506    return v;
2507}
2508
2509#undef IS_BASE64
2510#undef FROM_BASE64
2511#undef TO_BASE64
2512#undef DECODE_DIRECT
2513#undef ENCODE_DIRECT
2514
2515/* --- UTF-8 Codec -------------------------------------------------------- */
2516
/* Length of the UTF-8 sequence introduced by each possible first byte;
   0 marks a byte that cannot start a sequence.  See RFC 3629 for details. */
static char utf8_code_length[256] = {
    /* 00-7F: ASCII, single byte */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 10-1F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 20-2F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 30-3F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 40-4F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 50-5F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 60-6F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    /* 80-BF: continuation bytes, never valid as a start byte */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 90-9F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* A0-AF */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    /* C0-C1 are illegal; C2-DF start two-byte sequences */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    /* E0-EF start three-byte sequences */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    /* F0-F4 start four-byte sequences; F5-FF are illegal */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-FF */
};
2538
2539PyObject *PyUnicode_DecodeUTF8(const char *s,
2540                               Py_ssize_t size,
2541                               const char *errors)
2542{
2543    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2544}
2545
2546/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2547#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2548
2549/* Mask to quickly check whether a C 'long' contains a
2550   non-ASCII, UTF8-encoded char. */
2551#if (SIZEOF_LONG == 8)
2552# define ASCII_CHAR_MASK 0x8080808080808080L
2553#elif (SIZEOF_LONG == 4)
2554# define ASCII_CHAR_MASK 0x80808080L
2555#else
2556# error C 'long' size should be either 4 or 8!
2557#endif
2558
2559PyObject *PyUnicode_DecodeUTF8Stateful(const char *s,
2560                                       Py_ssize_t size,
2561                                       const char *errors,
2562                                       Py_ssize_t *consumed)
2563{
2564    const char *starts = s;
2565    int n;
2566    int k;
2567    Py_ssize_t startinpos;
2568    Py_ssize_t endinpos;
2569    Py_ssize_t outpos;
2570    const char *e, *aligned_end;
2571    PyUnicodeObject *unicode;
2572    Py_UNICODE *p;
2573    const char *errmsg = "";
2574    PyObject *errorHandler = NULL;
2575    PyObject *exc = NULL;
2576
2577    /* Note: size will always be longer than the resulting Unicode
2578       character count */
2579    unicode = _PyUnicode_New(size);
2580    if (!unicode)
2581        return NULL;
2582    if (size == 0) {
2583        if (consumed)
2584            *consumed = 0;
2585        return (PyObject *)unicode;
2586    }
2587
2588    /* Unpack UTF-8 encoded data */
2589    p = unicode->str;
2590    e = s + size;
2591    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2592
2593    while (s < e) {
2594        Py_UCS4 ch = (unsigned char)*s;
2595
2596        if (ch < 0x80) {
2597            /* Fast path for runs of ASCII characters. Given that common UTF-8
2598               input will consist of an overwhelming majority of ASCII
2599               characters, we try to optimize for this case by checking
2600               as many characters as a C 'long' can contain.
2601               First, check if we can do an aligned read, as most CPUs have
2602               a penalty for unaligned reads.
2603            */
2604            if (!((size_t) s & LONG_PTR_MASK)) {
2605                /* Help register allocation */
2606                register const char *_s = s;
2607                register Py_UNICODE *_p = p;
2608                while (_s < aligned_end) {
2609                    /* Read a whole long at a time (either 4 or 8 bytes),
2610                       and do a fast unrolled copy if it only contains ASCII
2611                       characters. */
2612                    unsigned long data = *(unsigned long *) _s;
2613                    if (data & ASCII_CHAR_MASK)
2614                        break;
2615                    _p[0] = (unsigned char) _s[0];
2616                    _p[1] = (unsigned char) _s[1];
2617                    _p[2] = (unsigned char) _s[2];
2618                    _p[3] = (unsigned char) _s[3];
2619#if (SIZEOF_LONG == 8)
2620                    _p[4] = (unsigned char) _s[4];
2621                    _p[5] = (unsigned char) _s[5];
2622                    _p[6] = (unsigned char) _s[6];
2623                    _p[7] = (unsigned char) _s[7];
2624#endif
2625                    _s += SIZEOF_LONG;
2626                    _p += SIZEOF_LONG;
2627                }
2628                s = _s;
2629                p = _p;
2630                if (s == e)
2631                    break;
2632                ch = (unsigned char)*s;
2633            }
2634        }
2635
2636        if (ch < 0x80) {
2637            *p++ = (Py_UNICODE)ch;
2638            s++;
2639            continue;
2640        }
2641
2642        n = utf8_code_length[ch];
2643
2644        if (s + n > e) {
2645            if (consumed)
2646                break;
2647            else {
2648                errmsg = "unexpected end of data";
2649                startinpos = s-starts;
2650                endinpos = startinpos+1;
2651                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2652                    endinpos++;
2653                goto utf8Error;
2654            }
2655        }
2656
2657        switch (n) {
2658
2659        case 0:
2660            errmsg = "invalid start byte";
2661            startinpos = s-starts;
2662            endinpos = startinpos+1;
2663            goto utf8Error;
2664
2665        case 1:
2666            errmsg = "internal error";
2667            startinpos = s-starts;
2668            endinpos = startinpos+1;
2669            goto utf8Error;
2670
2671        case 2:
2672            if ((s[1] & 0xc0) != 0x80) {
2673                errmsg = "invalid continuation byte";
2674                startinpos = s-starts;
2675                endinpos = startinpos + 1;
2676                goto utf8Error;
2677            }
2678            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2679            assert ((ch > 0x007F) && (ch <= 0x07FF));
2680            *p++ = (Py_UNICODE)ch;
2681            break;
2682
2683        case 3:
2684            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2685               will result in surrogates in range d800-dfff. Surrogates are
2686               not valid UTF-8 so they are rejected.
2687               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2688               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2689            if ((s[1] & 0xc0) != 0x80 ||
2690                (s[2] & 0xc0) != 0x80 ||
2691                ((unsigned char)s[0] == 0xE0 &&
2692                 (unsigned char)s[1] < 0xA0) ||
2693                ((unsigned char)s[0] == 0xED &&
2694                 (unsigned char)s[1] > 0x9F)) {
2695                errmsg = "invalid continuation byte";
2696                startinpos = s-starts;
2697                endinpos = startinpos + 1;
2698
2699                /* if s[1] first two bits are 1 and 0, then the invalid
2700                   continuation byte is s[2], so increment endinpos by 1,
2701                   if not, s[1] is invalid and endinpos doesn't need to
2702                   be incremented. */
2703                if ((s[1] & 0xC0) == 0x80)
2704                    endinpos++;
2705                goto utf8Error;
2706            }
2707            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2708            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2709            *p++ = (Py_UNICODE)ch;
2710            break;
2711
2712        case 4:
2713            if ((s[1] & 0xc0) != 0x80 ||
2714                (s[2] & 0xc0) != 0x80 ||
2715                (s[3] & 0xc0) != 0x80 ||
2716                ((unsigned char)s[0] == 0xF0 &&
2717                 (unsigned char)s[1] < 0x90) ||
2718                ((unsigned char)s[0] == 0xF4 &&
2719                 (unsigned char)s[1] > 0x8F)) {
2720                errmsg = "invalid continuation byte";
2721                startinpos = s-starts;
2722                endinpos = startinpos + 1;
2723                if ((s[1] & 0xC0) == 0x80) {
2724                    endinpos++;
2725                    if ((s[2] & 0xC0) == 0x80)
2726                        endinpos++;
2727                }
2728                goto utf8Error;
2729            }
2730            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2731                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2732            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2733
2734#ifdef Py_UNICODE_WIDE
2735            *p++ = (Py_UNICODE)ch;
2736#else
2737            /*  compute and append the two surrogates: */
2738
2739            /*  translate from 10000..10FFFF to 0..FFFF */
2740            ch -= 0x10000;
2741
2742            /*  high surrogate = top 10 bits added to D800 */
2743            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2744
2745            /*  low surrogate = bottom 10 bits added to DC00 */
2746            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2747#endif
2748            break;
2749        }
2750        s += n;
2751        continue;
2752
2753      utf8Error:
2754        outpos = p-PyUnicode_AS_UNICODE(unicode);
2755        if (unicode_decode_call_errorhandler(
2756                errors, &errorHandler,
2757                "utf8", errmsg,
2758                &starts, &e, &startinpos, &endinpos, &exc, &s,
2759                &unicode, &outpos, &p))
2760            goto onError;
2761        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2762    }
2763    if (consumed)
2764        *consumed = s-starts;
2765
2766    /* Adjust length */
2767    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2768        goto onError;
2769
2770    Py_XDECREF(errorHandler);
2771    Py_XDECREF(exc);
2772    return (PyObject *)unicode;
2773
2774  onError:
2775    Py_XDECREF(errorHandler);
2776    Py_XDECREF(exc);
2777    Py_DECREF(unicode);
2778    return NULL;
2779}
2780
2781#undef ASCII_CHAR_MASK
2782
2783#ifdef __APPLE__
2784
2785/* Simplified UTF-8 decoder using surrogateescape error handler,
2786   used to decode the command line arguments on Mac OS X. */
2787
2788wchar_t*
2789_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2790{
2791    int n;
2792    const char *e;
2793    wchar_t *unicode, *p;
2794
2795    /* Note: size will always be longer than the resulting Unicode
2796       character count */
2797    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2798        PyErr_NoMemory();
2799        return NULL;
2800    }
2801    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2802    if (!unicode)
2803        return NULL;
2804
2805    /* Unpack UTF-8 encoded data */
2806    p = unicode;
2807    e = s + size;
2808    while (s < e) {
2809        Py_UCS4 ch = (unsigned char)*s;
2810
2811        if (ch < 0x80) {
2812            *p++ = (wchar_t)ch;
2813            s++;
2814            continue;
2815        }
2816
2817        n = utf8_code_length[ch];
2818        if (s + n > e) {
2819            goto surrogateescape;
2820        }
2821
2822        switch (n) {
2823        case 0:
2824        case 1:
2825            goto surrogateescape;
2826
2827        case 2:
2828            if ((s[1] & 0xc0) != 0x80)
2829                goto surrogateescape;
2830            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2831            assert ((ch > 0x007F) && (ch <= 0x07FF));
2832            *p++ = (wchar_t)ch;
2833            break;
2834
2835        case 3:
2836            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2837               will result in surrogates in range d800-dfff. Surrogates are
2838               not valid UTF-8 so they are rejected.
2839               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2840               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2841            if ((s[1] & 0xc0) != 0x80 ||
2842                (s[2] & 0xc0) != 0x80 ||
2843                ((unsigned char)s[0] == 0xE0 &&
2844                 (unsigned char)s[1] < 0xA0) ||
2845                ((unsigned char)s[0] == 0xED &&
2846                 (unsigned char)s[1] > 0x9F)) {
2847
2848                goto surrogateescape;
2849            }
2850            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2851            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2852            *p++ = (Py_UNICODE)ch;
2853            break;
2854
2855        case 4:
2856            if ((s[1] & 0xc0) != 0x80 ||
2857                (s[2] & 0xc0) != 0x80 ||
2858                (s[3] & 0xc0) != 0x80 ||
2859                ((unsigned char)s[0] == 0xF0 &&
2860                 (unsigned char)s[1] < 0x90) ||
2861                ((unsigned char)s[0] == 0xF4 &&
2862                 (unsigned char)s[1] > 0x8F)) {
2863                goto surrogateescape;
2864            }
2865            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2866                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2867            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2868
2869#if SIZEOF_WCHAR_T == 4
2870            *p++ = (wchar_t)ch;
2871#else
2872            /*  compute and append the two surrogates: */
2873
2874            /*  translate from 10000..10FFFF to 0..FFFF */
2875            ch -= 0x10000;
2876
2877            /*  high surrogate = top 10 bits added to D800 */
2878            *p++ = (wchar_t)(0xD800 + (ch >> 10));
2879
2880            /*  low surrogate = bottom 10 bits added to DC00 */
2881            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2882#endif
2883            break;
2884        }
2885        s += n;
2886        continue;
2887
2888      surrogateescape:
2889        *p++ = 0xDC00 + ch;
2890        s++;
2891    }
2892    *p = L'\0';
2893    return unicode;
2894}
2895
2896#endif /* __APPLE__ */
2897
2898/* Allocation strategy:  if the string is short, convert into a stack buffer
2899   and allocate exactly as much space needed at the end.  Else allocate the
2900   maximum possible needed (4 result bytes per Unicode character), and return
2901   the excess memory at the end.
2902*/
2903PyObject *
2904PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2905                     Py_ssize_t size,
2906                     const char *errors)
2907{
2908#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2909
2910    Py_ssize_t i;                /* index into s of next input byte */
2911    PyObject *result;            /* result string object */
2912    char *p;                     /* next free byte in output buffer */
2913    Py_ssize_t nallocated;      /* number of result bytes allocated */
2914    Py_ssize_t nneeded;            /* number of result bytes needed */
2915    char stackbuf[MAX_SHORT_UNICHARS * 4];
2916    PyObject *errorHandler = NULL;
2917    PyObject *exc = NULL;
2918
2919    assert(s != NULL);
2920    assert(size >= 0);
2921
2922    if (size <= MAX_SHORT_UNICHARS) {
2923        /* Write into the stack buffer; nallocated can't overflow.
2924         * At the end, we'll allocate exactly as much heap space as it
2925         * turns out we need.
2926         */
2927        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2928        result = NULL;   /* will allocate after we're done */
2929        p = stackbuf;
2930    }
2931    else {
2932        /* Overallocate on the heap, and give the excess back at the end. */
2933        nallocated = size * 4;
2934        if (nallocated / 4 != size)  /* overflow! */
2935            return PyErr_NoMemory();
2936        result = PyBytes_FromStringAndSize(NULL, nallocated);
2937        if (result == NULL)
2938            return NULL;
2939        p = PyBytes_AS_STRING(result);
2940    }
2941
2942    for (i = 0; i < size;) {
2943        Py_UCS4 ch = s[i++];
2944
2945        if (ch < 0x80)
2946            /* Encode ASCII */
2947            *p++ = (char) ch;
2948
2949        else if (ch < 0x0800) {
2950            /* Encode Latin-1 */
2951            *p++ = (char)(0xc0 | (ch >> 6));
2952            *p++ = (char)(0x80 | (ch & 0x3f));
2953        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2954#ifndef Py_UNICODE_WIDE
2955            /* Special case: check for high and low surrogate */
2956            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2957                Py_UCS4 ch2 = s[i];
2958                /* Combine the two surrogates to form a UCS4 value */
2959                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2960                i++;
2961
2962                /* Encode UCS4 Unicode ordinals */
2963                *p++ = (char)(0xf0 | (ch >> 18));
2964                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2965                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2966                *p++ = (char)(0x80 | (ch & 0x3f));
2967            } else {
2968#endif
2969                Py_ssize_t newpos;
2970                PyObject *rep;
2971                Py_ssize_t repsize, k;
2972                rep = unicode_encode_call_errorhandler
2973                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
2974                     s, size, &exc, i-1, i, &newpos);
2975                if (!rep)
2976                    goto error;
2977
2978                if (PyBytes_Check(rep))
2979                    repsize = PyBytes_GET_SIZE(rep);
2980                else
2981                    repsize = PyUnicode_GET_SIZE(rep);
2982
2983                if (repsize > 4) {
2984                    Py_ssize_t offset;
2985
2986                    if (result == NULL)
2987                        offset = p - stackbuf;
2988                    else
2989                        offset = p - PyBytes_AS_STRING(result);
2990
2991                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
2992                        /* integer overflow */
2993                        PyErr_NoMemory();
2994                        goto error;
2995                    }
2996                    nallocated += repsize - 4;
2997                    if (result != NULL) {
2998                        if (_PyBytes_Resize(&result, nallocated) < 0)
2999                            goto error;
3000                    } else {
3001                        result = PyBytes_FromStringAndSize(NULL, nallocated);
3002                        if (result == NULL)
3003                            goto error;
3004                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3005                    }
3006                    p = PyBytes_AS_STRING(result) + offset;
3007                }
3008
3009                if (PyBytes_Check(rep)) {
3010                    char *prep = PyBytes_AS_STRING(rep);
3011                    for(k = repsize; k > 0; k--)
3012                        *p++ = *prep++;
3013                } else /* rep is unicode */ {
3014                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3015                    Py_UNICODE c;
3016
3017                    for(k=0; k<repsize; k++) {
3018                        c = prep[k];
3019                        if (0x80 <= c) {
3020                            raise_encode_exception(&exc, "utf-8", s, size,
3021                                                   i-1, i, "surrogates not allowed");
3022                            goto error;
3023                        }
3024                        *p++ = (char)prep[k];
3025                    }
3026                }
3027                Py_DECREF(rep);
3028#ifndef Py_UNICODE_WIDE
3029            }
3030#endif
3031        } else if (ch < 0x10000) {
3032            *p++ = (char)(0xe0 | (ch >> 12));
3033            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3034            *p++ = (char)(0x80 | (ch & 0x3f));
3035        } else /* ch >= 0x10000 */ {
3036            /* Encode UCS4 Unicode ordinals */
3037            *p++ = (char)(0xf0 | (ch >> 18));
3038            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3039            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3040            *p++ = (char)(0x80 | (ch & 0x3f));
3041        }
3042    }
3043
3044    if (result == NULL) {
3045        /* This was stack allocated. */
3046        nneeded = p - stackbuf;
3047        assert(nneeded <= nallocated);
3048        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
3049    }
3050    else {
3051        /* Cut back to size actually needed. */
3052        nneeded = p - PyBytes_AS_STRING(result);
3053        assert(nneeded <= nallocated);
3054        _PyBytes_Resize(&result, nneeded);
3055    }
3056    Py_XDECREF(errorHandler);
3057    Py_XDECREF(exc);
3058    return result;
3059 error:
3060    Py_XDECREF(errorHandler);
3061    Py_XDECREF(exc);
3062    Py_XDECREF(result);
3063    return NULL;
3064
3065#undef MAX_SHORT_UNICHARS
3066}
3067
3068PyObject *PyUnicode_AsUTF8String(PyObject *unicode)
3069{
3070    if (!PyUnicode_Check(unicode)) {
3071        PyErr_BadArgument();
3072        return NULL;
3073    }
3074    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
3075                                PyUnicode_GET_SIZE(unicode),
3076                                NULL);
3077}
3078
3079/* --- UTF-32 Codec ------------------------------------------------------- */
3080
3081PyObject *
3082PyUnicode_DecodeUTF32(const char *s,
3083                      Py_ssize_t size,
3084                      const char *errors,
3085                      int *byteorder)
3086{
3087    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3088}
3089
3090PyObject *
3091PyUnicode_DecodeUTF32Stateful(const char *s,
3092                              Py_ssize_t size,
3093                              const char *errors,
3094                              int *byteorder,
3095                              Py_ssize_t *consumed)
3096{
3097    const char *starts = s;
3098    Py_ssize_t startinpos;
3099    Py_ssize_t endinpos;
3100    Py_ssize_t outpos;
3101    PyUnicodeObject *unicode;
3102    Py_UNICODE *p;
3103#ifndef Py_UNICODE_WIDE
3104    int pairs = 0;
3105    const unsigned char *qq;
3106#else
3107    const int pairs = 0;
3108#endif
3109    const unsigned char *q, *e;
3110    int bo = 0;       /* assume native ordering by default */
3111    const char *errmsg = "";
3112    /* Offsets from q for retrieving bytes in the right order. */
3113#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3114    int iorder[] = {0, 1, 2, 3};
3115#else
3116    int iorder[] = {3, 2, 1, 0};
3117#endif
3118    PyObject *errorHandler = NULL;
3119    PyObject *exc = NULL;
3120
3121    q = (unsigned char *)s;
3122    e = q + size;
3123
3124    if (byteorder)
3125        bo = *byteorder;
3126
3127    /* Check for BOM marks (U+FEFF) in the input and adjust current
3128       byte order setting accordingly. In native mode, the leading BOM
3129       mark is skipped, in all other modes, it is copied to the output
3130       stream as-is (giving a ZWNBSP character). */
3131    if (bo == 0) {
3132        if (size >= 4) {
3133            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3134                (q[iorder[1]] << 8) | q[iorder[0]];
3135#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3136            if (bom == 0x0000FEFF) {
3137                q += 4;
3138                bo = -1;
3139            }
3140            else if (bom == 0xFFFE0000) {
3141                q += 4;
3142                bo = 1;
3143            }
3144#else
3145            if (bom == 0x0000FEFF) {
3146                q += 4;
3147                bo = 1;
3148            }
3149            else if (bom == 0xFFFE0000) {
3150                q += 4;
3151                bo = -1;
3152            }
3153#endif
3154        }
3155    }
3156
3157    if (bo == -1) {
3158        /* force LE */
3159        iorder[0] = 0;
3160        iorder[1] = 1;
3161        iorder[2] = 2;
3162        iorder[3] = 3;
3163    }
3164    else if (bo == 1) {
3165        /* force BE */
3166        iorder[0] = 3;
3167        iorder[1] = 2;
3168        iorder[2] = 1;
3169        iorder[3] = 0;
3170    }
3171
3172    /* On narrow builds we split characters outside the BMP into two
3173       codepoints => count how much extra space we need. */
3174#ifndef Py_UNICODE_WIDE
3175    for (qq = q; qq < e; qq += 4)
3176        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3177            pairs++;
3178#endif
3179
3180    /* This might be one to much, because of a BOM */
3181    unicode = _PyUnicode_New((size+3)/4+pairs);
3182    if (!unicode)
3183        return NULL;
3184    if (size == 0)
3185        return (PyObject *)unicode;
3186
3187    /* Unpack UTF-32 encoded data */
3188    p = unicode->str;
3189
3190    while (q < e) {
3191        Py_UCS4 ch;
3192        /* remaining bytes at the end? (size should be divisible by 4) */
3193        if (e-q<4) {
3194            if (consumed)
3195                break;
3196            errmsg = "truncated data";
3197            startinpos = ((const char *)q)-starts;
3198            endinpos = ((const char *)e)-starts;
3199            goto utf32Error;
3200            /* The remaining input chars are ignored if the callback
3201               chooses to skip the input */
3202        }
3203        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3204            (q[iorder[1]] << 8) | q[iorder[0]];
3205
3206        if (ch >= 0x110000)
3207        {
3208            errmsg = "codepoint not in range(0x110000)";
3209            startinpos = ((const char *)q)-starts;
3210            endinpos = startinpos+4;
3211            goto utf32Error;
3212        }
3213#ifndef Py_UNICODE_WIDE
3214        if (ch >= 0x10000)
3215        {
3216            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3217            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3218        }
3219        else
3220#endif
3221            *p++ = ch;
3222        q += 4;
3223        continue;
3224      utf32Error:
3225        outpos = p-PyUnicode_AS_UNICODE(unicode);
3226        if (unicode_decode_call_errorhandler(
3227                errors, &errorHandler,
3228                "utf32", errmsg,
3229                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3230                &unicode, &outpos, &p))
3231            goto onError;
3232    }
3233
3234    if (byteorder)
3235        *byteorder = bo;
3236
3237    if (consumed)
3238        *consumed = (const char *)q-starts;
3239
3240    /* Adjust length */
3241    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3242        goto onError;
3243
3244    Py_XDECREF(errorHandler);
3245    Py_XDECREF(exc);
3246    return (PyObject *)unicode;
3247
3248  onError:
3249    Py_DECREF(unicode);
3250    Py_XDECREF(errorHandler);
3251    Py_XDECREF(exc);
3252    return NULL;
3253}
3254
3255PyObject *
3256PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3257                      Py_ssize_t size,
3258                      const char *errors,
3259                      int byteorder)
3260{
3261    PyObject *v;
3262    unsigned char *p;
3263    Py_ssize_t nsize, bytesize;
3264#ifndef Py_UNICODE_WIDE
3265    Py_ssize_t i, pairs;
3266#else
3267    const int pairs = 0;
3268#endif
3269    /* Offsets from p for storing byte pairs in the right order. */
3270#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3271    int iorder[] = {0, 1, 2, 3};
3272#else
3273    int iorder[] = {3, 2, 1, 0};
3274#endif
3275
3276#define STORECHAR(CH)                           \
3277    do {                                        \
3278        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3279        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3280        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3281        p[iorder[0]] = (CH) & 0xff;             \
3282        p += 4;                                 \
3283    } while(0)
3284
3285    /* In narrow builds we can output surrogate pairs as one codepoint,
3286       so we need less space. */
3287#ifndef Py_UNICODE_WIDE
3288    for (i = pairs = 0; i < size-1; i++)
3289        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3290            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3291            pairs++;
3292#endif
3293    nsize = (size - pairs + (byteorder == 0));
3294    bytesize = nsize * 4;
3295    if (bytesize / 4 != nsize)
3296        return PyErr_NoMemory();
3297    v = PyBytes_FromStringAndSize(NULL, bytesize);
3298    if (v == NULL)
3299        return NULL;
3300
3301    p = (unsigned char *)PyBytes_AS_STRING(v);
3302    if (byteorder == 0)
3303        STORECHAR(0xFEFF);
3304    if (size == 0)
3305        goto done;
3306
3307    if (byteorder == -1) {
3308        /* force LE */
3309        iorder[0] = 0;
3310        iorder[1] = 1;
3311        iorder[2] = 2;
3312        iorder[3] = 3;
3313    }
3314    else if (byteorder == 1) {
3315        /* force BE */
3316        iorder[0] = 3;
3317        iorder[1] = 2;
3318        iorder[2] = 1;
3319        iorder[3] = 0;
3320    }
3321
3322    while (size-- > 0) {
3323        Py_UCS4 ch = *s++;
3324#ifndef Py_UNICODE_WIDE
3325        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3326            Py_UCS4 ch2 = *s;
3327            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3328                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3329                s++;
3330                size--;
3331            }
3332        }
3333#endif
3334        STORECHAR(ch);
3335    }
3336
3337  done:
3338    return v;
3339#undef STORECHAR
3340}
3341
3342PyObject *PyUnicode_AsUTF32String(PyObject *unicode)
3343{
3344    if (!PyUnicode_Check(unicode)) {
3345        PyErr_BadArgument();
3346        return NULL;
3347    }
3348    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3349                                 PyUnicode_GET_SIZE(unicode),
3350                                 NULL,
3351                                 0);
3352}
3353
3354/* --- UTF-16 Codec ------------------------------------------------------- */
3355
3356PyObject *
3357PyUnicode_DecodeUTF16(const char *s,
3358                      Py_ssize_t size,
3359                      const char *errors,
3360                      int *byteorder)
3361{
3362    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3363}
3364
3365/* Two masks for fast checking of whether a C 'long' may contain
3366   UTF16-encoded surrogate characters. This is an efficient heuristic,
3367   assuming that non-surrogate characters with a code point >= 0x8000 are
3368   rare in most input.
3369   FAST_CHAR_MASK is used when the input is in native byte ordering,
3370   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3371*/
3372#if (SIZEOF_LONG == 8)
3373# define FAST_CHAR_MASK         0x8000800080008000L
3374# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3375#elif (SIZEOF_LONG == 4)
3376# define FAST_CHAR_MASK         0x80008000L
3377# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3378#else
3379# error C 'long' size should be either 4 or 8!
3380#endif
3381
3382PyObject *
3383PyUnicode_DecodeUTF16Stateful(const char *s,
3384                              Py_ssize_t size,
3385                              const char *errors,
3386                              int *byteorder,
3387                              Py_ssize_t *consumed)
3388{
3389    const char *starts = s;
3390    Py_ssize_t startinpos;
3391    Py_ssize_t endinpos;
3392    Py_ssize_t outpos;
3393    PyUnicodeObject *unicode;
3394    Py_UNICODE *p;
3395    const unsigned char *q, *e, *aligned_end;
3396    int bo = 0;       /* assume native ordering by default */
3397    int native_ordering = 0;
3398    const char *errmsg = "";
3399    /* Offsets from q for retrieving byte pairs in the right order. */
3400#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3401    int ihi = 1, ilo = 0;
3402#else
3403    int ihi = 0, ilo = 1;
3404#endif
3405    PyObject *errorHandler = NULL;
3406    PyObject *exc = NULL;
3407
3408    /* Note: size will always be longer than the resulting Unicode
3409       character count */
3410    unicode = _PyUnicode_New(size);
3411    if (!unicode)
3412        return NULL;
3413    if (size == 0)
3414        return (PyObject *)unicode;
3415
3416    /* Unpack UTF-16 encoded data */
3417    p = unicode->str;
3418    q = (unsigned char *)s;
3419    e = q + size - 1;
3420
3421    if (byteorder)
3422        bo = *byteorder;
3423
3424    /* Check for BOM marks (U+FEFF) in the input and adjust current
3425       byte order setting accordingly. In native mode, the leading BOM
3426       mark is skipped, in all other modes, it is copied to the output
3427       stream as-is (giving a ZWNBSP character). */
3428    if (bo == 0) {
3429        if (size >= 2) {
3430            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3431#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3432            if (bom == 0xFEFF) {
3433                q += 2;
3434                bo = -1;
3435            }
3436            else if (bom == 0xFFFE) {
3437                q += 2;
3438                bo = 1;
3439            }
3440#else
3441            if (bom == 0xFEFF) {
3442                q += 2;
3443                bo = 1;
3444            }
3445            else if (bom == 0xFFFE) {
3446                q += 2;
3447                bo = -1;
3448            }
3449#endif
3450        }
3451    }
3452
3453    if (bo == -1) {
3454        /* force LE */
3455        ihi = 1;
3456        ilo = 0;
3457    }
3458    else if (bo == 1) {
3459        /* force BE */
3460        ihi = 0;
3461        ilo = 1;
3462    }
3463#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3464    native_ordering = ilo < ihi;
3465#else
3466    native_ordering = ilo > ihi;
3467#endif
3468
3469    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3470    while (q < e) {
3471        Py_UNICODE ch;
3472        /* First check for possible aligned read of a C 'long'. Unaligned
3473           reads are more expensive, better to defer to another iteration. */
3474        if (!((size_t) q & LONG_PTR_MASK)) {
3475            /* Fast path for runs of non-surrogate chars. */
3476            register const unsigned char *_q = q;
3477            Py_UNICODE *_p = p;
3478            if (native_ordering) {
3479                /* Native ordering is simple: as long as the input cannot
3480                   possibly contain a surrogate char, do an unrolled copy
3481                   of several 16-bit code points to the target object.
3482                   The non-surrogate check is done on several input bytes
3483                   at a time (as many as a C 'long' can contain). */
3484                while (_q < aligned_end) {
3485                    unsigned long data = * (unsigned long *) _q;
3486                    if (data & FAST_CHAR_MASK)
3487                        break;
3488                    _p[0] = ((unsigned short *) _q)[0];
3489                    _p[1] = ((unsigned short *) _q)[1];
3490#if (SIZEOF_LONG == 8)
3491                    _p[2] = ((unsigned short *) _q)[2];
3492                    _p[3] = ((unsigned short *) _q)[3];
3493#endif
3494                    _q += SIZEOF_LONG;
3495                    _p += SIZEOF_LONG / 2;
3496                }
3497            }
3498            else {
3499                /* Byteswapped ordering is similar, but we must decompose
3500                   the copy bytewise, and take care of zero'ing out the
3501                   upper bytes if the target object is in 32-bit units
3502                   (that is, in UCS-4 builds). */
3503                while (_q < aligned_end) {
3504                    unsigned long data = * (unsigned long *) _q;
3505                    if (data & SWAPPED_FAST_CHAR_MASK)
3506                        break;
3507                    /* Zero upper bytes in UCS-4 builds */
3508#if (Py_UNICODE_SIZE > 2)
3509                    _p[0] = 0;
3510                    _p[1] = 0;
3511#if (SIZEOF_LONG == 8)
3512                    _p[2] = 0;
3513                    _p[3] = 0;
3514#endif
3515#endif
3516                    /* Issue #4916; UCS-4 builds on big endian machines must
3517                       fill the two last bytes of each 4-byte unit. */
3518#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3519# define OFF 2
3520#else
3521# define OFF 0
3522#endif
3523                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3524                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3525                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3526                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3527#if (SIZEOF_LONG == 8)
3528                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3529                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3530                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3531                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3532#endif
3533#undef OFF
3534                    _q += SIZEOF_LONG;
3535                    _p += SIZEOF_LONG / 2;
3536                }
3537            }
3538            p = _p;
3539            q = _q;
3540            if (q >= e)
3541                break;
3542        }
3543        ch = (q[ihi] << 8) | q[ilo];
3544
3545        q += 2;
3546
3547        if (ch < 0xD800 || ch > 0xDFFF) {
3548            *p++ = ch;
3549            continue;
3550        }
3551
3552        /* UTF-16 code pair: */
3553        if (q > e) {
3554            errmsg = "unexpected end of data";
3555            startinpos = (((const char *)q) - 2) - starts;
3556            endinpos = ((const char *)e) + 1 - starts;
3557            goto utf16Error;
3558        }
3559        if (0xD800 <= ch && ch <= 0xDBFF) {
3560            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3561            q += 2;
3562            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3563#ifndef Py_UNICODE_WIDE
3564                *p++ = ch;
3565                *p++ = ch2;
3566#else
3567                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3568#endif
3569                continue;
3570            }
3571            else {
3572                errmsg = "illegal UTF-16 surrogate";
3573                startinpos = (((const char *)q)-4)-starts;
3574                endinpos = startinpos+2;
3575                goto utf16Error;
3576            }
3577
3578        }
3579        errmsg = "illegal encoding";
3580        startinpos = (((const char *)q)-2)-starts;
3581        endinpos = startinpos+2;
3582        /* Fall through to report the error */
3583
3584      utf16Error:
3585        outpos = p - PyUnicode_AS_UNICODE(unicode);
3586        if (unicode_decode_call_errorhandler(
3587                errors,
3588                &errorHandler,
3589                "utf16", errmsg,
3590                &starts,
3591                (const char **)&e,
3592                &startinpos,
3593                &endinpos,
3594                &exc,
3595                (const char **)&q,
3596                &unicode,
3597                &outpos,
3598                &p))
3599            goto onError;
3600    }
3601    /* remaining byte at the end? (size should be even) */
3602    if (e == q) {
3603        if (!consumed) {
3604            errmsg = "truncated data";
3605            startinpos = ((const char *)q) - starts;
3606            endinpos = ((const char *)e) + 1 - starts;
3607            outpos = p - PyUnicode_AS_UNICODE(unicode);
3608            if (unicode_decode_call_errorhandler(
3609                    errors,
3610                    &errorHandler,
3611                    "utf16", errmsg,
3612                    &starts,
3613                    (const char **)&e,
3614                    &startinpos,
3615                    &endinpos,
3616                    &exc,
3617                    (const char **)&q,
3618                    &unicode,
3619                    &outpos,
3620                    &p))
3621                goto onError;
3622            /* The remaining input chars are ignored if the callback
3623               chooses to skip the input */
3624        }
3625    }
3626
3627    if (byteorder)
3628        *byteorder = bo;
3629
3630    if (consumed)
3631        *consumed = (const char *)q-starts;
3632
3633    /* Adjust length */
3634    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3635        goto onError;
3636
3637    Py_XDECREF(errorHandler);
3638    Py_XDECREF(exc);
3639    return (PyObject *)unicode;
3640
3641  onError:
3642    Py_DECREF(unicode);
3643    Py_XDECREF(errorHandler);
3644    Py_XDECREF(exc);
3645    return NULL;
3646}
3647
3648#undef FAST_CHAR_MASK
3649#undef SWAPPED_FAST_CHAR_MASK
3650
3651PyObject *
3652PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3653                      Py_ssize_t size,
3654                      const char *errors,
3655                      int byteorder)
3656{
3657    PyObject *v;
3658    unsigned char *p;
3659    Py_ssize_t nsize, bytesize;
3660#ifdef Py_UNICODE_WIDE
3661    Py_ssize_t i, pairs;
3662#else
3663    const int pairs = 0;
3664#endif
3665    /* Offsets from p for storing byte pairs in the right order. */
3666#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3667    int ihi = 1, ilo = 0;
3668#else
3669    int ihi = 0, ilo = 1;
3670#endif
3671
3672#define STORECHAR(CH)                           \
3673    do {                                        \
3674        p[ihi] = ((CH) >> 8) & 0xff;            \
3675        p[ilo] = (CH) & 0xff;                   \
3676        p += 2;                                 \
3677    } while(0)
3678
3679#ifdef Py_UNICODE_WIDE
3680    for (i = pairs = 0; i < size; i++)
3681        if (s[i] >= 0x10000)
3682            pairs++;
3683#endif
3684    /* 2 * (size + pairs + (byteorder == 0)) */
3685    if (size > PY_SSIZE_T_MAX ||
3686        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3687        return PyErr_NoMemory();
3688    nsize = size + pairs + (byteorder == 0);
3689    bytesize = nsize * 2;
3690    if (bytesize / 2 != nsize)
3691        return PyErr_NoMemory();
3692    v = PyBytes_FromStringAndSize(NULL, bytesize);
3693    if (v == NULL)
3694        return NULL;
3695
3696    p = (unsigned char *)PyBytes_AS_STRING(v);
3697    if (byteorder == 0)
3698        STORECHAR(0xFEFF);
3699    if (size == 0)
3700        goto done;
3701
3702    if (byteorder == -1) {
3703        /* force LE */
3704        ihi = 1;
3705        ilo = 0;
3706    }
3707    else if (byteorder == 1) {
3708        /* force BE */
3709        ihi = 0;
3710        ilo = 1;
3711    }
3712
3713    while (size-- > 0) {
3714        Py_UNICODE ch = *s++;
3715        Py_UNICODE ch2 = 0;
3716#ifdef Py_UNICODE_WIDE
3717        if (ch >= 0x10000) {
3718            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3719            ch  = 0xD800 | ((ch-0x10000) >> 10);
3720        }
3721#endif
3722        STORECHAR(ch);
3723        if (ch2)
3724            STORECHAR(ch2);
3725    }
3726
3727  done:
3728    return v;
3729#undef STORECHAR
3730}
3731
3732PyObject *PyUnicode_AsUTF16String(PyObject *unicode)
3733{
3734    if (!PyUnicode_Check(unicode)) {
3735        PyErr_BadArgument();
3736        return NULL;
3737    }
3738    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3739                                 PyUnicode_GET_SIZE(unicode),
3740                                 NULL,
3741                                 0);
3742}
3743
3744/* --- Unicode Escape Codec ----------------------------------------------- */
3745
/* Cached C API of the unicodedata module, used to resolve \N{name}
   escapes.  It stays NULL until PyUnicode_DecodeUnicodeEscape() first
   meets such an escape and imports the capsule. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3747
3748PyObject *PyUnicode_DecodeUnicodeEscape(const char *s,
3749                                        Py_ssize_t size,
3750                                        const char *errors)
3751{
3752    const char *starts = s;
3753    Py_ssize_t startinpos;
3754    Py_ssize_t endinpos;
3755    Py_ssize_t outpos;
3756    int i;
3757    PyUnicodeObject *v;
3758    Py_UNICODE *p;
3759    const char *end;
3760    char* message;
3761    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3762    PyObject *errorHandler = NULL;
3763    PyObject *exc = NULL;
3764
3765    /* Escaped strings will always be longer than the resulting
3766       Unicode string, so we start with size here and then reduce the
3767       length after conversion to the true value.
3768       (but if the error callback returns a long replacement string
3769       we'll have to allocate more space) */
3770    v = _PyUnicode_New(size);
3771    if (v == NULL)
3772        goto onError;
3773    if (size == 0)
3774        return (PyObject *)v;
3775
3776    p = PyUnicode_AS_UNICODE(v);
3777    end = s + size;
3778
3779    while (s < end) {
3780        unsigned char c;
3781        Py_UNICODE x;
3782        int digits;
3783
3784        /* Non-escape characters are interpreted as Unicode ordinals */
3785        if (*s != '\\') {
3786            *p++ = (unsigned char) *s++;
3787            continue;
3788        }
3789
3790        startinpos = s-starts;
3791        /* \ - Escapes */
3792        s++;
3793        c = *s++;
3794        if (s > end)
3795            c = '\0'; /* Invalid after \ */
3796        switch (c) {
3797
3798            /* \x escapes */
3799        case '\n': break;
3800        case '\\': *p++ = '\\'; break;
3801        case '\'': *p++ = '\''; break;
3802        case '\"': *p++ = '\"'; break;
3803        case 'b': *p++ = '\b'; break;
3804        case 'f': *p++ = '\014'; break; /* FF */
3805        case 't': *p++ = '\t'; break;
3806        case 'n': *p++ = '\n'; break;
3807        case 'r': *p++ = '\r'; break;
3808        case 'v': *p++ = '\013'; break; /* VT */
3809        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3810
3811            /* \OOO (octal) escapes */
3812        case '0': case '1': case '2': case '3':
3813        case '4': case '5': case '6': case '7':
3814            x = s[-1] - '0';
3815            if (s < end && '0' <= *s && *s <= '7') {
3816                x = (x<<3) + *s++ - '0';
3817                if (s < end && '0' <= *s && *s <= '7')
3818                    x = (x<<3) + *s++ - '0';
3819            }
3820            *p++ = x;
3821            break;
3822
3823            /* hex escapes */
3824            /* \xXX */
3825        case 'x':
3826            digits = 2;
3827            message = "truncated \\xXX escape";
3828            goto hexescape;
3829
3830            /* \uXXXX */
3831        case 'u':
3832            digits = 4;
3833            message = "truncated \\uXXXX escape";
3834            goto hexescape;
3835
3836            /* \UXXXXXXXX */
3837        case 'U':
3838            digits = 8;
3839            message = "truncated \\UXXXXXXXX escape";
3840        hexescape:
3841            chr = 0;
3842            outpos = p-PyUnicode_AS_UNICODE(v);
3843            if (s+digits>end) {
3844                endinpos = size;
3845                if (unicode_decode_call_errorhandler(
3846                        errors, &errorHandler,
3847                        "unicodeescape", "end of string in escape sequence",
3848                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3849                        &v, &outpos, &p))
3850                    goto onError;
3851                goto nextByte;
3852            }
3853            for (i = 0; i < digits; ++i) {
3854                c = (unsigned char) s[i];
3855                if (!Py_ISXDIGIT(c)) {
3856                    endinpos = (s+i+1)-starts;
3857                    if (unicode_decode_call_errorhandler(
3858                            errors, &errorHandler,
3859                            "unicodeescape", message,
3860                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3861                            &v, &outpos, &p))
3862                        goto onError;
3863                    goto nextByte;
3864                }
3865                chr = (chr<<4) & ~0xF;
3866                if (c >= '0' && c <= '9')
3867                    chr += c - '0';
3868                else if (c >= 'a' && c <= 'f')
3869                    chr += 10 + c - 'a';
3870                else
3871                    chr += 10 + c - 'A';
3872            }
3873            s += i;
3874            if (chr == 0xffffffff && PyErr_Occurred())
3875                /* _decoding_error will have already written into the
3876                   target buffer. */
3877                break;
3878        store:
3879            /* when we get here, chr is a 32-bit unicode character */
3880            if (chr <= 0xffff)
3881                /* UCS-2 character */
3882                *p++ = (Py_UNICODE) chr;
3883            else if (chr <= 0x10ffff) {
3884                /* UCS-4 character. Either store directly, or as
3885                   surrogate pair. */
3886#ifdef Py_UNICODE_WIDE
3887                *p++ = chr;
3888#else
3889                chr -= 0x10000L;
3890                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3891                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3892#endif
3893            } else {
3894                endinpos = s-starts;
3895                outpos = p-PyUnicode_AS_UNICODE(v);
3896                if (unicode_decode_call_errorhandler(
3897                        errors, &errorHandler,
3898                        "unicodeescape", "illegal Unicode character",
3899                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3900                        &v, &outpos, &p))
3901                    goto onError;
3902            }
3903            break;
3904
3905            /* \N{name} */
3906        case 'N':
3907            message = "malformed \\N character escape";
3908            if (ucnhash_CAPI == NULL) {
3909                /* load the unicode data module */
3910                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3911                if (ucnhash_CAPI == NULL)
3912                    goto ucnhashError;
3913            }
3914            if (*s == '{') {
3915                const char *start = s+1;
3916                /* look for the closing brace */
3917                while (*s != '}' && s < end)
3918                    s++;
3919                if (s > start && s < end && *s == '}') {
3920                    /* found a name.  look it up in the unicode database */
3921                    message = "unknown Unicode character name";
3922                    s++;
3923                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3924                        goto store;
3925                }
3926            }
3927            endinpos = s-starts;
3928            outpos = p-PyUnicode_AS_UNICODE(v);
3929            if (unicode_decode_call_errorhandler(
3930                    errors, &errorHandler,
3931                    "unicodeescape", message,
3932                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3933                    &v, &outpos, &p))
3934                goto onError;
3935            break;
3936
3937        default:
3938            if (s > end) {
3939                message = "\\ at end of string";
3940                s--;
3941                endinpos = s-starts;
3942                outpos = p-PyUnicode_AS_UNICODE(v);
3943                if (unicode_decode_call_errorhandler(
3944                        errors, &errorHandler,
3945                        "unicodeescape", message,
3946                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3947                        &v, &outpos, &p))
3948                    goto onError;
3949            }
3950            else {
3951                *p++ = '\\';
3952                *p++ = (unsigned char)s[-1];
3953            }
3954            break;
3955        }
3956      nextByte:
3957        ;
3958    }
3959    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
3960        goto onError;
3961    Py_XDECREF(errorHandler);
3962    Py_XDECREF(exc);
3963    return (PyObject *)v;
3964
3965  ucnhashError:
3966    PyErr_SetString(
3967        PyExc_UnicodeError,
3968        "\\N escapes not supported (can't load unicodedata module)"
3969        );
3970    Py_XDECREF(v);
3971    Py_XDECREF(errorHandler);
3972    Py_XDECREF(exc);
3973    return NULL;
3974
3975  onError:
3976    Py_XDECREF(v);
3977    Py_XDECREF(errorHandler);
3978    Py_XDECREF(exc);
3979    return NULL;
3980}
3981
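/* Return a Unicode-Escape string version of the Unicode object.

   This describes PyUnicode_EncodeUnicodeEscape() further below, not the
   findchar() helper that immediately follows.  An earlier form of this
   comment said that "if quotes is true, the string is enclosed in u"" or
   u'' quotes as appropriate"; the encoder in this file takes no `quotes`
   argument and never writes enclosing quotes.

*/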
3988
3989Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
3990                                             Py_ssize_t size,
3991                                             Py_UNICODE ch)
3992{
3993    /* like wcschr, but doesn't stop at NULL characters */
3994
3995    while (size-- > 0) {
3996        if (*s == ch)
3997            return s;
3998        s++;
3999    }
4000
4001    return NULL;
4002}
4003
/* Lowercase hexadecimal digits, indexed by nibble value (0-15); the
   escape encoders below use it to format \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
4005
4006PyObject *PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4007                                        Py_ssize_t size)
4008{
4009    PyObject *repr;
4010    char *p;
4011
4012#ifdef Py_UNICODE_WIDE
4013    const Py_ssize_t expandsize = 10;
4014#else
4015    const Py_ssize_t expandsize = 6;
4016#endif
4017
4018    /* XXX(nnorwitz): rather than over-allocating, it would be
4019       better to choose a different scheme.  Perhaps scan the
4020       first N-chars of the string and allocate based on that size.
4021    */
4022    /* Initial allocation is based on the longest-possible unichr
4023       escape.
4024
4025       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4026       unichr, so in this case it's the longest unichr escape. In
4027       narrow (UTF-16) builds this is five chars per source unichr
4028       since there are two unichrs in the surrogate pair, so in narrow
4029       (UTF-16) builds it's not the longest unichr escape.
4030
4031       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4032       so in the narrow (UTF-16) build case it's the longest unichr
4033       escape.
4034    */
4035
4036    if (size == 0)
4037        return PyBytes_FromStringAndSize(NULL, 0);
4038
4039    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
4040        return PyErr_NoMemory();
4041
4042    repr = PyBytes_FromStringAndSize(NULL,
4043                                     2
4044                                     + expandsize*size
4045                                     + 1);
4046    if (repr == NULL)
4047        return NULL;
4048
4049    p = PyBytes_AS_STRING(repr);
4050
4051    while (size-- > 0) {
4052        Py_UNICODE ch = *s++;
4053
4054        /* Escape backslashes */
4055        if (ch == '\\') {
4056            *p++ = '\\';
4057            *p++ = (char) ch;
4058            continue;
4059        }
4060
4061#ifdef Py_UNICODE_WIDE
4062        /* Map 21-bit characters to '\U00xxxxxx' */
4063        else if (ch >= 0x10000) {
4064            *p++ = '\\';
4065            *p++ = 'U';
4066            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4067            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4068            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4069            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4070            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4071            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4072            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4073            *p++ = hexdigits[ch & 0x0000000F];
4074            continue;
4075        }
4076#else
4077        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4078        else if (ch >= 0xD800 && ch < 0xDC00) {
4079            Py_UNICODE ch2;
4080            Py_UCS4 ucs;
4081
4082            ch2 = *s++;
4083            size--;
4084            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4085                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4086                *p++ = '\\';
4087                *p++ = 'U';
4088                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4089                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4090                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4091                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4092                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4093                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4094                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4095                *p++ = hexdigits[ucs & 0x0000000F];
4096                continue;
4097            }
4098            /* Fall through: isolated surrogates are copied as-is */
4099            s--;
4100            size++;
4101        }
4102#endif
4103
4104        /* Map 16-bit characters to '\uxxxx' */
4105        if (ch >= 256) {
4106            *p++ = '\\';
4107            *p++ = 'u';
4108            *p++ = hexdigits[(ch >> 12) & 0x000F];
4109            *p++ = hexdigits[(ch >> 8) & 0x000F];
4110            *p++ = hexdigits[(ch >> 4) & 0x000F];
4111            *p++ = hexdigits[ch & 0x000F];
4112        }
4113
4114        /* Map special whitespace to '\t', \n', '\r' */
4115        else if (ch == '\t') {
4116            *p++ = '\\';
4117            *p++ = 't';
4118        }
4119        else if (ch == '\n') {
4120            *p++ = '\\';
4121            *p++ = 'n';
4122        }
4123        else if (ch == '\r') {
4124            *p++ = '\\';
4125            *p++ = 'r';
4126        }
4127
4128        /* Map non-printable US ASCII to '\xhh' */
4129        else if (ch < ' ' || ch >= 0x7F) {
4130            *p++ = '\\';
4131            *p++ = 'x';
4132            *p++ = hexdigits[(ch >> 4) & 0x000F];
4133            *p++ = hexdigits[ch & 0x000F];
4134        }
4135
4136        /* Copy everything else as-is */
4137        else
4138            *p++ = (char) ch;
4139    }
4140
4141    assert(p - PyBytes_AS_STRING(repr) > 0);
4142    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4143        return NULL;
4144    return repr;
4145}
4146
4147PyObject *PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4148{
4149    PyObject *s;
4150    if (!PyUnicode_Check(unicode)) {
4151        PyErr_BadArgument();
4152        return NULL;
4153    }
4154    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4155                                      PyUnicode_GET_SIZE(unicode));
4156    return s;
4157}
4158
4159/* --- Raw Unicode Escape Codec ------------------------------------------- */
4160
4161PyObject *PyUnicode_DecodeRawUnicodeEscape(const char *s,
4162                                           Py_ssize_t size,
4163                                           const char *errors)
4164{
4165    const char *starts = s;
4166    Py_ssize_t startinpos;
4167    Py_ssize_t endinpos;
4168    Py_ssize_t outpos;
4169    PyUnicodeObject *v;
4170    Py_UNICODE *p;
4171    const char *end;
4172    const char *bs;
4173    PyObject *errorHandler = NULL;
4174    PyObject *exc = NULL;
4175
4176    /* Escaped strings will always be longer than the resulting
4177       Unicode string, so we start with size here and then reduce the
4178       length after conversion to the true value. (But decoding error
4179       handler might have to resize the string) */
4180    v = _PyUnicode_New(size);
4181    if (v == NULL)
4182        goto onError;
4183    if (size == 0)
4184        return (PyObject *)v;
4185    p = PyUnicode_AS_UNICODE(v);
4186    end = s + size;
4187    while (s < end) {
4188        unsigned char c;
4189        Py_UCS4 x;
4190        int i;
4191        int count;
4192
4193        /* Non-escape characters are interpreted as Unicode ordinals */
4194        if (*s != '\\') {
4195            *p++ = (unsigned char)*s++;
4196            continue;
4197        }
4198        startinpos = s-starts;
4199
4200        /* \u-escapes are only interpreted iff the number of leading
4201           backslashes if odd */
4202        bs = s;
4203        for (;s < end;) {
4204            if (*s != '\\')
4205                break;
4206            *p++ = (unsigned char)*s++;
4207        }
4208        if (((s - bs) & 1) == 0 ||
4209            s >= end ||
4210            (*s != 'u' && *s != 'U')) {
4211            continue;
4212        }
4213        p--;
4214        count = *s=='u' ? 4 : 8;
4215        s++;
4216
4217        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4218        outpos = p-PyUnicode_AS_UNICODE(v);
4219        for (x = 0, i = 0; i < count; ++i, ++s) {
4220            c = (unsigned char)*s;
4221            if (!Py_ISXDIGIT(c)) {
4222                endinpos = s-starts;
4223                if (unicode_decode_call_errorhandler(
4224                        errors, &errorHandler,
4225                        "rawunicodeescape", "truncated \\uXXXX",
4226                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4227                        &v, &outpos, &p))
4228                    goto onError;
4229                goto nextByte;
4230            }
4231            x = (x<<4) & ~0xF;
4232            if (c >= '0' && c <= '9')
4233                x += c - '0';
4234            else if (c >= 'a' && c <= 'f')
4235                x += 10 + c - 'a';
4236            else
4237                x += 10 + c - 'A';
4238        }
4239        if (x <= 0xffff)
4240            /* UCS-2 character */
4241            *p++ = (Py_UNICODE) x;
4242        else if (x <= 0x10ffff) {
4243            /* UCS-4 character. Either store directly, or as
4244               surrogate pair. */
4245#ifdef Py_UNICODE_WIDE
4246            *p++ = (Py_UNICODE) x;
4247#else
4248            x -= 0x10000L;
4249            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4250            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4251#endif
4252        } else {
4253            endinpos = s-starts;
4254            outpos = p-PyUnicode_AS_UNICODE(v);
4255            if (unicode_decode_call_errorhandler(
4256                    errors, &errorHandler,
4257                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4258                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4259                    &v, &outpos, &p))
4260                goto onError;
4261        }
4262      nextByte:
4263        ;
4264    }
4265    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4266        goto onError;
4267    Py_XDECREF(errorHandler);
4268    Py_XDECREF(exc);
4269    return (PyObject *)v;
4270
4271  onError:
4272    Py_XDECREF(v);
4273    Py_XDECREF(errorHandler);
4274    Py_XDECREF(exc);
4275    return NULL;
4276}
4277
4278PyObject *PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4279                                           Py_ssize_t size)
4280{
4281    PyObject *repr;
4282    char *p;
4283    char *q;
4284
4285#ifdef Py_UNICODE_WIDE
4286    const Py_ssize_t expandsize = 10;
4287#else
4288    const Py_ssize_t expandsize = 6;
4289#endif
4290
4291    if (size > PY_SSIZE_T_MAX / expandsize)
4292        return PyErr_NoMemory();
4293
4294    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4295    if (repr == NULL)
4296        return NULL;
4297    if (size == 0)
4298        return repr;
4299
4300    p = q = PyBytes_AS_STRING(repr);
4301    while (size-- > 0) {
4302        Py_UNICODE ch = *s++;
4303#ifdef Py_UNICODE_WIDE
4304        /* Map 32-bit characters to '\Uxxxxxxxx' */
4305        if (ch >= 0x10000) {
4306            *p++ = '\\';
4307            *p++ = 'U';
4308            *p++ = hexdigits[(ch >> 28) & 0xf];
4309            *p++ = hexdigits[(ch >> 24) & 0xf];
4310            *p++ = hexdigits[(ch >> 20) & 0xf];
4311            *p++ = hexdigits[(ch >> 16) & 0xf];
4312            *p++ = hexdigits[(ch >> 12) & 0xf];
4313            *p++ = hexdigits[(ch >> 8) & 0xf];
4314            *p++ = hexdigits[(ch >> 4) & 0xf];
4315            *p++ = hexdigits[ch & 15];
4316        }
4317        else
4318#else
4319            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4320            if (ch >= 0xD800 && ch < 0xDC00) {
4321                Py_UNICODE ch2;
4322                Py_UCS4 ucs;
4323
4324                ch2 = *s++;
4325                size--;
4326                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4327                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4328                    *p++ = '\\';
4329                    *p++ = 'U';
4330                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4331                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4332                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4333                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4334                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4335                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4336                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4337                    *p++ = hexdigits[ucs & 0xf];
4338                    continue;
4339                }
4340                /* Fall through: isolated surrogates are copied as-is */
4341                s--;
4342                size++;
4343            }
4344#endif
4345        /* Map 16-bit characters to '\uxxxx' */
4346        if (ch >= 256) {
4347            *p++ = '\\';
4348            *p++ = 'u';
4349            *p++ = hexdigits[(ch >> 12) & 0xf];
4350            *p++ = hexdigits[(ch >> 8) & 0xf];
4351            *p++ = hexdigits[(ch >> 4) & 0xf];
4352            *p++ = hexdigits[ch & 15];
4353        }
4354        /* Copy everything else as-is */
4355        else
4356            *p++ = (char) ch;
4357    }
4358    size = p - q;
4359
4360    assert(size > 0);
4361    if (_PyBytes_Resize(&repr, size) < 0)
4362        return NULL;
4363    return repr;
4364}
4365
4366PyObject *PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4367{
4368    PyObject *s;
4369    if (!PyUnicode_Check(unicode)) {
4370        PyErr_BadArgument();
4371        return NULL;
4372    }
4373    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4374                                         PyUnicode_GET_SIZE(unicode));
4375
4376    return s;
4377}
4378
4379/* --- Unicode Internal Codec ------------------------------------------- */
4380
4381PyObject *_PyUnicode_DecodeUnicodeInternal(const char *s,
4382                                           Py_ssize_t size,
4383                                           const char *errors)
4384{
4385    const char *starts = s;
4386    Py_ssize_t startinpos;
4387    Py_ssize_t endinpos;
4388    Py_ssize_t outpos;
4389    PyUnicodeObject *v;
4390    Py_UNICODE *p;
4391    const char *end;
4392    const char *reason;
4393    PyObject *errorHandler = NULL;
4394    PyObject *exc = NULL;
4395
4396#ifdef Py_UNICODE_WIDE
4397    Py_UNICODE unimax = PyUnicode_GetMax();
4398#endif
4399
4400    /* XXX overflow detection missing */
4401    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4402    if (v == NULL)
4403        goto onError;
4404    if (PyUnicode_GetSize((PyObject *)v) == 0)
4405        return (PyObject *)v;
4406    p = PyUnicode_AS_UNICODE(v);
4407    end = s + size;
4408
4409    while (s < end) {
4410        memcpy(p, s, sizeof(Py_UNICODE));
4411        /* We have to sanity check the raw data, otherwise doom looms for
4412           some malformed UCS-4 data. */
4413        if (
4414#ifdef Py_UNICODE_WIDE
4415            *p > unimax || *p < 0 ||
4416#endif
4417            end-s < Py_UNICODE_SIZE
4418            )
4419        {
4420            startinpos = s - starts;
4421            if (end-s < Py_UNICODE_SIZE) {
4422                endinpos = end-starts;
4423                reason = "truncated input";
4424            }
4425            else {
4426                endinpos = s - starts + Py_UNICODE_SIZE;
4427                reason = "illegal code point (> 0x10FFFF)";
4428            }
4429            outpos = p - PyUnicode_AS_UNICODE(v);
4430            if (unicode_decode_call_errorhandler(
4431                    errors, &errorHandler,
4432                    "unicode_internal", reason,
4433                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4434                    &v, &outpos, &p)) {
4435                goto onError;
4436            }
4437        }
4438        else {
4439            p++;
4440            s += Py_UNICODE_SIZE;
4441        }
4442    }
4443
4444    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4445        goto onError;
4446    Py_XDECREF(errorHandler);
4447    Py_XDECREF(exc);
4448    return (PyObject *)v;
4449
4450  onError:
4451    Py_XDECREF(v);
4452    Py_XDECREF(errorHandler);
4453    Py_XDECREF(exc);
4454    return NULL;
4455}
4456
4457/* --- Latin-1 Codec ------------------------------------------------------ */
4458
4459PyObject *PyUnicode_DecodeLatin1(const char *s,
4460                                 Py_ssize_t size,
4461                                 const char *errors)
4462{
4463    PyUnicodeObject *v;
4464    Py_UNICODE *p;
4465    const char *e, *unrolled_end;
4466
4467    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4468    if (size == 1) {
4469        Py_UNICODE r = *(unsigned char*)s;
4470        return PyUnicode_FromUnicode(&r, 1);
4471    }
4472
4473    v = _PyUnicode_New(size);
4474    if (v == NULL)
4475        goto onError;
4476    if (size == 0)
4477        return (PyObject *)v;
4478    p = PyUnicode_AS_UNICODE(v);
4479    e = s + size;
4480    /* Unrolling the copy makes it much faster by reducing the looping
4481       overhead. This is similar to what many memcpy() implementations do. */
4482    unrolled_end = e - 4;
4483    while (s < unrolled_end) {
4484        p[0] = (unsigned char) s[0];
4485        p[1] = (unsigned char) s[1];
4486        p[2] = (unsigned char) s[2];
4487        p[3] = (unsigned char) s[3];
4488        s += 4;
4489        p += 4;
4490    }
4491    while (s < e)
4492        *p++ = (unsigned char) *s++;
4493    return (PyObject *)v;
4494
4495  onError:
4496    Py_XDECREF(v);
4497    return NULL;
4498}
4499
4500/* create or adjust a UnicodeEncodeError */
4501static void make_encode_exception(PyObject **exceptionObject,
4502                                  const char *encoding,
4503                                  const Py_UNICODE *unicode, Py_ssize_t size,
4504                                  Py_ssize_t startpos, Py_ssize_t endpos,
4505                                  const char *reason)
4506{
4507    if (*exceptionObject == NULL) {
4508        *exceptionObject = PyUnicodeEncodeError_Create(
4509            encoding, unicode, size, startpos, endpos, reason);
4510    }
4511    else {
4512        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4513            goto onError;
4514        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4515            goto onError;
4516        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4517            goto onError;
4518        return;
4519      onError:
4520        Py_DECREF(*exceptionObject);
4521        *exceptionObject = NULL;
4522    }
4523}
4524
4525/* raises a UnicodeEncodeError */
4526static void raise_encode_exception(PyObject **exceptionObject,
4527                                   const char *encoding,
4528                                   const Py_UNICODE *unicode, Py_ssize_t size,
4529                                   Py_ssize_t startpos, Py_ssize_t endpos,
4530                                   const char *reason)
4531{
4532    make_encode_exception(exceptionObject,
4533                          encoding, unicode, size, startpos, endpos, reason);
4534    if (*exceptionObject != NULL)
4535        PyCodec_StrictErrors(*exceptionObject);
4536}
4537
4538/* error handling callback helper:
4539   build arguments, call the callback and check the arguments,
4540   put the result into newpos and return the replacement string, which
4541   has to be freed by the caller */
4542static PyObject *unicode_encode_call_errorhandler(const char *errors,
4543                                                  PyObject **errorHandler,
4544                                                  const char *encoding, const char *reason,
4545                                                  const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4546                                                  Py_ssize_t startpos, Py_ssize_t endpos,
4547                                                  Py_ssize_t *newpos)
4548{
4549    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4550
4551    PyObject *restuple;
4552    PyObject *resunicode;
4553
4554    if (*errorHandler == NULL) {
4555        *errorHandler = PyCodec_LookupError(errors);
4556        if (*errorHandler == NULL)
4557            return NULL;
4558    }
4559
4560    make_encode_exception(exceptionObject,
4561                          encoding, unicode, size, startpos, endpos, reason);
4562    if (*exceptionObject == NULL)
4563        return NULL;
4564
4565    restuple = PyObject_CallFunctionObjArgs(
4566        *errorHandler, *exceptionObject, NULL);
4567    if (restuple == NULL)
4568        return NULL;
4569    if (!PyTuple_Check(restuple)) {
4570        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4571        Py_DECREF(restuple);
4572        return NULL;
4573    }
4574    if (!PyArg_ParseTuple(restuple, argparse,
4575                          &resunicode, newpos)) {
4576        Py_DECREF(restuple);
4577        return NULL;
4578    }
4579    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4580        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4581        Py_DECREF(restuple);
4582        return NULL;
4583    }
4584    if (*newpos<0)
4585        *newpos = size+*newpos;
4586    if (*newpos<0 || *newpos>size) {
4587        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4588        Py_DECREF(restuple);
4589        return NULL;
4590    }
4591    Py_INCREF(resunicode);
4592    Py_DECREF(restuple);
4593    return resunicode;
4594}
4595
4596static PyObject *unicode_encode_ucs1(const Py_UNICODE *p,
4597                                     Py_ssize_t size,
4598                                     const char *errors,
4599                                     int limit)
4600{
4601    /* output object */
4602    PyObject *res;
4603    /* pointers to the beginning and end+1 of input */
4604    const Py_UNICODE *startp = p;
4605    const Py_UNICODE *endp = p + size;
4606    /* pointer to the beginning of the unencodable characters */
4607    /* const Py_UNICODE *badp = NULL; */
4608    /* pointer into the output */
4609    char *str;
4610    /* current output position */
4611    Py_ssize_t ressize;
4612    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4613    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4614    PyObject *errorHandler = NULL;
4615    PyObject *exc = NULL;
4616    /* the following variable is used for caching string comparisons
4617     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4618    int known_errorHandler = -1;
4619
4620    /* allocate enough for a simple encoding without
4621       replacements, if we need more, we'll resize */
4622    if (size == 0)
4623        return PyBytes_FromStringAndSize(NULL, 0);
4624    res = PyBytes_FromStringAndSize(NULL, size);
4625    if (res == NULL)
4626        return NULL;
4627    str = PyBytes_AS_STRING(res);
4628    ressize = size;
4629
4630    while (p<endp) {
4631        Py_UNICODE c = *p;
4632
4633        /* can we encode this? */
4634        if (c<limit) {
4635            /* no overflow check, because we know that the space is enough */
4636            *str++ = (char)c;
4637            ++p;
4638        }
4639        else {
4640            Py_ssize_t unicodepos = p-startp;
4641            Py_ssize_t requiredsize;
4642            PyObject *repunicode;
4643            Py_ssize_t repsize;
4644            Py_ssize_t newpos;
4645            Py_ssize_t respos;
4646            Py_UNICODE *uni2;
4647            /* startpos for collecting unencodable chars */
4648            const Py_UNICODE *collstart = p;
4649            const Py_UNICODE *collend = p;
4650            /* find all unecodable characters */
4651            while ((collend < endp) && ((*collend)>=limit))
4652                ++collend;
4653            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4654            if (known_errorHandler==-1) {
4655                if ((errors==NULL) || (!strcmp(errors, "strict")))
4656                    known_errorHandler = 1;
4657                else if (!strcmp(errors, "replace"))
4658                    known_errorHandler = 2;
4659                else if (!strcmp(errors, "ignore"))
4660                    known_errorHandler = 3;
4661                else if (!strcmp(errors, "xmlcharrefreplace"))
4662                    known_errorHandler = 4;
4663                else
4664                    known_errorHandler = 0;
4665            }
4666            switch (known_errorHandler) {
4667            case 1: /* strict */
4668                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4669                goto onError;
4670            case 2: /* replace */
4671                while (collstart++<collend)
4672                    *str++ = '?'; /* fall through */
4673            case 3: /* ignore */
4674                p = collend;
4675                break;
4676            case 4: /* xmlcharrefreplace */
4677                respos = str - PyBytes_AS_STRING(res);
4678                /* determine replacement size (temporarily (mis)uses p) */
4679                for (p = collstart, repsize = 0; p < collend; ++p) {
4680                    if (*p<10)
4681                        repsize += 2+1+1;
4682                    else if (*p<100)
4683                        repsize += 2+2+1;
4684                    else if (*p<1000)
4685                        repsize += 2+3+1;
4686                    else if (*p<10000)
4687                        repsize += 2+4+1;
4688#ifndef Py_UNICODE_WIDE
4689                    else
4690                        repsize += 2+5+1;
4691#else
4692                    else if (*p<100000)
4693                        repsize += 2+5+1;
4694                    else if (*p<1000000)
4695                        repsize += 2+6+1;
4696                    else
4697                        repsize += 2+7+1;
4698#endif
4699                }
4700                requiredsize = respos+repsize+(endp-collend);
4701                if (requiredsize > ressize) {
4702                    if (requiredsize<2*ressize)
4703                        requiredsize = 2*ressize;
4704                    if (_PyBytes_Resize(&res, requiredsize))
4705                        goto onError;
4706                    str = PyBytes_AS_STRING(res) + respos;
4707                    ressize = requiredsize;
4708                }
4709                /* generate replacement (temporarily (mis)uses p) */
4710                for (p = collstart; p < collend; ++p) {
4711                    str += sprintf(str, "&#%d;", (int)*p);
4712                }
4713                p = collend;
4714                break;
4715            default:
4716                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4717                                                              encoding, reason, startp, size, &exc,
4718                                                              collstart-startp, collend-startp, &newpos);
4719                if (repunicode == NULL)
4720                    goto onError;
4721                if (PyBytes_Check(repunicode)) {
4722                    /* Directly copy bytes result to output. */
4723                    repsize = PyBytes_Size(repunicode);
4724                    if (repsize > 1) {
4725                        /* Make room for all additional bytes. */
4726                        respos = str - PyBytes_AS_STRING(res);
4727                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4728                            Py_DECREF(repunicode);
4729                            goto onError;
4730                        }
4731                        str = PyBytes_AS_STRING(res) + respos;
4732                        ressize += repsize-1;
4733                    }
4734                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4735                    str += repsize;
4736                    p = startp + newpos;
4737                    Py_DECREF(repunicode);
4738                    break;
4739                }
4740                /* need more space? (at least enough for what we
4741                   have+the replacement+the rest of the string, so
4742                   we won't have to check space for encodable characters) */
4743                respos = str - PyBytes_AS_STRING(res);
4744                repsize = PyUnicode_GET_SIZE(repunicode);
4745                requiredsize = respos+repsize+(endp-collend);
4746                if (requiredsize > ressize) {
4747                    if (requiredsize<2*ressize)
4748                        requiredsize = 2*ressize;
4749                    if (_PyBytes_Resize(&res, requiredsize)) {
4750                        Py_DECREF(repunicode);
4751                        goto onError;
4752                    }
4753                    str = PyBytes_AS_STRING(res) + respos;
4754                    ressize = requiredsize;
4755                }
4756                /* check if there is anything unencodable in the replacement
4757                   and copy it to the output */
4758                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4759                    c = *uni2;
4760                    if (c >= limit) {
4761                        raise_encode_exception(&exc, encoding, startp, size,
4762                                               unicodepos, unicodepos+1, reason);
4763                        Py_DECREF(repunicode);
4764                        goto onError;
4765                    }
4766                    *str = (char)c;
4767                }
4768                p = startp + newpos;
4769                Py_DECREF(repunicode);
4770            }
4771        }
4772    }
4773    /* Resize if we allocated to much */
4774    size = str - PyBytes_AS_STRING(res);
4775    if (size < ressize) { /* If this falls res will be NULL */
4776        assert(size >= 0);
4777        if (_PyBytes_Resize(&res, size) < 0)
4778            goto onError;
4779    }
4780
4781    Py_XDECREF(errorHandler);
4782    Py_XDECREF(exc);
4783    return res;
4784
4785  onError:
4786    Py_XDECREF(res);
4787    Py_XDECREF(errorHandler);
4788    Py_XDECREF(exc);
4789    return NULL;
4790}
4791
4792PyObject *PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4793                                 Py_ssize_t size,
4794                                 const char *errors)
4795{
4796    return unicode_encode_ucs1(p, size, errors, 256);
4797}
4798
4799PyObject *PyUnicode_AsLatin1String(PyObject *unicode)
4800{
4801    if (!PyUnicode_Check(unicode)) {
4802        PyErr_BadArgument();
4803        return NULL;
4804    }
4805    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4806                                  PyUnicode_GET_SIZE(unicode),
4807                                  NULL);
4808}
4809
4810/* --- 7-bit ASCII Codec -------------------------------------------------- */
4811
4812PyObject *PyUnicode_DecodeASCII(const char *s,
4813                                Py_ssize_t size,
4814                                const char *errors)
4815{
4816    const char *starts = s;
4817    PyUnicodeObject *v;
4818    Py_UNICODE *p;
4819    Py_ssize_t startinpos;
4820    Py_ssize_t endinpos;
4821    Py_ssize_t outpos;
4822    const char *e;
4823    PyObject *errorHandler = NULL;
4824    PyObject *exc = NULL;
4825
4826    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4827    if (size == 1 && *(unsigned char*)s < 128) {
4828        Py_UNICODE r = *(unsigned char*)s;
4829        return PyUnicode_FromUnicode(&r, 1);
4830    }
4831
4832    v = _PyUnicode_New(size);
4833    if (v == NULL)
4834        goto onError;
4835    if (size == 0)
4836        return (PyObject *)v;
4837    p = PyUnicode_AS_UNICODE(v);
4838    e = s + size;
4839    while (s < e) {
4840        register unsigned char c = (unsigned char)*s;
4841        if (c < 128) {
4842            *p++ = c;
4843            ++s;
4844        }
4845        else {
4846            startinpos = s-starts;
4847            endinpos = startinpos + 1;
4848            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4849            if (unicode_decode_call_errorhandler(
4850                    errors, &errorHandler,
4851                    "ascii", "ordinal not in range(128)",
4852                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4853                    &v, &outpos, &p))
4854                goto onError;
4855        }
4856    }
4857    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4858        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4859            goto onError;
4860    Py_XDECREF(errorHandler);
4861    Py_XDECREF(exc);
4862    return (PyObject *)v;
4863
4864  onError:
4865    Py_XDECREF(v);
4866    Py_XDECREF(errorHandler);
4867    Py_XDECREF(exc);
4868    return NULL;
4869}
4870
4871PyObject *PyUnicode_EncodeASCII(const Py_UNICODE *p,
4872                                Py_ssize_t size,
4873                                const char *errors)
4874{
4875    return unicode_encode_ucs1(p, size, errors, 128);
4876}
4877
4878PyObject *PyUnicode_AsASCIIString(PyObject *unicode)
4879{
4880    if (!PyUnicode_Check(unicode)) {
4881        PyErr_BadArgument();
4882        return NULL;
4883    }
4884    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4885                                 PyUnicode_GET_SIZE(unicode),
4886                                 NULL);
4887}
4888
4889#if defined(MS_WINDOWS) && defined(HAVE_USABLE_WCHAR_T)
4890
4891/* --- MBCS codecs for Windows -------------------------------------------- */
4892
4893#if SIZEOF_INT < SIZEOF_SIZE_T
4894#define NEED_RETRY
4895#endif
4896
4897/* XXX This code is limited to "true" double-byte encodings, as
4898   a) it assumes an incomplete character consists of a single byte, and
4899   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4900   encodings, see IsDBCSLeadByteEx documentation. */
4901
/* Return 1 if the byte at s[offset] should be treated as a DBCS lead
   byte, 0 otherwise.

   IsDBCSLeadByte() alone only looks at the byte value, so the previous
   character boundary (as reported by CharPrev) is consulted as well. */
static int is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev did not move back -- presumably curr is the start of
       the string (TODO confirm against the CharPrev documentation). */
    if (prev == curr)
        return 1;
    /* The preceding character does not begin with a lead byte. */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* The preceding character is two bytes long and ends right before
       curr. */
    return (curr - prev == 2);
}
4912
4913/*
4914 * Decode MBCS string into unicode object. If 'final' is set, converts
4915 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4916 */
4917static int decode_mbcs(PyUnicodeObject **v,
4918                       const char *s, /* MBCS string */
4919                       int size, /* sizeof MBCS string */
4920                       int final,
4921                       const char *errors)
4922{
4923    Py_UNICODE *p;
4924    Py_ssize_t n;
4925    DWORD usize;
4926    DWORD flags;
4927
4928    assert(size >= 0);
4929
4930    /* check and handle 'errors' arg */
4931    if (errors==NULL || strcmp(errors, "strict")==0)
4932        flags = MB_ERR_INVALID_CHARS;
4933    else if (strcmp(errors, "ignore")==0)
4934        flags = 0;
4935    else {
4936        PyErr_Format(PyExc_ValueError,
4937                     "mbcs encoding does not support errors='%s'",
4938                     errors);
4939        return -1;
4940    }
4941
4942    /* Skip trailing lead-byte unless 'final' is set */
4943    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
4944        --size;
4945
4946    /* First get the size of the result */
4947    if (size > 0) {
4948        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
4949        if (usize==0)
4950            goto mbcs_decode_error;
4951    } else
4952        usize = 0;
4953
4954    if (*v == NULL) {
4955        /* Create unicode object */
4956        *v = _PyUnicode_New(usize);
4957        if (*v == NULL)
4958            return -1;
4959        n = 0;
4960    }
4961    else {
4962        /* Extend unicode object */
4963        n = PyUnicode_GET_SIZE(*v);
4964        if (_PyUnicode_Resize(v, n + usize) < 0)
4965            return -1;
4966    }
4967
4968    /* Do the conversion */
4969    if (usize > 0) {
4970        p = PyUnicode_AS_UNICODE(*v) + n;
4971        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
4972            goto mbcs_decode_error;
4973        }
4974    }
4975    return size;
4976
4977mbcs_decode_error:
4978    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
4979       we raise a UnicodeDecodeError - else it is a 'generic'
4980       windows error
4981     */
4982    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
4983        /* Ideally, we should get reason from FormatMessage - this
4984           is the Windows 2000 English version of the message
4985        */
4986        PyObject *exc = NULL;
4987        const char *reason = "No mapping for the Unicode character exists "
4988                             "in the target multi-byte code page.";
4989        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
4990        if (exc != NULL) {
4991            PyCodec_StrictErrors(exc);
4992            Py_DECREF(exc);
4993        }
4994    } else {
4995        PyErr_SetFromWindowsErrWithFilename(0, NULL);
4996    }
4997    return -1;
4998}
4999
5000PyObject *PyUnicode_DecodeMBCSStateful(const char *s,
5001                                       Py_ssize_t size,
5002                                       const char *errors,
5003                                       Py_ssize_t *consumed)
5004{
5005    PyUnicodeObject *v = NULL;
5006    int done;
5007
5008    if (consumed)
5009        *consumed = 0;
5010
5011#ifdef NEED_RETRY
5012  retry:
5013    if (size > INT_MAX)
5014        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
5015    else
5016#endif
5017        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
5018
5019    if (done < 0) {
5020        Py_XDECREF(v);
5021        return NULL;
5022    }
5023
5024    if (consumed)
5025        *consumed += done;
5026
5027#ifdef NEED_RETRY
5028    if (size > INT_MAX) {
5029        s += done;
5030        size -= done;
5031        goto retry;
5032    }
5033#endif
5034
5035    return (PyObject *)v;
5036}
5037
5038PyObject *PyUnicode_DecodeMBCS(const char *s,
5039                               Py_ssize_t size,
5040                               const char *errors)
5041{
5042    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5043}
5044
5045/*
5046 * Convert unicode into string object (MBCS).
5047 * Returns 0 if succeed, -1 otherwise.
5048 */
5049static int encode_mbcs(PyObject **repr,
5050                       const Py_UNICODE *p, /* unicode */
5051                       int size, /* size of unicode */
5052                       const char* errors)
5053{
5054    BOOL usedDefaultChar = FALSE;
5055    BOOL *pusedDefaultChar;
5056    int mbcssize;
5057    Py_ssize_t n;
5058    PyObject *exc = NULL;
5059    DWORD flags;
5060
5061    assert(size >= 0);
5062
5063    /* check and handle 'errors' arg */
5064    if (errors==NULL || strcmp(errors, "strict")==0) {
5065        flags = WC_NO_BEST_FIT_CHARS;
5066        pusedDefaultChar = &usedDefaultChar;
5067    } else if (strcmp(errors, "replace")==0) {
5068        flags = 0;
5069        pusedDefaultChar = NULL;
5070    } else {
5071         PyErr_Format(PyExc_ValueError,
5072                      "mbcs encoding does not support errors='%s'",
5073                      errors);
5074         return -1;
5075    }
5076
5077    /* First get the size of the result */
5078    if (size > 0) {
5079        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5080                                       NULL, pusedDefaultChar);
5081        if (mbcssize == 0) {
5082            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5083            return -1;
5084        }
5085        /* If we used a default char, then we failed! */
5086        if (pusedDefaultChar && *pusedDefaultChar)
5087            goto mbcs_encode_error;
5088    } else {
5089        mbcssize = 0;
5090    }
5091
5092    if (*repr == NULL) {
5093        /* Create string object */
5094        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5095        if (*repr == NULL)
5096            return -1;
5097        n = 0;
5098    }
5099    else {
5100        /* Extend string object */
5101        n = PyBytes_Size(*repr);
5102        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5103            return -1;
5104    }
5105
5106    /* Do the conversion */
5107    if (size > 0) {
5108        char *s = PyBytes_AS_STRING(*repr) + n;
5109        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5110                                     NULL, pusedDefaultChar)) {
5111            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5112            return -1;
5113        }
5114        if (pusedDefaultChar && *pusedDefaultChar)
5115            goto mbcs_encode_error;
5116    }
5117    return 0;
5118
5119mbcs_encode_error:
5120    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5121    Py_XDECREF(exc);
5122    return -1;
5123}
5124
5125PyObject *PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5126                               Py_ssize_t size,
5127                               const char *errors)
5128{
5129    PyObject *repr = NULL;
5130    int ret;
5131
5132#ifdef NEED_RETRY
5133  retry:
5134    if (size > INT_MAX)
5135        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5136    else
5137#endif
5138        ret = encode_mbcs(&repr, p, (int)size, errors);
5139
5140    if (ret < 0) {
5141        Py_XDECREF(repr);
5142        return NULL;
5143    }
5144
5145#ifdef NEED_RETRY
5146    if (size > INT_MAX) {
5147        p += INT_MAX;
5148        size -= INT_MAX;
5149        goto retry;
5150    }
5151#endif
5152
5153    return repr;
5154}
5155
5156PyObject *PyUnicode_AsMBCSString(PyObject *unicode)
5157{
5158    if (!PyUnicode_Check(unicode)) {
5159        PyErr_BadArgument();
5160        return NULL;
5161    }
5162    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5163                                PyUnicode_GET_SIZE(unicode),
5164                                NULL);
5165}
5166
5167#undef NEED_RETRY
5168
5169#endif /* MS_WINDOWS */
5170
5171/* --- Character Mapping Codec -------------------------------------------- */
5172
5173PyObject *PyUnicode_DecodeCharmap(const char *s,
5174                                  Py_ssize_t size,
5175                                  PyObject *mapping,
5176                                  const char *errors)
5177{
5178    const char *starts = s;
5179    Py_ssize_t startinpos;
5180    Py_ssize_t endinpos;
5181    Py_ssize_t outpos;
5182    const char *e;
5183    PyUnicodeObject *v;
5184    Py_UNICODE *p;
5185    Py_ssize_t extrachars = 0;
5186    PyObject *errorHandler = NULL;
5187    PyObject *exc = NULL;
5188    Py_UNICODE *mapstring = NULL;
5189    Py_ssize_t maplen = 0;
5190
5191    /* Default to Latin-1 */
5192    if (mapping == NULL)
5193        return PyUnicode_DecodeLatin1(s, size, errors);
5194
5195    v = _PyUnicode_New(size);
5196    if (v == NULL)
5197        goto onError;
5198    if (size == 0)
5199        return (PyObject *)v;
5200    p = PyUnicode_AS_UNICODE(v);
5201    e = s + size;
5202    if (PyUnicode_CheckExact(mapping)) {
5203        mapstring = PyUnicode_AS_UNICODE(mapping);
5204        maplen = PyUnicode_GET_SIZE(mapping);
5205        while (s < e) {
5206            unsigned char ch = *s;
5207            Py_UNICODE x = 0xfffe; /* illegal value */
5208
5209            if (ch < maplen)
5210                x = mapstring[ch];
5211
5212            if (x == 0xfffe) {
5213                /* undefined mapping */
5214                outpos = p-PyUnicode_AS_UNICODE(v);
5215                startinpos = s-starts;
5216                endinpos = startinpos+1;
5217                if (unicode_decode_call_errorhandler(
5218                        errors, &errorHandler,
5219                        "charmap", "character maps to <undefined>",
5220                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5221                        &v, &outpos, &p)) {
5222                    goto onError;
5223                }
5224                continue;
5225            }
5226            *p++ = x;
5227            ++s;
5228        }
5229    }
5230    else {
5231        while (s < e) {
5232            unsigned char ch = *s;
5233            PyObject *w, *x;
5234
5235            /* Get mapping (char ordinal -> integer, Unicode char or None) */
5236            w = PyLong_FromLong((long)ch);
5237            if (w == NULL)
5238                goto onError;
5239            x = PyObject_GetItem(mapping, w);
5240            Py_DECREF(w);
5241            if (x == NULL) {
5242                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5243                    /* No mapping found means: mapping is undefined. */
5244                    PyErr_Clear();
5245                    x = Py_None;
5246                    Py_INCREF(x);
5247                } else
5248                    goto onError;
5249            }
5250
5251            /* Apply mapping */
5252            if (PyLong_Check(x)) {
5253                long value = PyLong_AS_LONG(x);
5254                if (value < 0 || value > 65535) {
5255                    PyErr_SetString(PyExc_TypeError,
5256                                    "character mapping must be in range(65536)");
5257                    Py_DECREF(x);
5258                    goto onError;
5259                }
5260                *p++ = (Py_UNICODE)value;
5261            }
5262            else if (x == Py_None) {
5263                /* undefined mapping */
5264                outpos = p-PyUnicode_AS_UNICODE(v);
5265                startinpos = s-starts;
5266                endinpos = startinpos+1;
5267                if (unicode_decode_call_errorhandler(
5268                        errors, &errorHandler,
5269                        "charmap", "character maps to <undefined>",
5270                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5271                        &v, &outpos, &p)) {
5272                    Py_DECREF(x);
5273                    goto onError;
5274                }
5275                Py_DECREF(x);
5276                continue;
5277            }
5278            else if (PyUnicode_Check(x)) {
5279                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
5280
5281                if (targetsize == 1)
5282                    /* 1-1 mapping */
5283                    *p++ = *PyUnicode_AS_UNICODE(x);
5284
5285                else if (targetsize > 1) {
5286                    /* 1-n mapping */
5287                    if (targetsize > extrachars) {
5288                        /* resize first */
5289                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5290                        Py_ssize_t needed = (targetsize - extrachars) + \
5291                            (targetsize << 2);
5292                        extrachars += needed;
5293                        /* XXX overflow detection missing */
5294                        if (_PyUnicode_Resize(&v,
5295                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
5296                            Py_DECREF(x);
5297                            goto onError;
5298                        }
5299                        p = PyUnicode_AS_UNICODE(v) + oldpos;
5300                    }
5301                    Py_UNICODE_COPY(p,
5302                                    PyUnicode_AS_UNICODE(x),
5303                                    targetsize);
5304                    p += targetsize;
5305                    extrachars -= targetsize;
5306                }
5307                /* 1-0 mapping: skip the character */
5308            }
5309            else {
5310                /* wrong return value */
5311                PyErr_SetString(PyExc_TypeError,
5312                                "character mapping must return integer, None or str");
5313                Py_DECREF(x);
5314                goto onError;
5315            }
5316            Py_DECREF(x);
5317            ++s;
5318        }
5319    }
5320    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
5321        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5322            goto onError;
5323    Py_XDECREF(errorHandler);
5324    Py_XDECREF(exc);
5325    return (PyObject *)v;
5326
5327  onError:
5328    Py_XDECREF(errorHandler);
5329    Py_XDECREF(exc);
5330    Py_XDECREF(v);
5331    return NULL;
5332}
5333
5334/* Charmap encoding: the lookup table */
5335
5336struct encoding_map{
5337    PyObject_HEAD
5338    unsigned char level1[32];
5339    int count2, count3;
5340    unsigned char level23[1];
5341};
5342
5343static PyObject*
5344encoding_map_size(PyObject *obj, PyObject* args)
5345{
5346    struct encoding_map *map = (struct encoding_map*)obj;
5347    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5348                           128*map->count3);
5349}
5350
5351static PyMethodDef encoding_map_methods[] = {
5352    {"size", encoding_map_size, METH_NOARGS,
5353     PyDoc_STR("Return the size (in bytes) of this object") },
5354    { 0 }
5355};
5356
5357static void
5358encoding_map_dealloc(PyObject* o)
5359{
5360    PyObject_FREE(o);
5361}
5362
5363static PyTypeObject EncodingMapType = {
5364    PyVarObject_HEAD_INIT(NULL, 0)
5365    "EncodingMap",          /*tp_name*/
5366    sizeof(struct encoding_map),   /*tp_basicsize*/
5367    0,                      /*tp_itemsize*/
5368    /* methods */
5369    encoding_map_dealloc,   /*tp_dealloc*/
5370    0,                      /*tp_print*/
5371    0,                      /*tp_getattr*/
5372    0,                      /*tp_setattr*/
5373    0,                      /*tp_reserved*/
5374    0,                      /*tp_repr*/
5375    0,                      /*tp_as_number*/
5376    0,                      /*tp_as_sequence*/
5377    0,                      /*tp_as_mapping*/
5378    0,                      /*tp_hash*/
5379    0,                      /*tp_call*/
5380    0,                      /*tp_str*/
5381    0,                      /*tp_getattro*/
5382    0,                      /*tp_setattro*/
5383    0,                      /*tp_as_buffer*/
5384    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
5385    0,                      /*tp_doc*/
5386    0,                      /*tp_traverse*/
5387    0,                      /*tp_clear*/
5388    0,                      /*tp_richcompare*/
5389    0,                      /*tp_weaklistoffset*/
5390    0,                      /*tp_iter*/
5391    0,                      /*tp_iternext*/
5392    encoding_map_methods,   /*tp_methods*/
5393    0,                      /*tp_members*/
5394    0,                      /*tp_getset*/
5395    0,                      /*tp_base*/
5396    0,                      /*tp_dict*/
5397    0,                      /*tp_descr_get*/
5398    0,                      /*tp_descr_set*/
5399    0,                      /*tp_dictoffset*/
5400    0,                      /*tp_init*/
5401    0,                      /*tp_alloc*/
5402    0,                      /*tp_new*/
5403    0,                      /*tp_free*/
5404    0,                      /*tp_is_gc*/
5405};
5406
5407PyObject*
5408PyUnicode_BuildEncodingMap(PyObject* string)
5409{
5410    Py_UNICODE *decode;
5411    PyObject *result;
5412    struct encoding_map *mresult;
5413    int i;
5414    int need_dict = 0;
5415    unsigned char level1[32];
5416    unsigned char level2[512];
5417    unsigned char *mlevel1, *mlevel2, *mlevel3;
5418    int count2 = 0, count3 = 0;
5419
5420    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5421        PyErr_BadArgument();
5422        return NULL;
5423    }
5424    decode = PyUnicode_AS_UNICODE(string);
5425    memset(level1, 0xFF, sizeof level1);
5426    memset(level2, 0xFF, sizeof level2);
5427
5428    /* If there isn't a one-to-one mapping of NULL to \0,
5429       or if there are non-BMP characters, we need to use
5430       a mapping dictionary. */
5431    if (decode[0] != 0)
5432        need_dict = 1;
5433    for (i = 1; i < 256; i++) {
5434        int l1, l2;
5435        if (decode[i] == 0
5436#ifdef Py_UNICODE_WIDE
5437            || decode[i] > 0xFFFF
5438#endif
5439            ) {
5440            need_dict = 1;
5441            break;
5442        }
5443        if (decode[i] == 0xFFFE)
5444            /* unmapped character */
5445            continue;
5446        l1 = decode[i] >> 11;
5447        l2 = decode[i] >> 7;
5448        if (level1[l1] == 0xFF)
5449            level1[l1] = count2++;
5450        if (level2[l2] == 0xFF)
5451            level2[l2] = count3++;
5452    }
5453
5454    if (count2 >= 0xFF || count3 >= 0xFF)
5455        need_dict = 1;
5456
5457    if (need_dict) {
5458        PyObject *result = PyDict_New();
5459        PyObject *key, *value;
5460        if (!result)
5461            return NULL;
5462        for (i = 0; i < 256; i++) {
5463            key = value = NULL;
5464            key = PyLong_FromLong(decode[i]);
5465            value = PyLong_FromLong(i);
5466            if (!key || !value)
5467                goto failed1;
5468            if (PyDict_SetItem(result, key, value) == -1)
5469                goto failed1;
5470            Py_DECREF(key);
5471            Py_DECREF(value);
5472        }
5473        return result;
5474      failed1:
5475        Py_XDECREF(key);
5476        Py_XDECREF(value);
5477        Py_DECREF(result);
5478        return NULL;
5479    }
5480
5481    /* Create a three-level trie */
5482    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5483                             16*count2 + 128*count3 - 1);
5484    if (!result)
5485        return PyErr_NoMemory();
5486    PyObject_Init(result, &EncodingMapType);
5487    mresult = (struct encoding_map*)result;
5488    mresult->count2 = count2;
5489    mresult->count3 = count3;
5490    mlevel1 = mresult->level1;
5491    mlevel2 = mresult->level23;
5492    mlevel3 = mresult->level23 + 16*count2;
5493    memcpy(mlevel1, level1, 32);
5494    memset(mlevel2, 0xFF, 16*count2);
5495    memset(mlevel3, 0, 128*count3);
5496    count3 = 0;
5497    for (i = 1; i < 256; i++) {
5498        int o1, o2, o3, i2, i3;
5499        if (decode[i] == 0xFFFE)
5500            /* unmapped character */
5501            continue;
5502        o1 = decode[i]>>11;
5503        o2 = (decode[i]>>7) & 0xF;
5504        i2 = 16*mlevel1[o1] + o2;
5505        if (mlevel2[i2] == 0xFF)
5506            mlevel2[i2] = count3++;
5507        o3 = decode[i] & 0x7F;
5508        i3 = 128*mlevel2[i2] + o3;
5509        mlevel3[i3] = i;
5510    }
5511    return result;
5512}
5513
5514static int
5515encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5516{
5517    struct encoding_map *map = (struct encoding_map*)mapping;
5518    int l1 = c>>11;
5519    int l2 = (c>>7) & 0xF;
5520    int l3 = c & 0x7F;
5521    int i;
5522
5523#ifdef Py_UNICODE_WIDE
5524    if (c > 0xFFFF) {
5525        return -1;
5526    }
5527#endif
5528    if (c == 0)
5529        return 0;
5530    /* level 1*/
5531    i = map->level1[l1];
5532    if (i == 0xFF) {
5533        return -1;
5534    }
5535    /* level 2*/
5536    i = map->level23[16*i+l2];
5537    if (i == 0xFF) {
5538        return -1;
5539    }
5540    /* level 3 */
5541    i = map->level23[16*map->count2 + 128*i + l3];
5542    if (i == 0) {
5543        return -1;
5544    }
5545    return i;
5546}
5547
5548/* Lookup the character ch in the mapping. If the character
5549   can't be found, Py_None is returned (or NULL, if another
5550   error occurred). */
5551static PyObject *charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5552{
5553    PyObject *w = PyLong_FromLong((long)c);
5554    PyObject *x;
5555
5556    if (w == NULL)
5557        return NULL;
5558    x = PyObject_GetItem(mapping, w);
5559    Py_DECREF(w);
5560    if (x == NULL) {
5561        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5562            /* No mapping found means: mapping is undefined. */
5563            PyErr_Clear();
5564            x = Py_None;
5565            Py_INCREF(x);
5566            return x;
5567        } else
5568            return NULL;
5569    }
5570    else if (x == Py_None)
5571        return x;
5572    else if (PyLong_Check(x)) {
5573        long value = PyLong_AS_LONG(x);
5574        if (value < 0 || value > 255) {
5575            PyErr_SetString(PyExc_TypeError,
5576                            "character mapping must be in range(256)");
5577            Py_DECREF(x);
5578            return NULL;
5579        }
5580        return x;
5581    }
5582    else if (PyBytes_Check(x))
5583        return x;
5584    else {
5585        /* wrong return value */
5586        PyErr_Format(PyExc_TypeError,
5587                     "character mapping must return integer, bytes or None, not %.400s",
5588                     x->ob_type->tp_name);
5589        Py_DECREF(x);
5590        return NULL;
5591    }
5592}
5593
5594static int
5595charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5596{
5597    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5598    /* exponentially overallocate to minimize reallocations */
5599    if (requiredsize < 2*outsize)
5600        requiredsize = 2*outsize;
5601    if (_PyBytes_Resize(outobj, requiredsize))
5602        return -1;
5603    return 0;
5604}
5605
5606typedef enum charmapencode_result {
5607    enc_SUCCESS, enc_FAILED, enc_EXCEPTION
5608}charmapencode_result;
5609/* lookup the character, put the result in the output string and adjust
5610   various state variables. Resize the output bytes object if not enough
5611   space is available. Return a new reference to the object that
5612   was put in the output buffer, or Py_None, if the mapping was undefined
5613   (in which case no character was written) or NULL, if a
5614   reallocation error occurred. The caller must decref the result */
5615static
5616charmapencode_result charmapencode_output(Py_UNICODE c, PyObject *mapping,
5617                                          PyObject **outobj, Py_ssize_t *outpos)
5618{
5619    PyObject *rep;
5620    char *outstart;
5621    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5622
5623    if (Py_TYPE(mapping) == &EncodingMapType) {
5624        int res = encoding_map_lookup(c, mapping);
5625        Py_ssize_t requiredsize = *outpos+1;
5626        if (res == -1)
5627            return enc_FAILED;
5628        if (outsize<requiredsize)
5629            if (charmapencode_resize(outobj, outpos, requiredsize))
5630                return enc_EXCEPTION;
5631        outstart = PyBytes_AS_STRING(*outobj);
5632        outstart[(*outpos)++] = (char)res;
5633        return enc_SUCCESS;
5634    }
5635
5636    rep = charmapencode_lookup(c, mapping);
5637    if (rep==NULL)
5638        return enc_EXCEPTION;
5639    else if (rep==Py_None) {
5640        Py_DECREF(rep);
5641        return enc_FAILED;
5642    } else {
5643        if (PyLong_Check(rep)) {
5644            Py_ssize_t requiredsize = *outpos+1;
5645            if (outsize<requiredsize)
5646                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5647                    Py_DECREF(rep);
5648                    return enc_EXCEPTION;
5649                }
5650            outstart = PyBytes_AS_STRING(*outobj);
5651            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5652        }
5653        else {
5654            const char *repchars = PyBytes_AS_STRING(rep);
5655            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5656            Py_ssize_t requiredsize = *outpos+repsize;
5657            if (outsize<requiredsize)
5658                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5659                    Py_DECREF(rep);
5660                    return enc_EXCEPTION;
5661                }
5662            outstart = PyBytes_AS_STRING(*outobj);
5663            memcpy(outstart + *outpos, repchars, repsize);
5664            *outpos += repsize;
5665        }
5666    }
5667    Py_DECREF(rep);
5668    return enc_SUCCESS;
5669}
5670
5671/* handle an error in PyUnicode_EncodeCharmap
5672   Return 0 on success, -1 on error */
5673static
5674int charmap_encoding_error(
5675    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5676    PyObject **exceptionObject,
5677    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5678    PyObject **res, Py_ssize_t *respos)
5679{
5680    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5681    Py_ssize_t repsize;
5682    Py_ssize_t newpos;
5683    Py_UNICODE *uni2;
5684    /* startpos for collecting unencodable chars */
5685    Py_ssize_t collstartpos = *inpos;
5686    Py_ssize_t collendpos = *inpos+1;
5687    Py_ssize_t collpos;
5688    char *encoding = "charmap";
5689    char *reason = "character maps to <undefined>";
5690    charmapencode_result x;
5691
5692    /* find all unencodable characters */
5693    while (collendpos < size) {
5694        PyObject *rep;
5695        if (Py_TYPE(mapping) == &EncodingMapType) {
5696            int res = encoding_map_lookup(p[collendpos], mapping);
5697            if (res != -1)
5698                break;
5699            ++collendpos;
5700            continue;
5701        }
5702
5703        rep = charmapencode_lookup(p[collendpos], mapping);
5704        if (rep==NULL)
5705            return -1;
5706        else if (rep!=Py_None) {
5707            Py_DECREF(rep);
5708            break;
5709        }
5710        Py_DECREF(rep);
5711        ++collendpos;
5712    }
5713    /* cache callback name lookup
5714     * (if not done yet, i.e. it's the first error) */
5715    if (*known_errorHandler==-1) {
5716        if ((errors==NULL) || (!strcmp(errors, "strict")))
5717            *known_errorHandler = 1;
5718        else if (!strcmp(errors, "replace"))
5719            *known_errorHandler = 2;
5720        else if (!strcmp(errors, "ignore"))
5721            *known_errorHandler = 3;
5722        else if (!strcmp(errors, "xmlcharrefreplace"))
5723            *known_errorHandler = 4;
5724        else
5725            *known_errorHandler = 0;
5726    }
5727    switch (*known_errorHandler) {
5728    case 1: /* strict */
5729        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5730        return -1;
5731    case 2: /* replace */
5732        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5733            x = charmapencode_output('?', mapping, res, respos);
5734            if (x==enc_EXCEPTION) {
5735                return -1;
5736            }
5737            else if (x==enc_FAILED) {
5738                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5739                return -1;
5740            }
5741        }
5742        /* fall through */
5743    case 3: /* ignore */
5744        *inpos = collendpos;
5745        break;
5746    case 4: /* xmlcharrefreplace */
5747        /* generate replacement (temporarily (mis)uses p) */
5748        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5749            char buffer[2+29+1+1];
5750            char *cp;
5751            sprintf(buffer, "&#%d;", (int)p[collpos]);
5752            for (cp = buffer; *cp; ++cp) {
5753                x = charmapencode_output(*cp, mapping, res, respos);
5754                if (x==enc_EXCEPTION)
5755                    return -1;
5756                else if (x==enc_FAILED) {
5757                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5758                    return -1;
5759                }
5760            }
5761        }
5762        *inpos = collendpos;
5763        break;
5764    default:
5765        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5766                                                      encoding, reason, p, size, exceptionObject,
5767                                                      collstartpos, collendpos, &newpos);
5768        if (repunicode == NULL)
5769            return -1;
5770        if (PyBytes_Check(repunicode)) {
5771            /* Directly copy bytes result to output. */
5772            Py_ssize_t outsize = PyBytes_Size(*res);
5773            Py_ssize_t requiredsize;
5774            repsize = PyBytes_Size(repunicode);
5775            requiredsize = *respos + repsize;
5776            if (requiredsize > outsize)
5777                /* Make room for all additional bytes. */
5778                if (charmapencode_resize(res, respos, requiredsize)) {
5779                    Py_DECREF(repunicode);
5780                    return -1;
5781                }
5782            memcpy(PyBytes_AsString(*res) + *respos,
5783                   PyBytes_AsString(repunicode),  repsize);
5784            *respos += repsize;
5785            *inpos = newpos;
5786            Py_DECREF(repunicode);
5787            break;
5788        }
5789        /* generate replacement  */
5790        repsize = PyUnicode_GET_SIZE(repunicode);
5791        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5792            x = charmapencode_output(*uni2, mapping, res, respos);
5793            if (x==enc_EXCEPTION) {
5794                return -1;
5795            }
5796            else if (x==enc_FAILED) {
5797                Py_DECREF(repunicode);
5798                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5799                return -1;
5800            }
5801        }
5802        *inpos = newpos;
5803        Py_DECREF(repunicode);
5804    }
5805    return 0;
5806}
5807
5808PyObject *PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5809                                  Py_ssize_t size,
5810                                  PyObject *mapping,
5811                                  const char *errors)
5812{
5813    /* output object */
5814    PyObject *res = NULL;
5815    /* current input position */
5816    Py_ssize_t inpos = 0;
5817    /* current output position */
5818    Py_ssize_t respos = 0;
5819    PyObject *errorHandler = NULL;
5820    PyObject *exc = NULL;
5821    /* the following variable is used for caching string comparisons
5822     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5823     * 3=ignore, 4=xmlcharrefreplace */
5824    int known_errorHandler = -1;
5825
5826    /* Default to Latin-1 */
5827    if (mapping == NULL)
5828        return PyUnicode_EncodeLatin1(p, size, errors);
5829
5830    /* allocate enough for a simple encoding without
5831       replacements, if we need more, we'll resize */
5832    res = PyBytes_FromStringAndSize(NULL, size);
5833    if (res == NULL)
5834        goto onError;
5835    if (size == 0)
5836        return res;
5837
5838    while (inpos<size) {
5839        /* try to encode it */
5840        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5841        if (x==enc_EXCEPTION) /* error */
5842            goto onError;
5843        if (x==enc_FAILED) { /* unencodable character */
5844            if (charmap_encoding_error(p, size, &inpos, mapping,
5845                                       &exc,
5846                                       &known_errorHandler, &errorHandler, errors,
5847                                       &res, &respos)) {
5848                goto onError;
5849            }
5850        }
5851        else
5852            /* done with this character => adjust input position */
5853            ++inpos;
5854    }
5855
5856    /* Resize if we allocated to much */
5857    if (respos<PyBytes_GET_SIZE(res))
5858        if (_PyBytes_Resize(&res, respos) < 0)
5859            goto onError;
5860
5861    Py_XDECREF(exc);
5862    Py_XDECREF(errorHandler);
5863    return res;
5864
5865  onError:
5866    Py_XDECREF(res);
5867    Py_XDECREF(exc);
5868    Py_XDECREF(errorHandler);
5869    return NULL;
5870}
5871
5872PyObject *PyUnicode_AsCharmapString(PyObject *unicode,
5873                                    PyObject *mapping)
5874{
5875    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5876        PyErr_BadArgument();
5877        return NULL;
5878    }
5879    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5880                                   PyUnicode_GET_SIZE(unicode),
5881                                   mapping,
5882                                   NULL);
5883}
5884
5885/* create or adjust a UnicodeTranslateError */
5886static void make_translate_exception(PyObject **exceptionObject,
5887                                     const Py_UNICODE *unicode, Py_ssize_t size,
5888                                     Py_ssize_t startpos, Py_ssize_t endpos,
5889                                     const char *reason)
5890{
5891    if (*exceptionObject == NULL) {
5892        *exceptionObject = PyUnicodeTranslateError_Create(
5893            unicode, size, startpos, endpos, reason);
5894    }
5895    else {
5896        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5897            goto onError;
5898        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5899            goto onError;
5900        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5901            goto onError;
5902        return;
5903      onError:
5904        Py_DECREF(*exceptionObject);
5905        *exceptionObject = NULL;
5906    }
5907}
5908
5909/* raises a UnicodeTranslateError */
5910static void raise_translate_exception(PyObject **exceptionObject,
5911                                      const Py_UNICODE *unicode, Py_ssize_t size,
5912                                      Py_ssize_t startpos, Py_ssize_t endpos,
5913                                      const char *reason)
5914{
5915    make_translate_exception(exceptionObject,
5916                             unicode, size, startpos, endpos, reason);
5917    if (*exceptionObject != NULL)
5918        PyCodec_StrictErrors(*exceptionObject);
5919}
5920
5921/* error handling callback helper:
5922   build arguments, call the callback and check the arguments,
5923   put the result into newpos and return the replacement string, which
5924   has to be freed by the caller */
5925static PyObject *unicode_translate_call_errorhandler(const char *errors,
5926                                                     PyObject **errorHandler,
5927                                                     const char *reason,
5928                                                     const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5929                                                     Py_ssize_t startpos, Py_ssize_t endpos,
5930                                                     Py_ssize_t *newpos)
5931{
5932    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
5933
5934    Py_ssize_t i_newpos;
5935    PyObject *restuple;
5936    PyObject *resunicode;
5937
5938    if (*errorHandler == NULL) {
5939        *errorHandler = PyCodec_LookupError(errors);
5940        if (*errorHandler == NULL)
5941            return NULL;
5942    }
5943
5944    make_translate_exception(exceptionObject,
5945                             unicode, size, startpos, endpos, reason);
5946    if (*exceptionObject == NULL)
5947        return NULL;
5948
5949    restuple = PyObject_CallFunctionObjArgs(
5950        *errorHandler, *exceptionObject, NULL);
5951    if (restuple == NULL)
5952        return NULL;
5953    if (!PyTuple_Check(restuple)) {
5954        PyErr_SetString(PyExc_TypeError, &argparse[4]);
5955        Py_DECREF(restuple);
5956        return NULL;
5957    }
5958    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
5959                          &resunicode, &i_newpos)) {
5960        Py_DECREF(restuple);
5961        return NULL;
5962    }
5963    if (i_newpos<0)
5964        *newpos = size+i_newpos;
5965    else
5966        *newpos = i_newpos;
5967    if (*newpos<0 || *newpos>size) {
5968        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
5969        Py_DECREF(restuple);
5970        return NULL;
5971    }
5972    Py_INCREF(resunicode);
5973    Py_DECREF(restuple);
5974    return resunicode;
5975}
5976
5977/* Lookup the character ch in the mapping and put the result in result,
5978   which must be decrefed by the caller.
5979   Return 0 on success, -1 on error */
5980static
5981int charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
5982{
5983    PyObject *w = PyLong_FromLong((long)c);
5984    PyObject *x;
5985
5986    if (w == NULL)
5987        return -1;
5988    x = PyObject_GetItem(mapping, w);
5989    Py_DECREF(w);
5990    if (x == NULL) {
5991        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5992            /* No mapping found means: use 1:1 mapping. */
5993            PyErr_Clear();
5994            *result = NULL;
5995            return 0;
5996        } else
5997            return -1;
5998    }
5999    else if (x == Py_None) {
6000        *result = x;
6001        return 0;
6002    }
6003    else if (PyLong_Check(x)) {
6004        long value = PyLong_AS_LONG(x);
6005        long max = PyUnicode_GetMax();
6006        if (value < 0 || value > max) {
6007            PyErr_Format(PyExc_TypeError,
6008                         "character mapping must be in range(0x%x)", max+1);
6009            Py_DECREF(x);
6010            return -1;
6011        }
6012        *result = x;
6013        return 0;
6014    }
6015    else if (PyUnicode_Check(x)) {
6016        *result = x;
6017        return 0;
6018    }
6019    else {
6020        /* wrong return value */
6021        PyErr_SetString(PyExc_TypeError,
6022                        "character mapping must return integer, None or str");
6023        Py_DECREF(x);
6024        return -1;
6025    }
6026}
6027/* ensure that *outobj is at least requiredsize characters long,
6028   if not reallocate and adjust various state variables.
6029   Return 0 on success, -1 on error */
6030static
6031int charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
6032                               Py_ssize_t requiredsize)
6033{
6034    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6035    if (requiredsize > oldsize) {
6036        /* remember old output position */
6037        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6038        /* exponentially overallocate to minimize reallocations */
6039        if (requiredsize < 2 * oldsize)
6040            requiredsize = 2 * oldsize;
6041        if (PyUnicode_Resize(outobj, requiredsize) < 0)
6042            return -1;
6043        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6044    }
6045    return 0;
6046}
6047/* lookup the character, put the result in the output string and adjust
6048   various state variables. Return a new reference to the object that
6049   was put in the output buffer in *result, or Py_None, if the mapping was
6050   undefined (in which case no character was written).
6051   The called must decref result.
6052   Return 0 on success, -1 on error. */
6053static
6054int charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6055                            Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6056                            PyObject **res)
6057{
6058    if (charmaptranslate_lookup(*curinp, mapping, res))
6059        return -1;
6060    if (*res==NULL) {
6061        /* not found => default to 1:1 mapping */
6062        *(*outp)++ = *curinp;
6063    }
6064    else if (*res==Py_None)
6065        ;
6066    else if (PyLong_Check(*res)) {
6067        /* no overflow check, because we know that the space is enough */
6068        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6069    }
6070    else if (PyUnicode_Check(*res)) {
6071        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6072        if (repsize==1) {
6073            /* no overflow check, because we know that the space is enough */
6074            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6075        }
6076        else if (repsize!=0) {
6077            /* more than one character */
6078            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6079                (insize - (curinp-startinp)) +
6080                repsize - 1;
6081            if (charmaptranslate_makespace(outobj, outp, requiredsize))
6082                return -1;
6083            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6084            *outp += repsize;
6085        }
6086    }
6087    else
6088        return -1;
6089    return 0;
6090}
6091
6092PyObject *PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6093                                     Py_ssize_t size,
6094                                     PyObject *mapping,
6095                                     const char *errors)
6096{
6097    /* output object */
6098    PyObject *res = NULL;
6099    /* pointers to the beginning and end+1 of input */
6100    const Py_UNICODE *startp = p;
6101    const Py_UNICODE *endp = p + size;
6102    /* pointer into the output */
6103    Py_UNICODE *str;
6104    /* current output position */
6105    Py_ssize_t respos = 0;
6106    char *reason = "character maps to <undefined>";
6107    PyObject *errorHandler = NULL;
6108    PyObject *exc = NULL;
6109    /* the following variable is used for caching string comparisons
6110     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6111     * 3=ignore, 4=xmlcharrefreplace */
6112    int known_errorHandler = -1;
6113
6114    if (mapping == NULL) {
6115        PyErr_BadArgument();
6116        return NULL;
6117    }
6118
6119    /* allocate enough for a simple 1:1 translation without
6120       replacements, if we need more, we'll resize */
6121    res = PyUnicode_FromUnicode(NULL, size);
6122    if (res == NULL)
6123        goto onError;
6124    if (size == 0)
6125        return res;
6126    str = PyUnicode_AS_UNICODE(res);
6127
6128    while (p<endp) {
6129        /* try to encode it */
6130        PyObject *x = NULL;
6131        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6132            Py_XDECREF(x);
6133            goto onError;
6134        }
6135        Py_XDECREF(x);
6136        if (x!=Py_None) /* it worked => adjust input pointer */
6137            ++p;
6138        else { /* untranslatable character */
6139            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6140            Py_ssize_t repsize;
6141            Py_ssize_t newpos;
6142            Py_UNICODE *uni2;
6143            /* startpos for collecting untranslatable chars */
6144            const Py_UNICODE *collstart = p;
6145            const Py_UNICODE *collend = p+1;
6146            const Py_UNICODE *coll;
6147
6148            /* find all untranslatable characters */
6149            while (collend < endp) {
6150                if (charmaptranslate_lookup(*collend, mapping, &x))
6151                    goto onError;
6152                Py_XDECREF(x);
6153                if (x!=Py_None)
6154                    break;
6155                ++collend;
6156            }
6157            /* cache callback name lookup
6158             * (if not done yet, i.e. it's the first error) */
6159            if (known_errorHandler==-1) {
6160                if ((errors==NULL) || (!strcmp(errors, "strict")))
6161                    known_errorHandler = 1;
6162                else if (!strcmp(errors, "replace"))
6163                    known_errorHandler = 2;
6164                else if (!strcmp(errors, "ignore"))
6165                    known_errorHandler = 3;
6166                else if (!strcmp(errors, "xmlcharrefreplace"))
6167                    known_errorHandler = 4;
6168                else
6169                    known_errorHandler = 0;
6170            }
6171            switch (known_errorHandler) {
6172            case 1: /* strict */
6173                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
6174                goto onError;
6175            case 2: /* replace */
6176                /* No need to check for space, this is a 1:1 replacement */
6177                for (coll = collstart; coll<collend; ++coll)
6178                    *str++ = '?';
6179                /* fall through */
6180            case 3: /* ignore */
6181                p = collend;
6182                break;
6183            case 4: /* xmlcharrefreplace */
6184                /* generate replacement (temporarily (mis)uses p) */
6185                for (p = collstart; p < collend; ++p) {
6186                    char buffer[2+29+1+1];
6187                    char *cp;
6188                    sprintf(buffer, "&#%d;", (int)*p);
6189                    if (charmaptranslate_makespace(&res, &str,
6190                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6191                        goto onError;
6192                    for (cp = buffer; *cp; ++cp)
6193                        *str++ = *cp;
6194                }
6195                p = collend;
6196                break;
6197            default:
6198                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6199                                                                 reason, startp, size, &exc,
6200                                                                 collstart-startp, collend-startp, &newpos);
6201                if (repunicode == NULL)
6202                    goto onError;
6203                /* generate replacement  */
6204                repsize = PyUnicode_GET_SIZE(repunicode);
6205                if (charmaptranslate_makespace(&res, &str,
6206                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6207                    Py_DECREF(repunicode);
6208                    goto onError;
6209                }
6210                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6211                    *str++ = *uni2;
6212                p = startp + newpos;
6213                Py_DECREF(repunicode);
6214            }
6215        }
6216    }
6217    /* Resize if we allocated to much */
6218    respos = str-PyUnicode_AS_UNICODE(res);
6219    if (respos<PyUnicode_GET_SIZE(res)) {
6220        if (PyUnicode_Resize(&res, respos) < 0)
6221            goto onError;
6222    }
6223    Py_XDECREF(exc);
6224    Py_XDECREF(errorHandler);
6225    return res;
6226
6227  onError:
6228    Py_XDECREF(res);
6229    Py_XDECREF(exc);
6230    Py_XDECREF(errorHandler);
6231    return NULL;
6232}
6233
6234PyObject *PyUnicode_Translate(PyObject *str,
6235                              PyObject *mapping,
6236                              const char *errors)
6237{
6238    PyObject *result;
6239
6240    str = PyUnicode_FromObject(str);
6241    if (str == NULL)
6242        goto onError;
6243    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6244                                        PyUnicode_GET_SIZE(str),
6245                                        mapping,
6246                                        errors);
6247    Py_DECREF(str);
6248    return result;
6249
6250  onError:
6251    Py_XDECREF(str);
6252    return NULL;
6253}
6254
6255PyObject *
6256PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6257                                  Py_ssize_t length)
6258{
6259    PyObject *result;
6260    Py_UNICODE *p; /* write pointer into result */
6261    Py_ssize_t i;
6262    /* Copy to a new string */
6263    result = (PyObject *)_PyUnicode_New(length);
6264    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6265    if (result == NULL)
6266        return result;
6267    p = PyUnicode_AS_UNICODE(result);
6268    /* Iterate over code points */
6269    for (i = 0; i < length; i++) {
6270        Py_UNICODE ch =s[i];
6271        if (ch > 127) {
6272            int decimal = Py_UNICODE_TODECIMAL(ch);
6273            if (decimal >= 0)
6274                p[i] = '0' + decimal;
6275        }
6276    }
6277    return result;
6278}
6279/* --- Decimal Encoder ---------------------------------------------------- */
6280
6281int PyUnicode_EncodeDecimal(Py_UNICODE *s,
6282                            Py_ssize_t length,
6283                            char *output,
6284                            const char *errors)
6285{
6286    Py_UNICODE *p, *end;
6287    PyObject *errorHandler = NULL;
6288    PyObject *exc = NULL;
6289    const char *encoding = "decimal";
6290    const char *reason = "invalid decimal Unicode string";
6291    /* the following variable is used for caching string comparisons
6292     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6293    int known_errorHandler = -1;
6294
6295    if (output == NULL) {
6296        PyErr_BadArgument();
6297        return -1;
6298    }
6299
6300    p = s;
6301    end = s + length;
6302    while (p < end) {
6303        register Py_UNICODE ch = *p;
6304        int decimal;
6305        PyObject *repunicode;
6306        Py_ssize_t repsize;
6307        Py_ssize_t newpos;
6308        Py_UNICODE *uni2;
6309        Py_UNICODE *collstart;
6310        Py_UNICODE *collend;
6311
6312        if (Py_UNICODE_ISSPACE(ch)) {
6313            *output++ = ' ';
6314            ++p;
6315            continue;
6316        }
6317        decimal = Py_UNICODE_TODECIMAL(ch);
6318        if (decimal >= 0) {
6319            *output++ = '0' + decimal;
6320            ++p;
6321            continue;
6322        }
6323        if (0 < ch && ch < 256) {
6324            *output++ = (char)ch;
6325            ++p;
6326            continue;
6327        }
6328        /* All other characters are considered unencodable */
6329        collstart = p;
6330        collend = p+1;
6331        while (collend < end) {
6332            if ((0 < *collend && *collend < 256) ||
6333                !Py_UNICODE_ISSPACE(*collend) ||
6334                Py_UNICODE_TODECIMAL(*collend))
6335                break;
6336        }
6337        /* cache callback name lookup
6338         * (if not done yet, i.e. it's the first error) */
6339        if (known_errorHandler==-1) {
6340            if ((errors==NULL) || (!strcmp(errors, "strict")))
6341                known_errorHandler = 1;
6342            else if (!strcmp(errors, "replace"))
6343                known_errorHandler = 2;
6344            else if (!strcmp(errors, "ignore"))
6345                known_errorHandler = 3;
6346            else if (!strcmp(errors, "xmlcharrefreplace"))
6347                known_errorHandler = 4;
6348            else
6349                known_errorHandler = 0;
6350        }
6351        switch (known_errorHandler) {
6352        case 1: /* strict */
6353            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6354            goto onError;
6355        case 2: /* replace */
6356            for (p = collstart; p < collend; ++p)
6357                *output++ = '?';
6358            /* fall through */
6359        case 3: /* ignore */
6360            p = collend;
6361            break;
6362        case 4: /* xmlcharrefreplace */
6363            /* generate replacement (temporarily (mis)uses p) */
6364            for (p = collstart; p < collend; ++p)
6365                output += sprintf(output, "&#%d;", (int)*p);
6366            p = collend;
6367            break;
6368        default:
6369            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6370                                                          encoding, reason, s, length, &exc,
6371                                                          collstart-s, collend-s, &newpos);
6372            if (repunicode == NULL)
6373                goto onError;
6374            if (!PyUnicode_Check(repunicode)) {
6375                /* Byte results not supported, since they have no decimal property. */
6376                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6377                Py_DECREF(repunicode);
6378                goto onError;
6379            }
6380            /* generate replacement  */
6381            repsize = PyUnicode_GET_SIZE(repunicode);
6382            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6383                Py_UNICODE ch = *uni2;
6384                if (Py_UNICODE_ISSPACE(ch))
6385                    *output++ = ' ';
6386                else {
6387                    decimal = Py_UNICODE_TODECIMAL(ch);
6388                    if (decimal >= 0)
6389                        *output++ = '0' + decimal;
6390                    else if (0 < ch && ch < 256)
6391                        *output++ = (char)ch;
6392                    else {
6393                        Py_DECREF(repunicode);
6394                        raise_encode_exception(&exc, encoding,
6395                                               s, length, collstart-s, collend-s, reason);
6396                        goto onError;
6397                    }
6398                }
6399            }
6400            p = s + newpos;
6401            Py_DECREF(repunicode);
6402        }
6403    }
6404    /* 0-terminate the output string */
6405    *output++ = '\0';
6406    Py_XDECREF(exc);
6407    Py_XDECREF(errorHandler);
6408    return 0;
6409
6410  onError:
6411    Py_XDECREF(exc);
6412    Py_XDECREF(errorHandler);
6413    return -1;
6414}
6415
6416/* --- Helpers ------------------------------------------------------------ */
6417
6418#include "stringlib/unicodedefs.h"
6419#include "stringlib/fastsearch.h"
6420
6421#include "stringlib/count.h"
6422#include "stringlib/find.h"
6423#include "stringlib/partition.h"
6424#include "stringlib/split.h"
6425
6426#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6427#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6428#include "stringlib/localeutil.h"
6429
/* Helper macro to fix up start/end slice values against a length `len`:
     - `end` greater than `len` is clamped to `len`;
     - a negative `end` has `len` added and is then floored at 0;
     - a negative `start` has `len` added and is then floored at 0.
   `start` is not clamped against `len` on the upper side.

   `start` and `end` are assigned to, so they must be modifiable lvalues;
   all three arguments are expanded more than once, so they must be free
   of side effects.  The expansion is a bare pair of `if` statements (it
   is not wrapped in do { } while (0)), so use it only as a full
   statement inside a braced block. */
#define ADJUST_INDICES(start, end, len)         \
    if (end > len)                              \
        end = len;                              \
    else if (end < 0) {                         \
        end += len;                             \
        if (end < 0)                            \
            end = 0;                            \
    }                                           \
    if (start < 0) {                            \
        start += len;                           \
        if (start < 0)                          \
            start = 0;                          \
    }
6444
6445Py_ssize_t PyUnicode_Count(PyObject *str,
6446                           PyObject *substr,
6447                           Py_ssize_t start,
6448                           Py_ssize_t end)
6449{
6450    Py_ssize_t result;
6451    PyUnicodeObject* str_obj;
6452    PyUnicodeObject* sub_obj;
6453
6454    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6455    if (!str_obj)
6456        return -1;
6457    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6458    if (!sub_obj) {
6459        Py_DECREF(str_obj);
6460        return -1;
6461    }
6462
6463    ADJUST_INDICES(start, end, str_obj->length);
6464    result = stringlib_count(
6465        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6466        PY_SSIZE_T_MAX
6467        );
6468
6469    Py_DECREF(sub_obj);
6470    Py_DECREF(str_obj);
6471
6472    return result;
6473}
6474
6475Py_ssize_t PyUnicode_Find(PyObject *str,
6476                          PyObject *sub,
6477                          Py_ssize_t start,
6478                          Py_ssize_t end,
6479                          int direction)
6480{
6481    Py_ssize_t result;
6482
6483    str = PyUnicode_FromObject(str);
6484    if (!str)
6485        return -2;
6486    sub = PyUnicode_FromObject(sub);
6487    if (!sub) {
6488        Py_DECREF(str);
6489        return -2;
6490    }
6491
6492    if (direction > 0)
6493        result = stringlib_find_slice(
6494            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6495            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6496            start, end
6497            );
6498    else
6499        result = stringlib_rfind_slice(
6500            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6501            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6502            start, end
6503            );
6504
6505    Py_DECREF(str);
6506    Py_DECREF(sub);
6507
6508    return result;
6509}
6510
6511static
6512int tailmatch(PyUnicodeObject *self,
6513              PyUnicodeObject *substring,
6514              Py_ssize_t start,
6515              Py_ssize_t end,
6516              int direction)
6517{
6518    if (substring->length == 0)
6519        return 1;
6520
6521    ADJUST_INDICES(start, end, self->length);
6522    end -= substring->length;
6523    if (end < start)
6524        return 0;
6525
6526    if (direction > 0) {
6527        if (Py_UNICODE_MATCH(self, end, substring))
6528            return 1;
6529    } else {
6530        if (Py_UNICODE_MATCH(self, start, substring))
6531            return 1;
6532    }
6533
6534    return 0;
6535}
6536
6537Py_ssize_t PyUnicode_Tailmatch(PyObject *str,
6538                               PyObject *substr,
6539                               Py_ssize_t start,
6540                               Py_ssize_t end,
6541                               int direction)
6542{
6543    Py_ssize_t result;
6544
6545    str = PyUnicode_FromObject(str);
6546    if (str == NULL)
6547        return -1;
6548    substr = PyUnicode_FromObject(substr);
6549    if (substr == NULL) {
6550        Py_DECREF(str);
6551        return -1;
6552    }
6553
6554    result = tailmatch((PyUnicodeObject *)str,
6555                       (PyUnicodeObject *)substr,
6556                       start, end, direction);
6557    Py_DECREF(str);
6558    Py_DECREF(substr);
6559    return result;
6560}
6561
6562/* Apply fixfct filter to the Unicode object self and return a
6563   reference to the modified object */
6564
6565static
6566PyObject *fixup(PyUnicodeObject *self,
6567                int (*fixfct)(PyUnicodeObject *s))
6568{
6569
6570    PyUnicodeObject *u;
6571
6572    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6573    if (u == NULL)
6574        return NULL;
6575
6576    Py_UNICODE_COPY(u->str, self->str, self->length);
6577
6578    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6579        /* fixfct should return TRUE if it modified the buffer. If
6580           FALSE, return a reference to the original buffer instead
6581           (to save space, not time) */
6582        Py_INCREF(self);
6583        Py_DECREF(u);
6584        return (PyObject*) self;
6585    }
6586    return (PyObject*) u;
6587}
6588
6589static
6590int fixupper(PyUnicodeObject *self)
6591{
6592    Py_ssize_t len = self->length;
6593    Py_UNICODE *s = self->str;
6594    int status = 0;
6595
6596    while (len-- > 0) {
6597        register Py_UNICODE ch;
6598
6599        ch = Py_UNICODE_TOUPPER(*s);
6600        if (ch != *s) {
6601            status = 1;
6602            *s = ch;
6603        }
6604        s++;
6605    }
6606
6607    return status;
6608}
6609
6610static
6611int fixlower(PyUnicodeObject *self)
6612{
6613    Py_ssize_t len = self->length;
6614    Py_UNICODE *s = self->str;
6615    int status = 0;
6616
6617    while (len-- > 0) {
6618        register Py_UNICODE ch;
6619
6620        ch = Py_UNICODE_TOLOWER(*s);
6621        if (ch != *s) {
6622            status = 1;
6623            *s = ch;
6624        }
6625        s++;
6626    }
6627
6628    return status;
6629}
6630
6631static
6632int fixswapcase(PyUnicodeObject *self)
6633{
6634    Py_ssize_t len = self->length;
6635    Py_UNICODE *s = self->str;
6636    int status = 0;
6637
6638    while (len-- > 0) {
6639        if (Py_UNICODE_ISUPPER(*s)) {
6640            *s = Py_UNICODE_TOLOWER(*s);
6641            status = 1;
6642        } else if (Py_UNICODE_ISLOWER(*s)) {
6643            *s = Py_UNICODE_TOUPPER(*s);
6644            status = 1;
6645        }
6646        s++;
6647    }
6648
6649    return status;
6650}
6651
6652static
6653int fixcapitalize(PyUnicodeObject *self)
6654{
6655    Py_ssize_t len = self->length;
6656    Py_UNICODE *s = self->str;
6657    int status = 0;
6658
6659    if (len == 0)
6660        return 0;
6661    if (Py_UNICODE_ISLOWER(*s)) {
6662        *s = Py_UNICODE_TOUPPER(*s);
6663        status = 1;
6664    }
6665    s++;
6666    while (--len > 0) {
6667        if (Py_UNICODE_ISUPPER(*s)) {
6668            *s = Py_UNICODE_TOLOWER(*s);
6669            status = 1;
6670        }
6671        s++;
6672    }
6673    return status;
6674}
6675
6676static
6677int fixtitle(PyUnicodeObject *self)
6678{
6679    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6680    register Py_UNICODE *e;
6681    int previous_is_cased;
6682
6683    /* Shortcut for single character strings */
6684    if (PyUnicode_GET_SIZE(self) == 1) {
6685        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6686        if (*p != ch) {
6687            *p = ch;
6688            return 1;
6689        }
6690        else
6691            return 0;
6692    }
6693
6694    e = p + PyUnicode_GET_SIZE(self);
6695    previous_is_cased = 0;
6696    for (; p < e; p++) {
6697        register const Py_UNICODE ch = *p;
6698
6699        if (previous_is_cased)
6700            *p = Py_UNICODE_TOLOWER(ch);
6701        else
6702            *p = Py_UNICODE_TOTITLE(ch);
6703
6704        if (Py_UNICODE_ISLOWER(ch) ||
6705            Py_UNICODE_ISUPPER(ch) ||
6706            Py_UNICODE_ISTITLE(ch))
6707            previous_is_cased = 1;
6708        else
6709            previous_is_cased = 0;
6710    }
6711    return 1;
6712}
6713
6714PyObject *
6715PyUnicode_Join(PyObject *separator, PyObject *seq)
6716{
6717    const Py_UNICODE blank = ' ';
6718    const Py_UNICODE *sep = &blank;
6719    Py_ssize_t seplen = 1;
6720    PyUnicodeObject *res = NULL; /* the result */
6721    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6722    PyObject *fseq;          /* PySequence_Fast(seq) */
6723    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6724    PyObject **items;
6725    PyObject *item;
6726    Py_ssize_t sz, i;
6727
6728    fseq = PySequence_Fast(seq, "");
6729    if (fseq == NULL) {
6730        return NULL;
6731    }
6732
6733    /* NOTE: the following code can't call back into Python code,
6734     * so we are sure that fseq won't be mutated.
6735     */
6736
6737    seqlen = PySequence_Fast_GET_SIZE(fseq);
6738    /* If empty sequence, return u"". */
6739    if (seqlen == 0) {
6740        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6741        goto Done;
6742    }
6743    items = PySequence_Fast_ITEMS(fseq);
6744    /* If singleton sequence with an exact Unicode, return that. */
6745    if (seqlen == 1) {
6746        item = items[0];
6747        if (PyUnicode_CheckExact(item)) {
6748            Py_INCREF(item);
6749            res = (PyUnicodeObject *)item;
6750            goto Done;
6751        }
6752    }
6753    else {
6754        /* Set up sep and seplen */
6755        if (separator == NULL) {
6756            sep = &blank;
6757            seplen = 1;
6758        }
6759        else {
6760            if (!PyUnicode_Check(separator)) {
6761                PyErr_Format(PyExc_TypeError,
6762                             "separator: expected str instance,"
6763                             " %.80s found",
6764                             Py_TYPE(separator)->tp_name);
6765                goto onError;
6766            }
6767            sep = PyUnicode_AS_UNICODE(separator);
6768            seplen = PyUnicode_GET_SIZE(separator);
6769        }
6770    }
6771
6772    /* There are at least two things to join, or else we have a subclass
6773     * of str in the sequence.
6774     * Do a pre-pass to figure out the total amount of space we'll
6775     * need (sz), and see whether all argument are strings.
6776     */
6777    sz = 0;
6778    for (i = 0; i < seqlen; i++) {
6779        const Py_ssize_t old_sz = sz;
6780        item = items[i];
6781        if (!PyUnicode_Check(item)) {
6782            PyErr_Format(PyExc_TypeError,
6783                         "sequence item %zd: expected str instance,"
6784                         " %.80s found",
6785                         i, Py_TYPE(item)->tp_name);
6786            goto onError;
6787        }
6788        sz += PyUnicode_GET_SIZE(item);
6789        if (i != 0)
6790            sz += seplen;
6791        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6792            PyErr_SetString(PyExc_OverflowError,
6793                            "join() result is too long for a Python string");
6794            goto onError;
6795        }
6796    }
6797
6798    res = _PyUnicode_New(sz);
6799    if (res == NULL)
6800        goto onError;
6801
6802    /* Catenate everything. */
6803    res_p = PyUnicode_AS_UNICODE(res);
6804    for (i = 0; i < seqlen; ++i) {
6805        Py_ssize_t itemlen;
6806        item = items[i];
6807        itemlen = PyUnicode_GET_SIZE(item);
6808        /* Copy item, and maybe the separator. */
6809        if (i) {
6810            Py_UNICODE_COPY(res_p, sep, seplen);
6811            res_p += seplen;
6812        }
6813        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6814        res_p += itemlen;
6815    }
6816
6817  Done:
6818    Py_DECREF(fseq);
6819    return (PyObject *)res;
6820
6821  onError:
6822    Py_DECREF(fseq);
6823    Py_XDECREF(res);
6824    return NULL;
6825}
6826
6827static
6828PyUnicodeObject *pad(PyUnicodeObject *self,
6829                     Py_ssize_t left,
6830                     Py_ssize_t right,
6831                     Py_UNICODE fill)
6832{
6833    PyUnicodeObject *u;
6834
6835    if (left < 0)
6836        left = 0;
6837    if (right < 0)
6838        right = 0;
6839
6840    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6841        Py_INCREF(self);
6842        return self;
6843    }
6844
6845    if (left > PY_SSIZE_T_MAX - self->length ||
6846        right > PY_SSIZE_T_MAX - (left + self->length)) {
6847        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6848        return NULL;
6849    }
6850    u = _PyUnicode_New(left + self->length + right);
6851    if (u) {
6852        if (left)
6853            Py_UNICODE_FILL(u->str, fill, left);
6854        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6855        if (right)
6856            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6857    }
6858
6859    return u;
6860}
6861
6862PyObject *PyUnicode_Splitlines(PyObject *string, int keepends)
6863{
6864    PyObject *list;
6865
6866    string = PyUnicode_FromObject(string);
6867    if (string == NULL)
6868        return NULL;
6869
6870    list = stringlib_splitlines(
6871        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6872        PyUnicode_GET_SIZE(string), keepends);
6873
6874    Py_DECREF(string);
6875    return list;
6876}
6877
6878static
6879PyObject *split(PyUnicodeObject *self,
6880                PyUnicodeObject *substring,
6881                Py_ssize_t maxcount)
6882{
6883    if (maxcount < 0)
6884        maxcount = PY_SSIZE_T_MAX;
6885
6886    if (substring == NULL)
6887        return stringlib_split_whitespace(
6888            (PyObject*) self,  self->str, self->length, maxcount
6889            );
6890
6891    return stringlib_split(
6892        (PyObject*) self,  self->str, self->length,
6893        substring->str, substring->length,
6894        maxcount
6895        );
6896}
6897
6898static
6899PyObject *rsplit(PyUnicodeObject *self,
6900                 PyUnicodeObject *substring,
6901                 Py_ssize_t maxcount)
6902{
6903    if (maxcount < 0)
6904        maxcount = PY_SSIZE_T_MAX;
6905
6906    if (substring == NULL)
6907        return stringlib_rsplit_whitespace(
6908            (PyObject*) self,  self->str, self->length, maxcount
6909            );
6910
6911    return stringlib_rsplit(
6912        (PyObject*) self,  self->str, self->length,
6913        substring->str, substring->length,
6914        maxcount
6915        );
6916}
6917
6918static
6919PyObject *replace(PyUnicodeObject *self,
6920                  PyUnicodeObject *str1,
6921                  PyUnicodeObject *str2,
6922                  Py_ssize_t maxcount)
6923{
6924    PyUnicodeObject *u;
6925
6926    if (maxcount < 0)
6927        maxcount = PY_SSIZE_T_MAX;
6928    else if (maxcount == 0 || self->length == 0)
6929        goto nothing;
6930
6931    if (str1->length == str2->length) {
6932        Py_ssize_t i;
6933        /* same length */
6934        if (str1->length == 0)
6935            goto nothing;
6936        if (str1->length == 1) {
6937            /* replace characters */
6938            Py_UNICODE u1, u2;
6939            if (!findchar(self->str, self->length, str1->str[0]))
6940                goto nothing;
6941            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6942            if (!u)
6943                return NULL;
6944            Py_UNICODE_COPY(u->str, self->str, self->length);
6945            u1 = str1->str[0];
6946            u2 = str2->str[0];
6947            for (i = 0; i < u->length; i++)
6948                if (u->str[i] == u1) {
6949                    if (--maxcount < 0)
6950                        break;
6951                    u->str[i] = u2;
6952                }
6953        } else {
6954            i = stringlib_find(
6955                self->str, self->length, str1->str, str1->length, 0
6956                );
6957            if (i < 0)
6958                goto nothing;
6959            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6960            if (!u)
6961                return NULL;
6962            Py_UNICODE_COPY(u->str, self->str, self->length);
6963
6964            /* change everything in-place, starting with this one */
6965            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6966            i += str1->length;
6967
6968            while ( --maxcount > 0) {
6969                i = stringlib_find(self->str+i, self->length-i,
6970                                   str1->str, str1->length,
6971                                   i);
6972                if (i == -1)
6973                    break;
6974                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
6975                i += str1->length;
6976            }
6977        }
6978    } else {
6979
6980        Py_ssize_t n, i, j, e;
6981        Py_ssize_t product, new_size, delta;
6982        Py_UNICODE *p;
6983
6984        /* replace strings */
6985        n = stringlib_count(self->str, self->length, str1->str, str1->length,
6986                            maxcount);
6987        if (n == 0)
6988            goto nothing;
6989        /* new_size = self->length + n * (str2->length - str1->length)); */
6990        delta = (str2->length - str1->length);
6991        if (delta == 0) {
6992            new_size = self->length;
6993        } else {
6994            product = n * (str2->length - str1->length);
6995            if ((product / (str2->length - str1->length)) != n) {
6996                PyErr_SetString(PyExc_OverflowError,
6997                                "replace string is too long");
6998                return NULL;
6999            }
7000            new_size = self->length + product;
7001            if (new_size < 0) {
7002                PyErr_SetString(PyExc_OverflowError,
7003                                "replace string is too long");
7004                return NULL;
7005            }
7006        }
7007        u = _PyUnicode_New(new_size);
7008        if (!u)
7009            return NULL;
7010        i = 0;
7011        p = u->str;
7012        e = self->length - str1->length;
7013        if (str1->length > 0) {
7014            while (n-- > 0) {
7015                /* look for next match */
7016                j = stringlib_find(self->str+i, self->length-i,
7017                                   str1->str, str1->length,
7018                                   i);
7019                if (j == -1)
7020                    break;
7021                else if (j > i) {
7022                    /* copy unchanged part [i:j] */
7023                    Py_UNICODE_COPY(p, self->str+i, j-i);
7024                    p += j - i;
7025                }
7026                /* copy substitution string */
7027                if (str2->length > 0) {
7028                    Py_UNICODE_COPY(p, str2->str, str2->length);
7029                    p += str2->length;
7030                }
7031                i = j + str1->length;
7032            }
7033            if (i < self->length)
7034                /* copy tail [i:] */
7035                Py_UNICODE_COPY(p, self->str+i, self->length-i);
7036        } else {
7037            /* interleave */
7038            while (n > 0) {
7039                Py_UNICODE_COPY(p, str2->str, str2->length);
7040                p += str2->length;
7041                if (--n <= 0)
7042                    break;
7043                *p++ = self->str[i++];
7044            }
7045            Py_UNICODE_COPY(p, self->str+i, self->length-i);
7046        }
7047    }
7048    return (PyObject *) u;
7049
7050  nothing:
7051    /* nothing to replace; return original string (when possible) */
7052    if (PyUnicode_CheckExact(self)) {
7053        Py_INCREF(self);
7054        return (PyObject *) self;
7055    }
7056    return PyUnicode_FromUnicode(self->str, self->length);
7057}
7058
7059/* --- Unicode Object Methods --------------------------------------------- */
7060
7061PyDoc_STRVAR(title__doc__,
7062             "S.title() -> str\n\
7063\n\
7064Return a titlecased version of S, i.e. words start with title case\n\
7065characters, all remaining cased characters have lower case.");
7066
7067static PyObject*
7068unicode_title(PyUnicodeObject *self)
7069{
7070    return fixup(self, fixtitle);
7071}
7072
7073PyDoc_STRVAR(capitalize__doc__,
7074             "S.capitalize() -> str\n\
7075\n\
7076Return a capitalized version of S, i.e. make the first character\n\
7077have upper case and the rest lower case.");
7078
7079static PyObject*
7080unicode_capitalize(PyUnicodeObject *self)
7081{
7082    return fixup(self, fixcapitalize);
7083}
7084
#if 0
/* NOTE: this whole section is compiled out by the enclosing "#if 0". */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

/* Split self on whitespace, capitalize each word with
   fixup(..., fixcapitalize) and join the words again with
   PyUnicode_Join(NULL, list).  Returns NULL if the split or any
   capitalization fails. */
static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *list;
    PyObject *item;
    Py_ssize_t i;

    /* Split into words */
    list = split(self, NULL, -1);
    if (!list)
        return NULL;

    /* Capitalize each word */
    for (i = 0; i < PyList_GET_SIZE(list); i++) {
        item = fixup((PyUnicodeObject *)PyList_GET_ITEM(list, i),
                     fixcapitalize);
        if (item == NULL)
            goto onError;
        /* drop the old word before storing its replacement in the slot */
        Py_DECREF(PyList_GET_ITEM(list, i));
        PyList_SET_ITEM(list, i, item);
    }

    /* Join the words to form a new string */
    item = PyUnicode_Join(NULL, list);

    /* Shared exit: on the error path item is NULL here, so NULL is
       returned after the list has been released. */
  onError:
    Py_DECREF(list);
    return (PyObject *)item;
}
#endif
7122
7123/* Argument converter.  Coerces to a single unicode character */
7124
7125static int
7126convert_uc(PyObject *obj, void *addr)
7127{
7128    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7129    PyObject *uniobj;
7130    Py_UNICODE *unistr;
7131
7132    uniobj = PyUnicode_FromObject(obj);
7133    if (uniobj == NULL) {
7134        PyErr_SetString(PyExc_TypeError,
7135                        "The fill character cannot be converted to Unicode");
7136        return 0;
7137    }
7138    if (PyUnicode_GET_SIZE(uniobj) != 1) {
7139        PyErr_SetString(PyExc_TypeError,
7140                        "The fill character must be exactly one character long");
7141        Py_DECREF(uniobj);
7142        return 0;
7143    }
7144    unistr = PyUnicode_AS_UNICODE(uniobj);
7145    *fillcharloc = unistr[0];
7146    Py_DECREF(uniobj);
7147    return 1;
7148}
7149
7150PyDoc_STRVAR(center__doc__,
7151             "S.center(width[, fillchar]) -> str\n\
7152\n\
7153Return S centered in a string of length width. Padding is\n\
7154done using the specified fill character (default is a space)");
7155
7156static PyObject *
7157unicode_center(PyUnicodeObject *self, PyObject *args)
7158{
7159    Py_ssize_t marg, left;
7160    Py_ssize_t width;
7161    Py_UNICODE fillchar = ' ';
7162
7163    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7164        return NULL;
7165
7166    if (self->length >= width && PyUnicode_CheckExact(self)) {
7167        Py_INCREF(self);
7168        return (PyObject*) self;
7169    }
7170
7171    marg = width - self->length;
7172    left = marg / 2 + (marg & width & 1);
7173
7174    return (PyObject*) pad(self, left, marg - left, fillchar);
7175}
7176
7177#if 0
7178
7179/* This code should go into some future Unicode collation support
7180   module. The basic comparison should compare ordinals on a naive
7181   basis (this is what Java does and thus Jython too). */
7182
7183/* speedy UTF-16 code point order comparison */
7184/* gleaned from: */
7185/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7186
/* Per-block adjustment table, indexed by the top five bits of a code
   unit (c >> 11).  Only the last five entries are non-zero.
   NOTE: this table and the function below sit in the "#if 0" branch and
   are therefore not compiled. */
static short utf16Fixup[32] =
{
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
};

/* Disabled variant of unicode_compare(): compares the two strings code
   unit by code unit after adjusting large values through utf16Fixup.
   Returns -1, 0 or 1. */
static int
unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
{
    Py_ssize_t len1, len2;

    Py_UNICODE *s1 = str1->str;
    Py_UNICODE *s2 = str2->str;

    len1 = str1->length;
    len2 = str2->length;

    while (len1 > 0 && len2 > 0) {
        Py_UNICODE c1, c2;

        c1 = *s1++;
        c2 = *s2++;

        /* values above (1<<11) * 26 get the table adjustment applied */
        if (c1 > (1<<11) * 26)
            c1 += utf16Fixup[c1>>11];
        if (c2 > (1<<11) * 26)
            c2 += utf16Fixup[c2>>11];
        /* now c1 and c2 are in UTF-32-compatible order */

        if (c1 != c2)
            return (c1 < c2) ? -1 : 1;

        len1--; len2--;
    }

    /* common prefix is equal: the shorter string sorts first */
    return (len1 < len2) ? -1 : (len1 != len2);
}
7226
7227#else
7228
7229static int
7230unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7231{
7232    register Py_ssize_t len1, len2;
7233
7234    Py_UNICODE *s1 = str1->str;
7235    Py_UNICODE *s2 = str2->str;
7236
7237    len1 = str1->length;
7238    len2 = str2->length;
7239
7240    while (len1 > 0 && len2 > 0) {
7241        Py_UNICODE c1, c2;
7242
7243        c1 = *s1++;
7244        c2 = *s2++;
7245
7246        if (c1 != c2)
7247            return (c1 < c2) ? -1 : 1;
7248
7249        len1--; len2--;
7250    }
7251
7252    return (len1 < len2) ? -1 : (len1 != len2);
7253}
7254
7255#endif
7256
7257int PyUnicode_Compare(PyObject *left,
7258                      PyObject *right)
7259{
7260    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7261        return unicode_compare((PyUnicodeObject *)left,
7262                               (PyUnicodeObject *)right);
7263    PyErr_Format(PyExc_TypeError,
7264                 "Can't compare %.100s and %.100s",
7265                 left->ob_type->tp_name,
7266                 right->ob_type->tp_name);
7267    return -1;
7268}
7269
7270int
7271PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7272{
7273    int i;
7274    Py_UNICODE *id;
7275    assert(PyUnicode_Check(uni));
7276    id = PyUnicode_AS_UNICODE(uni);
7277    /* Compare Unicode string and source character set string */
7278    for (i = 0; id[i] && str[i]; i++)
7279        if (id[i] != str[i])
7280            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7281    /* This check keeps Python strings that end in '\0' from comparing equal
7282     to C strings identical up to that point. */
7283    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7284        return 1; /* uni is longer */
7285    if (str[i])
7286        return -1; /* str is longer */
7287    return 0;
7288}
7289
7290
7291#define TEST_COND(cond)                         \
7292    ((cond) ? Py_True : Py_False)
7293
7294PyObject *PyUnicode_RichCompare(PyObject *left,
7295                                PyObject *right,
7296                                int op)
7297{
7298    int result;
7299
7300    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7301        PyObject *v;
7302        if (((PyUnicodeObject *) left)->length !=
7303            ((PyUnicodeObject *) right)->length) {
7304            if (op == Py_EQ) {
7305                Py_INCREF(Py_False);
7306                return Py_False;
7307            }
7308            if (op == Py_NE) {
7309                Py_INCREF(Py_True);
7310                return Py_True;
7311            }
7312        }
7313        if (left == right)
7314            result = 0;
7315        else
7316            result = unicode_compare((PyUnicodeObject *)left,
7317                                     (PyUnicodeObject *)right);
7318
7319        /* Convert the return value to a Boolean */
7320        switch (op) {
7321        case Py_EQ:
7322            v = TEST_COND(result == 0);
7323            break;
7324        case Py_NE:
7325            v = TEST_COND(result != 0);
7326            break;
7327        case Py_LE:
7328            v = TEST_COND(result <= 0);
7329            break;
7330        case Py_GE:
7331            v = TEST_COND(result >= 0);
7332            break;
7333        case Py_LT:
7334            v = TEST_COND(result == -1);
7335            break;
7336        case Py_GT:
7337            v = TEST_COND(result == 1);
7338            break;
7339        default:
7340            PyErr_BadArgument();
7341            return NULL;
7342        }
7343        Py_INCREF(v);
7344        return v;
7345    }
7346
7347    Py_INCREF(Py_NotImplemented);
7348    return Py_NotImplemented;
7349}
7350
7351int PyUnicode_Contains(PyObject *container,
7352                       PyObject *element)
7353{
7354    PyObject *str, *sub;
7355    int result;
7356
7357    /* Coerce the two arguments */
7358    sub = PyUnicode_FromObject(element);
7359    if (!sub) {
7360        PyErr_Format(PyExc_TypeError,
7361                     "'in <string>' requires string as left operand, not %s",
7362                     element->ob_type->tp_name);
7363        return -1;
7364    }
7365
7366    str = PyUnicode_FromObject(container);
7367    if (!str) {
7368        Py_DECREF(sub);
7369        return -1;
7370    }
7371
7372    result = stringlib_contains_obj(str, sub);
7373
7374    Py_DECREF(str);
7375    Py_DECREF(sub);
7376
7377    return result;
7378}
7379
7380/* Concat to string or Unicode object giving a new Unicode object. */
7381
7382PyObject *PyUnicode_Concat(PyObject *left,
7383                           PyObject *right)
7384{
7385    PyUnicodeObject *u = NULL, *v = NULL, *w;
7386
7387    /* Coerce the two arguments */
7388    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7389    if (u == NULL)
7390        goto onError;
7391    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7392    if (v == NULL)
7393        goto onError;
7394
7395    /* Shortcuts */
7396    if (v == unicode_empty) {
7397        Py_DECREF(v);
7398        return (PyObject *)u;
7399    }
7400    if (u == unicode_empty) {
7401        Py_DECREF(u);
7402        return (PyObject *)v;
7403    }
7404
7405    /* Concat the two Unicode strings */
7406    w = _PyUnicode_New(u->length + v->length);
7407    if (w == NULL)
7408        goto onError;
7409    Py_UNICODE_COPY(w->str, u->str, u->length);
7410    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7411
7412    Py_DECREF(u);
7413    Py_DECREF(v);
7414    return (PyObject *)w;
7415
7416  onError:
7417    Py_XDECREF(u);
7418    Py_XDECREF(v);
7419    return NULL;
7420}
7421
7422void
7423PyUnicode_Append(PyObject **pleft, PyObject *right)
7424{
7425    PyObject *new;
7426    if (*pleft == NULL)
7427        return;
7428    if (right == NULL || !PyUnicode_Check(*pleft)) {
7429        Py_DECREF(*pleft);
7430        *pleft = NULL;
7431        return;
7432    }
7433    new = PyUnicode_Concat(*pleft, right);
7434    Py_DECREF(*pleft);
7435    *pleft = new;
7436}
7437
7438void
7439PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7440{
7441    PyUnicode_Append(pleft, right);
7442    Py_XDECREF(right);
7443}
7444
7445PyDoc_STRVAR(count__doc__,
7446             "S.count(sub[, start[, end]]) -> int\n\
7447\n\
7448Return the number of non-overlapping occurrences of substring sub in\n\
7449string S[start:end].  Optional arguments start and end are\n\
7450interpreted as in slice notation.");
7451
7452static PyObject *
7453unicode_count(PyUnicodeObject *self, PyObject *args)
7454{
7455    PyUnicodeObject *substring;
7456    Py_ssize_t start = 0;
7457    Py_ssize_t end = PY_SSIZE_T_MAX;
7458    PyObject *result;
7459
7460    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7461                                            &start, &end))
7462        return NULL;
7463
7464    ADJUST_INDICES(start, end, self->length);
7465    result = PyLong_FromSsize_t(
7466        stringlib_count(self->str + start, end - start,
7467                        substring->str, substring->length,
7468                        PY_SSIZE_T_MAX)
7469        );
7470
7471    Py_DECREF(substring);
7472
7473    return result;
7474}
7475
7476PyDoc_STRVAR(encode__doc__,
7477             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
7478\n\
7479Encode S using the codec registered for encoding. Default encoding\n\
7480is 'utf-8'. errors may be given to set a different error\n\
7481handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7482a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7483'xmlcharrefreplace' as well as any other name registered with\n\
7484codecs.register_error that can handle UnicodeEncodeErrors.");
7485
7486static PyObject *
7487unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7488{
7489    static char *kwlist[] = {"encoding", "errors", 0};
7490    char *encoding = NULL;
7491    char *errors = NULL;
7492
7493    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7494                                     kwlist, &encoding, &errors))
7495        return NULL;
7496    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7497}
7498
7499PyDoc_STRVAR(expandtabs__doc__,
7500             "S.expandtabs([tabsize]) -> str\n\
7501\n\
7502Return a copy of S where all tab characters are expanded using spaces.\n\
7503If tabsize is not given, a tab size of 8 characters is assumed.");
7504
7505static PyObject*
7506unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7507{
7508    Py_UNICODE *e;
7509    Py_UNICODE *p;
7510    Py_UNICODE *q;
7511    Py_UNICODE *qe;
7512    Py_ssize_t i, j, incr;
7513    PyUnicodeObject *u;
7514    int tabsize = 8;
7515
7516    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7517        return NULL;
7518
7519    /* First pass: determine size of output string */
7520    i = 0; /* chars up to and including most recent \n or \r */
7521    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7522    e = self->str + self->length; /* end of input */
7523    for (p = self->str; p < e; p++)
7524        if (*p == '\t') {
7525            if (tabsize > 0) {
7526                incr = tabsize - (j % tabsize); /* cannot overflow */
7527                if (j > PY_SSIZE_T_MAX - incr)
7528                    goto overflow1;
7529                j += incr;
7530            }
7531        }
7532        else {
7533            if (j > PY_SSIZE_T_MAX - 1)
7534                goto overflow1;
7535            j++;
7536            if (*p == '\n' || *p == '\r') {
7537                if (i > PY_SSIZE_T_MAX - j)
7538                    goto overflow1;
7539                i += j;
7540                j = 0;
7541            }
7542        }
7543
7544    if (i > PY_SSIZE_T_MAX - j)
7545        goto overflow1;
7546
7547    /* Second pass: create output string and fill it */
7548    u = _PyUnicode_New(i + j);
7549    if (!u)
7550        return NULL;
7551
7552    j = 0; /* same as in first pass */
7553    q = u->str; /* next output char */
7554    qe = u->str + u->length; /* end of output */
7555
7556    for (p = self->str; p < e; p++)
7557        if (*p == '\t') {
7558            if (tabsize > 0) {
7559                i = tabsize - (j % tabsize);
7560                j += i;
7561                while (i--) {
7562                    if (q >= qe)
7563                        goto overflow2;
7564                    *q++ = ' ';
7565                }
7566            }
7567        }
7568        else {
7569            if (q >= qe)
7570                goto overflow2;
7571            *q++ = *p;
7572            j++;
7573            if (*p == '\n' || *p == '\r')
7574                j = 0;
7575        }
7576
7577    return (PyObject*) u;
7578
7579  overflow2:
7580    Py_DECREF(u);
7581  overflow1:
7582    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7583    return NULL;
7584}
7585
7586PyDoc_STRVAR(find__doc__,
7587             "S.find(sub[, start[, end]]) -> int\n\
7588\n\
7589Return the lowest index in S where substring sub is found,\n\
7590such that sub is contained within S[start:end].  Optional\n\
7591arguments start and end are interpreted as in slice notation.\n\
7592\n\
7593Return -1 on failure.");
7594
7595static PyObject *
7596unicode_find(PyUnicodeObject *self, PyObject *args)
7597{
7598    PyUnicodeObject *substring;
7599    Py_ssize_t start;
7600    Py_ssize_t end;
7601    Py_ssize_t result;
7602
7603    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7604                                            &start, &end))
7605        return NULL;
7606
7607    result = stringlib_find_slice(
7608        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7609        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7610        start, end
7611        );
7612
7613    Py_DECREF(substring);
7614
7615    return PyLong_FromSsize_t(result);
7616}
7617
7618static PyObject *
7619unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7620{
7621    if (index < 0 || index >= self->length) {
7622        PyErr_SetString(PyExc_IndexError, "string index out of range");
7623        return NULL;
7624    }
7625
7626    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7627}
7628
7629/* Believe it or not, this produces the same value for ASCII strings
7630   as string_hash(). */
7631static Py_hash_t
7632unicode_hash(PyUnicodeObject *self)
7633{
7634    Py_ssize_t len;
7635    Py_UNICODE *p;
7636    Py_hash_t x;
7637
7638    if (self->hash != -1)
7639        return self->hash;
7640    len = Py_SIZE(self);
7641    p = self->str;
7642    x = *p << 7;
7643    while (--len >= 0)
7644        x = (1000003*x) ^ *p++;
7645    x ^= Py_SIZE(self);
7646    if (x == -1)
7647        x = -2;
7648    self->hash = x;
7649    return x;
7650}
7651
7652PyDoc_STRVAR(index__doc__,
7653             "S.index(sub[, start[, end]]) -> int\n\
7654\n\
7655Like S.find() but raise ValueError when the substring is not found.");
7656
7657static PyObject *
7658unicode_index(PyUnicodeObject *self, PyObject *args)
7659{
7660    Py_ssize_t result;
7661    PyUnicodeObject *substring;
7662    Py_ssize_t start;
7663    Py_ssize_t end;
7664
7665    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7666                                            &start, &end))
7667        return NULL;
7668
7669    result = stringlib_find_slice(
7670        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7671        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7672        start, end
7673        );
7674
7675    Py_DECREF(substring);
7676
7677    if (result < 0) {
7678        PyErr_SetString(PyExc_ValueError, "substring not found");
7679        return NULL;
7680    }
7681
7682    return PyLong_FromSsize_t(result);
7683}
7684
7685PyDoc_STRVAR(islower__doc__,
7686             "S.islower() -> bool\n\
7687\n\
7688Return True if all cased characters in S are lowercase and there is\n\
7689at least one cased character in S, False otherwise.");
7690
7691static PyObject*
7692unicode_islower(PyUnicodeObject *self)
7693{
7694    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7695    register const Py_UNICODE *e;
7696    int cased;
7697
7698    /* Shortcut for single character strings */
7699    if (PyUnicode_GET_SIZE(self) == 1)
7700        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7701
7702    /* Special case for empty strings */
7703    if (PyUnicode_GET_SIZE(self) == 0)
7704        return PyBool_FromLong(0);
7705
7706    e = p + PyUnicode_GET_SIZE(self);
7707    cased = 0;
7708    for (; p < e; p++) {
7709        register const Py_UNICODE ch = *p;
7710
7711        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7712            return PyBool_FromLong(0);
7713        else if (!cased && Py_UNICODE_ISLOWER(ch))
7714            cased = 1;
7715    }
7716    return PyBool_FromLong(cased);
7717}
7718
7719PyDoc_STRVAR(isupper__doc__,
7720             "S.isupper() -> bool\n\
7721\n\
7722Return True if all cased characters in S are uppercase and there is\n\
7723at least one cased character in S, False otherwise.");
7724
7725static PyObject*
7726unicode_isupper(PyUnicodeObject *self)
7727{
7728    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7729    register const Py_UNICODE *e;
7730    int cased;
7731
7732    /* Shortcut for single character strings */
7733    if (PyUnicode_GET_SIZE(self) == 1)
7734        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7735
7736    /* Special case for empty strings */
7737    if (PyUnicode_GET_SIZE(self) == 0)
7738        return PyBool_FromLong(0);
7739
7740    e = p + PyUnicode_GET_SIZE(self);
7741    cased = 0;
7742    for (; p < e; p++) {
7743        register const Py_UNICODE ch = *p;
7744
7745        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7746            return PyBool_FromLong(0);
7747        else if (!cased && Py_UNICODE_ISUPPER(ch))
7748            cased = 1;
7749    }
7750    return PyBool_FromLong(cased);
7751}
7752
7753PyDoc_STRVAR(istitle__doc__,
7754             "S.istitle() -> bool\n\
7755\n\
7756Return True if S is a titlecased string and there is at least one\n\
7757character in S, i.e. upper- and titlecase characters may only\n\
7758follow uncased characters and lowercase characters only cased ones.\n\
7759Return False otherwise.");
7760
7761static PyObject*
7762unicode_istitle(PyUnicodeObject *self)
7763{
7764    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7765    register const Py_UNICODE *e;
7766    int cased, previous_is_cased;
7767
7768    /* Shortcut for single character strings */
7769    if (PyUnicode_GET_SIZE(self) == 1)
7770        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7771                               (Py_UNICODE_ISUPPER(*p) != 0));
7772
7773    /* Special case for empty strings */
7774    if (PyUnicode_GET_SIZE(self) == 0)
7775        return PyBool_FromLong(0);
7776
7777    e = p + PyUnicode_GET_SIZE(self);
7778    cased = 0;
7779    previous_is_cased = 0;
7780    for (; p < e; p++) {
7781        register const Py_UNICODE ch = *p;
7782
7783        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7784            if (previous_is_cased)
7785                return PyBool_FromLong(0);
7786            previous_is_cased = 1;
7787            cased = 1;
7788        }
7789        else if (Py_UNICODE_ISLOWER(ch)) {
7790            if (!previous_is_cased)
7791                return PyBool_FromLong(0);
7792            previous_is_cased = 1;
7793            cased = 1;
7794        }
7795        else
7796            previous_is_cased = 0;
7797    }
7798    return PyBool_FromLong(cased);
7799}
7800
7801PyDoc_STRVAR(isspace__doc__,
7802             "S.isspace() -> bool\n\
7803\n\
7804Return True if all characters in S are whitespace\n\
7805and there is at least one character in S, False otherwise.");
7806
7807static PyObject*
7808unicode_isspace(PyUnicodeObject *self)
7809{
7810    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7811    register const Py_UNICODE *e;
7812
7813    /* Shortcut for single character strings */
7814    if (PyUnicode_GET_SIZE(self) == 1 &&
7815        Py_UNICODE_ISSPACE(*p))
7816        return PyBool_FromLong(1);
7817
7818    /* Special case for empty strings */
7819    if (PyUnicode_GET_SIZE(self) == 0)
7820        return PyBool_FromLong(0);
7821
7822    e = p + PyUnicode_GET_SIZE(self);
7823    for (; p < e; p++) {
7824        if (!Py_UNICODE_ISSPACE(*p))
7825            return PyBool_FromLong(0);
7826    }
7827    return PyBool_FromLong(1);
7828}
7829
7830PyDoc_STRVAR(isalpha__doc__,
7831             "S.isalpha() -> bool\n\
7832\n\
7833Return True if all characters in S are alphabetic\n\
7834and there is at least one character in S, False otherwise.");
7835
7836static PyObject*
7837unicode_isalpha(PyUnicodeObject *self)
7838{
7839    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7840    register const Py_UNICODE *e;
7841
7842    /* Shortcut for single character strings */
7843    if (PyUnicode_GET_SIZE(self) == 1 &&
7844        Py_UNICODE_ISALPHA(*p))
7845        return PyBool_FromLong(1);
7846
7847    /* Special case for empty strings */
7848    if (PyUnicode_GET_SIZE(self) == 0)
7849        return PyBool_FromLong(0);
7850
7851    e = p + PyUnicode_GET_SIZE(self);
7852    for (; p < e; p++) {
7853        if (!Py_UNICODE_ISALPHA(*p))
7854            return PyBool_FromLong(0);
7855    }
7856    return PyBool_FromLong(1);
7857}
7858
7859PyDoc_STRVAR(isalnum__doc__,
7860             "S.isalnum() -> bool\n\
7861\n\
7862Return True if all characters in S are alphanumeric\n\
7863and there is at least one character in S, False otherwise.");
7864
7865static PyObject*
7866unicode_isalnum(PyUnicodeObject *self)
7867{
7868    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7869    register const Py_UNICODE *e;
7870
7871    /* Shortcut for single character strings */
7872    if (PyUnicode_GET_SIZE(self) == 1 &&
7873        Py_UNICODE_ISALNUM(*p))
7874        return PyBool_FromLong(1);
7875
7876    /* Special case for empty strings */
7877    if (PyUnicode_GET_SIZE(self) == 0)
7878        return PyBool_FromLong(0);
7879
7880    e = p + PyUnicode_GET_SIZE(self);
7881    for (; p < e; p++) {
7882        if (!Py_UNICODE_ISALNUM(*p))
7883            return PyBool_FromLong(0);
7884    }
7885    return PyBool_FromLong(1);
7886}
7887
7888PyDoc_STRVAR(isdecimal__doc__,
7889             "S.isdecimal() -> bool\n\
7890\n\
7891Return True if there are only decimal characters in S,\n\
7892False otherwise.");
7893
7894static PyObject*
7895unicode_isdecimal(PyUnicodeObject *self)
7896{
7897    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7898    register const Py_UNICODE *e;
7899
7900    /* Shortcut for single character strings */
7901    if (PyUnicode_GET_SIZE(self) == 1 &&
7902        Py_UNICODE_ISDECIMAL(*p))
7903        return PyBool_FromLong(1);
7904
7905    /* Special case for empty strings */
7906    if (PyUnicode_GET_SIZE(self) == 0)
7907        return PyBool_FromLong(0);
7908
7909    e = p + PyUnicode_GET_SIZE(self);
7910    for (; p < e; p++) {
7911        if (!Py_UNICODE_ISDECIMAL(*p))
7912            return PyBool_FromLong(0);
7913    }
7914    return PyBool_FromLong(1);
7915}
7916
7917PyDoc_STRVAR(isdigit__doc__,
7918             "S.isdigit() -> bool\n\
7919\n\
7920Return True if all characters in S are digits\n\
7921and there is at least one character in S, False otherwise.");
7922
7923static PyObject*
7924unicode_isdigit(PyUnicodeObject *self)
7925{
7926    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7927    register const Py_UNICODE *e;
7928
7929    /* Shortcut for single character strings */
7930    if (PyUnicode_GET_SIZE(self) == 1 &&
7931        Py_UNICODE_ISDIGIT(*p))
7932        return PyBool_FromLong(1);
7933
7934    /* Special case for empty strings */
7935    if (PyUnicode_GET_SIZE(self) == 0)
7936        return PyBool_FromLong(0);
7937
7938    e = p + PyUnicode_GET_SIZE(self);
7939    for (; p < e; p++) {
7940        if (!Py_UNICODE_ISDIGIT(*p))
7941            return PyBool_FromLong(0);
7942    }
7943    return PyBool_FromLong(1);
7944}
7945
7946PyDoc_STRVAR(isnumeric__doc__,
7947             "S.isnumeric() -> bool\n\
7948\n\
7949Return True if there are only numeric characters in S,\n\
7950False otherwise.");
7951
7952static PyObject*
7953unicode_isnumeric(PyUnicodeObject *self)
7954{
7955    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7956    register const Py_UNICODE *e;
7957
7958    /* Shortcut for single character strings */
7959    if (PyUnicode_GET_SIZE(self) == 1 &&
7960        Py_UNICODE_ISNUMERIC(*p))
7961        return PyBool_FromLong(1);
7962
7963    /* Special case for empty strings */
7964    if (PyUnicode_GET_SIZE(self) == 0)
7965        return PyBool_FromLong(0);
7966
7967    e = p + PyUnicode_GET_SIZE(self);
7968    for (; p < e; p++) {
7969        if (!Py_UNICODE_ISNUMERIC(*p))
7970            return PyBool_FromLong(0);
7971    }
7972    return PyBool_FromLong(1);
7973}
7974
7975int
7976PyUnicode_IsIdentifier(PyObject *self)
7977{
7978    register const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
7979    register const Py_UNICODE *e;
7980
7981    /* Special case for empty strings */
7982    if (PyUnicode_GET_SIZE(self) == 0)
7983        return 0;
7984
7985    /* PEP 3131 says that the first character must be in
7986       XID_Start and subsequent characters in XID_Continue,
7987       and for the ASCII range, the 2.x rules apply (i.e
7988       start with letters and underscore, continue with
7989       letters, digits, underscore). However, given the current
7990       definition of XID_Start and XID_Continue, it is sufficient
7991       to check just for these, except that _ must be allowed
7992       as starting an identifier.  */
7993    if (!_PyUnicode_IsXidStart(*p) && *p != 0x5F /* LOW LINE */)
7994        return 0;
7995
7996    e = p + PyUnicode_GET_SIZE(self);
7997    for (p++; p < e; p++) {
7998        if (!_PyUnicode_IsXidContinue(*p))
7999            return 0;
8000    }
8001    return 1;
8002}
8003
8004PyDoc_STRVAR(isidentifier__doc__,
8005             "S.isidentifier() -> bool\n\
8006\n\
8007Return True if S is a valid identifier according\n\
8008to the language definition.");
8009
8010static PyObject*
8011unicode_isidentifier(PyObject *self)
8012{
8013    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8014}
8015
8016PyDoc_STRVAR(isprintable__doc__,
8017             "S.isprintable() -> bool\n\
8018\n\
8019Return True if all characters in S are considered\n\
8020printable in repr() or S is empty, False otherwise.");
8021
8022static PyObject*
8023unicode_isprintable(PyObject *self)
8024{
8025    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8026    register const Py_UNICODE *e;
8027
8028    /* Shortcut for single character strings */
8029    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8030        Py_RETURN_TRUE;
8031    }
8032
8033    e = p + PyUnicode_GET_SIZE(self);
8034    for (; p < e; p++) {
8035        if (!Py_UNICODE_ISPRINTABLE(*p)) {
8036            Py_RETURN_FALSE;
8037        }
8038    }
8039    Py_RETURN_TRUE;
8040}
8041
8042PyDoc_STRVAR(join__doc__,
8043             "S.join(iterable) -> str\n\
8044\n\
8045Return a string which is the concatenation of the strings in the\n\
8046iterable.  The separator between elements is S.");
8047
8048static PyObject*
8049unicode_join(PyObject *self, PyObject *data)
8050{
8051    return PyUnicode_Join(self, data);
8052}
8053
8054static Py_ssize_t
8055unicode_length(PyUnicodeObject *self)
8056{
8057    return self->length;
8058}
8059
8060PyDoc_STRVAR(ljust__doc__,
8061             "S.ljust(width[, fillchar]) -> str\n\
8062\n\
8063Return S left-justified in a Unicode string of length width. Padding is\n\
8064done using the specified fill character (default is a space).");
8065
8066static PyObject *
8067unicode_ljust(PyUnicodeObject *self, PyObject *args)
8068{
8069    Py_ssize_t width;
8070    Py_UNICODE fillchar = ' ';
8071
8072    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8073        return NULL;
8074
8075    if (self->length >= width && PyUnicode_CheckExact(self)) {
8076        Py_INCREF(self);
8077        return (PyObject*) self;
8078    }
8079
8080    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8081}
8082
8083PyDoc_STRVAR(lower__doc__,
8084             "S.lower() -> str\n\
8085\n\
8086Return a copy of the string S converted to lowercase.");
8087
8088static PyObject*
8089unicode_lower(PyUnicodeObject *self)
8090{
8091    return fixup(self, fixlower);
8092}
8093
/* Strip modes understood by the strip helpers below; the values double
   as indices into stripformat[]. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above: PyArg_ParseTuple format strings, one per
   strip mode. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for strip mode i: the format string with its leading
   three characters ("|O:") skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8102
8103/* externally visible for str.strip(unicode) */
8104PyObject *
8105_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8106{
8107    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8108    Py_ssize_t len = PyUnicode_GET_SIZE(self);
8109    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8110    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8111    Py_ssize_t i, j;
8112
8113    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
8114
8115    i = 0;
8116    if (striptype != RIGHTSTRIP) {
8117        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8118            i++;
8119        }
8120    }
8121
8122    j = len;
8123    if (striptype != LEFTSTRIP) {
8124        do {
8125            j--;
8126        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8127        j++;
8128    }
8129
8130    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8131        Py_INCREF(self);
8132        return (PyObject*)self;
8133    }
8134    else
8135        return PyUnicode_FromUnicode(s+i, j-i);
8136}
8137
8138
8139static PyObject *
8140do_strip(PyUnicodeObject *self, int striptype)
8141{
8142    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8143    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
8144
8145    i = 0;
8146    if (striptype != RIGHTSTRIP) {
8147        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8148            i++;
8149        }
8150    }
8151
8152    j = len;
8153    if (striptype != LEFTSTRIP) {
8154        do {
8155            j--;
8156        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8157        j++;
8158    }
8159
8160    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8161        Py_INCREF(self);
8162        return (PyObject*)self;
8163    }
8164    else
8165        return PyUnicode_FromUnicode(s+i, j-i);
8166}
8167
8168
8169static PyObject *
8170do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8171{
8172    PyObject *sep = NULL;
8173
8174    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8175        return NULL;
8176
8177    if (sep != NULL && sep != Py_None) {
8178        if (PyUnicode_Check(sep))
8179            return _PyUnicode_XStrip(self, striptype, sep);
8180        else {
8181            PyErr_Format(PyExc_TypeError,
8182                         "%s arg must be None or str",
8183                         STRIPNAME(striptype));
8184            return NULL;
8185        }
8186    }
8187
8188    return do_strip(self, striptype);
8189}
8190
8191
8192PyDoc_STRVAR(strip__doc__,
8193             "S.strip([chars]) -> str\n\
8194\n\
8195Return a copy of the string S with leading and trailing\n\
8196whitespace removed.\n\
8197If chars is given and not None, remove characters in chars instead.");
8198
8199static PyObject *
8200unicode_strip(PyUnicodeObject *self, PyObject *args)
8201{
8202    if (PyTuple_GET_SIZE(args) == 0)
8203        return do_strip(self, BOTHSTRIP); /* Common case */
8204    else
8205        return do_argstrip(self, BOTHSTRIP, args);
8206}
8207
8208
8209PyDoc_STRVAR(lstrip__doc__,
8210             "S.lstrip([chars]) -> str\n\
8211\n\
8212Return a copy of the string S with leading whitespace removed.\n\
8213If chars is given and not None, remove characters in chars instead.");
8214
8215static PyObject *
8216unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8217{
8218    if (PyTuple_GET_SIZE(args) == 0)
8219        return do_strip(self, LEFTSTRIP); /* Common case */
8220    else
8221        return do_argstrip(self, LEFTSTRIP, args);
8222}
8223
8224
8225PyDoc_STRVAR(rstrip__doc__,
8226             "S.rstrip([chars]) -> str\n\
8227\n\
8228Return a copy of the string S with trailing whitespace removed.\n\
8229If chars is given and not None, remove characters in chars instead.");
8230
8231static PyObject *
8232unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8233{
8234    if (PyTuple_GET_SIZE(args) == 0)
8235        return do_strip(self, RIGHTSTRIP); /* Common case */
8236    else
8237        return do_argstrip(self, RIGHTSTRIP, args);
8238}
8239
8240
8241static PyObject*
8242unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8243{
8244    PyUnicodeObject *u;
8245    Py_UNICODE *p;
8246    Py_ssize_t nchars;
8247    size_t nbytes;
8248
8249    if (len < 1) {
8250        Py_INCREF(unicode_empty);
8251        return (PyObject *)unicode_empty;
8252    }
8253
8254    if (len == 1 && PyUnicode_CheckExact(str)) {
8255        /* no repeat, return original string */
8256        Py_INCREF(str);
8257        return (PyObject*) str;
8258    }
8259
8260    /* ensure # of chars needed doesn't overflow int and # of bytes
8261     * needed doesn't overflow size_t
8262     */
8263    nchars = len * str->length;
8264    if (nchars / len != str->length) {
8265        PyErr_SetString(PyExc_OverflowError,
8266                        "repeated string is too long");
8267        return NULL;
8268    }
8269    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8270    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8271        PyErr_SetString(PyExc_OverflowError,
8272                        "repeated string is too long");
8273        return NULL;
8274    }
8275    u = _PyUnicode_New(nchars);
8276    if (!u)
8277        return NULL;
8278
8279    p = u->str;
8280
8281    if (str->length == 1) {
8282        Py_UNICODE_FILL(p, str->str[0], len);
8283    } else {
8284        Py_ssize_t done = str->length; /* number of characters copied this far */
8285        Py_UNICODE_COPY(p, str->str, str->length);
8286        while (done < nchars) {
8287            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8288            Py_UNICODE_COPY(p+done, p, n);
8289            done += n;
8290        }
8291    }
8292
8293    return (PyObject*) u;
8294}
8295
8296PyObject *PyUnicode_Replace(PyObject *obj,
8297                            PyObject *subobj,
8298                            PyObject *replobj,
8299                            Py_ssize_t maxcount)
8300{
8301    PyObject *self;
8302    PyObject *str1;
8303    PyObject *str2;
8304    PyObject *result;
8305
8306    self = PyUnicode_FromObject(obj);
8307    if (self == NULL)
8308        return NULL;
8309    str1 = PyUnicode_FromObject(subobj);
8310    if (str1 == NULL) {
8311        Py_DECREF(self);
8312        return NULL;
8313    }
8314    str2 = PyUnicode_FromObject(replobj);
8315    if (str2 == NULL) {
8316        Py_DECREF(self);
8317        Py_DECREF(str1);
8318        return NULL;
8319    }
8320    result = replace((PyUnicodeObject *)self,
8321                     (PyUnicodeObject *)str1,
8322                     (PyUnicodeObject *)str2,
8323                     maxcount);
8324    Py_DECREF(self);
8325    Py_DECREF(str1);
8326    Py_DECREF(str2);
8327    return result;
8328}
8329
8330PyDoc_STRVAR(replace__doc__,
8331             "S.replace(old, new[, count]) -> str\n\
8332\n\
8333Return a copy of S with all occurrences of substring\n\
8334old replaced by new.  If the optional argument count is\n\
8335given, only the first count occurrences are replaced.");
8336
8337static PyObject*
8338unicode_replace(PyUnicodeObject *self, PyObject *args)
8339{
8340    PyUnicodeObject *str1;
8341    PyUnicodeObject *str2;
8342    Py_ssize_t maxcount = -1;
8343    PyObject *result;
8344
8345    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8346        return NULL;
8347    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8348    if (str1 == NULL)
8349        return NULL;
8350    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8351    if (str2 == NULL) {
8352        Py_DECREF(str1);
8353        return NULL;
8354    }
8355
8356    result = replace(self, str1, str2, maxcount);
8357
8358    Py_DECREF(str1);
8359    Py_DECREF(str2);
8360    return result;
8361}
8362
8363static
8364PyObject *unicode_repr(PyObject *unicode)
8365{
8366    PyObject *repr;
8367    Py_UNICODE *p;
8368    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8369    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8370
8371    /* XXX(nnorwitz): rather than over-allocating, it would be
8372       better to choose a different scheme.  Perhaps scan the
8373       first N-chars of the string and allocate based on that size.
8374    */
8375    /* Initial allocation is based on the longest-possible unichr
8376       escape.
8377
8378       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8379       unichr, so in this case it's the longest unichr escape. In
8380       narrow (UTF-16) builds this is five chars per source unichr
8381       since there are two unichrs in the surrogate pair, so in narrow
8382       (UTF-16) builds it's not the longest unichr escape.
8383
8384       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8385       so in the narrow (UTF-16) build case it's the longest unichr
8386       escape.
8387    */
8388
8389    repr = PyUnicode_FromUnicode(NULL,
8390                                 2 /* quotes */
8391#ifdef Py_UNICODE_WIDE
8392                                 + 10*size
8393#else
8394                                 + 6*size
8395#endif
8396                                 + 1);
8397    if (repr == NULL)
8398        return NULL;
8399
8400    p = PyUnicode_AS_UNICODE(repr);
8401
8402    /* Add quote */
8403    *p++ = (findchar(s, size, '\'') &&
8404            !findchar(s, size, '"')) ? '"' : '\'';
8405    while (size-- > 0) {
8406        Py_UNICODE ch = *s++;
8407
8408        /* Escape quotes and backslashes */
8409        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8410            *p++ = '\\';
8411            *p++ = ch;
8412            continue;
8413        }
8414
8415        /* Map special whitespace to '\t', \n', '\r' */
8416        if (ch == '\t') {
8417            *p++ = '\\';
8418            *p++ = 't';
8419        }
8420        else if (ch == '\n') {
8421            *p++ = '\\';
8422            *p++ = 'n';
8423        }
8424        else if (ch == '\r') {
8425            *p++ = '\\';
8426            *p++ = 'r';
8427        }
8428
8429        /* Map non-printable US ASCII to '\xhh' */
8430        else if (ch < ' ' || ch == 0x7F) {
8431            *p++ = '\\';
8432            *p++ = 'x';
8433            *p++ = hexdigits[(ch >> 4) & 0x000F];
8434            *p++ = hexdigits[ch & 0x000F];
8435        }
8436
8437        /* Copy ASCII characters as-is */
8438        else if (ch < 0x7F) {
8439            *p++ = ch;
8440        }
8441
8442        /* Non-ASCII characters */
8443        else {
8444            Py_UCS4 ucs = ch;
8445
8446#ifndef Py_UNICODE_WIDE
8447            Py_UNICODE ch2 = 0;
8448            /* Get code point from surrogate pair */
8449            if (size > 0) {
8450                ch2 = *s;
8451                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8452                    && ch2 <= 0xDFFF) {
8453                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8454                        + 0x00010000;
8455                    s++;
8456                    size--;
8457                }
8458            }
8459#endif
8460            /* Map Unicode whitespace and control characters
8461               (categories Z* and C* except ASCII space)
8462            */
8463            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8464                /* Map 8-bit characters to '\xhh' */
8465                if (ucs <= 0xff) {
8466                    *p++ = '\\';
8467                    *p++ = 'x';
8468                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8469                    *p++ = hexdigits[ch & 0x000F];
8470                }
8471                /* Map 21-bit characters to '\U00xxxxxx' */
8472                else if (ucs >= 0x10000) {
8473                    *p++ = '\\';
8474                    *p++ = 'U';
8475                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8476                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8477                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8478                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8479                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8480                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8481                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8482                    *p++ = hexdigits[ucs & 0x0000000F];
8483                }
8484                /* Map 16-bit characters to '\uxxxx' */
8485                else {
8486                    *p++ = '\\';
8487                    *p++ = 'u';
8488                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8489                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8490                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8491                    *p++ = hexdigits[ucs & 0x000F];
8492                }
8493            }
8494            /* Copy characters as-is */
8495            else {
8496                *p++ = ch;
8497#ifndef Py_UNICODE_WIDE
8498                if (ucs >= 0x10000)
8499                    *p++ = ch2;
8500#endif
8501            }
8502        }
8503    }
8504    /* Add quote */
8505    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8506
8507    *p = '\0';
8508    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8509    return repr;
8510}
8511
8512PyDoc_STRVAR(rfind__doc__,
8513             "S.rfind(sub[, start[, end]]) -> int\n\
8514\n\
8515Return the highest index in S where substring sub is found,\n\
8516such that sub is contained within S[start:end].  Optional\n\
8517arguments start and end are interpreted as in slice notation.\n\
8518\n\
8519Return -1 on failure.");
8520
8521static PyObject *
8522unicode_rfind(PyUnicodeObject *self, PyObject *args)
8523{
8524    PyUnicodeObject *substring;
8525    Py_ssize_t start;
8526    Py_ssize_t end;
8527    Py_ssize_t result;
8528
8529    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8530                                            &start, &end))
8531        return NULL;
8532
8533    result = stringlib_rfind_slice(
8534        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8535        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8536        start, end
8537        );
8538
8539    Py_DECREF(substring);
8540
8541    return PyLong_FromSsize_t(result);
8542}
8543
8544PyDoc_STRVAR(rindex__doc__,
8545             "S.rindex(sub[, start[, end]]) -> int\n\
8546\n\
8547Like S.rfind() but raise ValueError when the substring is not found.");
8548
8549static PyObject *
8550unicode_rindex(PyUnicodeObject *self, PyObject *args)
8551{
8552    PyUnicodeObject *substring;
8553    Py_ssize_t start;
8554    Py_ssize_t end;
8555    Py_ssize_t result;
8556
8557    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8558                                            &start, &end))
8559        return NULL;
8560
8561    result = stringlib_rfind_slice(
8562        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8563        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8564        start, end
8565        );
8566
8567    Py_DECREF(substring);
8568
8569    if (result < 0) {
8570        PyErr_SetString(PyExc_ValueError, "substring not found");
8571        return NULL;
8572    }
8573    return PyLong_FromSsize_t(result);
8574}
8575
8576PyDoc_STRVAR(rjust__doc__,
8577             "S.rjust(width[, fillchar]) -> str\n\
8578\n\
8579Return S right-justified in a string of length width. Padding is\n\
8580done using the specified fill character (default is a space).");
8581
8582static PyObject *
8583unicode_rjust(PyUnicodeObject *self, PyObject *args)
8584{
8585    Py_ssize_t width;
8586    Py_UNICODE fillchar = ' ';
8587
8588    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8589        return NULL;
8590
8591    if (self->length >= width && PyUnicode_CheckExact(self)) {
8592        Py_INCREF(self);
8593        return (PyObject*) self;
8594    }
8595
8596    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8597}
8598
8599PyObject *PyUnicode_Split(PyObject *s,
8600                          PyObject *sep,
8601                          Py_ssize_t maxsplit)
8602{
8603    PyObject *result;
8604
8605    s = PyUnicode_FromObject(s);
8606    if (s == NULL)
8607        return NULL;
8608    if (sep != NULL) {
8609        sep = PyUnicode_FromObject(sep);
8610        if (sep == NULL) {
8611            Py_DECREF(s);
8612            return NULL;
8613        }
8614    }
8615
8616    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8617
8618    Py_DECREF(s);
8619    Py_XDECREF(sep);
8620    return result;
8621}
8622
8623PyDoc_STRVAR(split__doc__,
8624             "S.split([sep[, maxsplit]]) -> list of strings\n\
8625\n\
8626Return a list of the words in S, using sep as the\n\
8627delimiter string.  If maxsplit is given, at most maxsplit\n\
8628splits are done. If sep is not specified or is None, any\n\
8629whitespace string is a separator and empty strings are\n\
8630removed from the result.");
8631
8632static PyObject*
8633unicode_split(PyUnicodeObject *self, PyObject *args)
8634{
8635    PyObject *substring = Py_None;
8636    Py_ssize_t maxcount = -1;
8637
8638    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8639        return NULL;
8640
8641    if (substring == Py_None)
8642        return split(self, NULL, maxcount);
8643    else if (PyUnicode_Check(substring))
8644        return split(self, (PyUnicodeObject *)substring, maxcount);
8645    else
8646        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8647}
8648
8649PyObject *
8650PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8651{
8652    PyObject* str_obj;
8653    PyObject* sep_obj;
8654    PyObject* out;
8655
8656    str_obj = PyUnicode_FromObject(str_in);
8657    if (!str_obj)
8658        return NULL;
8659    sep_obj = PyUnicode_FromObject(sep_in);
8660    if (!sep_obj) {
8661        Py_DECREF(str_obj);
8662        return NULL;
8663    }
8664
8665    out = stringlib_partition(
8666        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8667        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8668        );
8669
8670    Py_DECREF(sep_obj);
8671    Py_DECREF(str_obj);
8672
8673    return out;
8674}
8675
8676
8677PyObject *
8678PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8679{
8680    PyObject* str_obj;
8681    PyObject* sep_obj;
8682    PyObject* out;
8683
8684    str_obj = PyUnicode_FromObject(str_in);
8685    if (!str_obj)
8686        return NULL;
8687    sep_obj = PyUnicode_FromObject(sep_in);
8688    if (!sep_obj) {
8689        Py_DECREF(str_obj);
8690        return NULL;
8691    }
8692
8693    out = stringlib_rpartition(
8694        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8695        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8696        );
8697
8698    Py_DECREF(sep_obj);
8699    Py_DECREF(str_obj);
8700
8701    return out;
8702}
8703
8704PyDoc_STRVAR(partition__doc__,
8705             "S.partition(sep) -> (head, sep, tail)\n\
8706\n\
8707Search for the separator sep in S, and return the part before it,\n\
8708the separator itself, and the part after it.  If the separator is not\n\
8709found, return S and two empty strings.");
8710
8711static PyObject*
8712unicode_partition(PyUnicodeObject *self, PyObject *separator)
8713{
8714    return PyUnicode_Partition((PyObject *)self, separator);
8715}
8716
8717PyDoc_STRVAR(rpartition__doc__,
8718             "S.rpartition(sep) -> (head, sep, tail)\n\
8719\n\
8720Search for the separator sep in S, starting at the end of S, and return\n\
8721the part before it, the separator itself, and the part after it.  If the\n\
8722separator is not found, return two empty strings and S.");
8723
8724static PyObject*
8725unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8726{
8727    return PyUnicode_RPartition((PyObject *)self, separator);
8728}
8729
8730PyObject *PyUnicode_RSplit(PyObject *s,
8731                           PyObject *sep,
8732                           Py_ssize_t maxsplit)
8733{
8734    PyObject *result;
8735
8736    s = PyUnicode_FromObject(s);
8737    if (s == NULL)
8738        return NULL;
8739    if (sep != NULL) {
8740        sep = PyUnicode_FromObject(sep);
8741        if (sep == NULL) {
8742            Py_DECREF(s);
8743            return NULL;
8744        }
8745    }
8746
8747    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8748
8749    Py_DECREF(s);
8750    Py_XDECREF(sep);
8751    return result;
8752}
8753
8754PyDoc_STRVAR(rsplit__doc__,
8755             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8756\n\
8757Return a list of the words in S, using sep as the\n\
8758delimiter string, starting at the end of the string and\n\
8759working to the front.  If maxsplit is given, at most maxsplit\n\
8760splits are done. If sep is not specified, any whitespace string\n\
8761is a separator.");
8762
8763static PyObject*
8764unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8765{
8766    PyObject *substring = Py_None;
8767    Py_ssize_t maxcount = -1;
8768
8769    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8770        return NULL;
8771
8772    if (substring == Py_None)
8773        return rsplit(self, NULL, maxcount);
8774    else if (PyUnicode_Check(substring))
8775        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8776    else
8777        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8778}
8779
8780PyDoc_STRVAR(splitlines__doc__,
8781             "S.splitlines([keepends]) -> list of strings\n\
8782\n\
8783Return a list of the lines in S, breaking at line boundaries.\n\
8784Line breaks are not included in the resulting list unless keepends\n\
8785is given and true.");
8786
8787static PyObject*
8788unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8789{
8790    int keepends = 0;
8791
8792    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8793        return NULL;
8794
8795    return PyUnicode_Splitlines((PyObject *)self, keepends);
8796}
8797
8798static
8799PyObject *unicode_str(PyObject *self)
8800{
8801    if (PyUnicode_CheckExact(self)) {
8802        Py_INCREF(self);
8803        return self;
8804    } else
8805        /* Subtype -- return genuine unicode string with the same value. */
8806        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8807                                     PyUnicode_GET_SIZE(self));
8808}
8809
8810PyDoc_STRVAR(swapcase__doc__,
8811             "S.swapcase() -> str\n\
8812\n\
8813Return a copy of S with uppercase characters converted to lowercase\n\
8814and vice versa.");
8815
8816static PyObject*
8817unicode_swapcase(PyUnicodeObject *self)
8818{
8819    return fixup(self, fixswapcase);
8820}
8821
8822PyDoc_STRVAR(maketrans__doc__,
8823             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8824\n\
8825Return a translation table usable for str.translate().\n\
8826If there is only one argument, it must be a dictionary mapping Unicode\n\
8827ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8828Character keys will be then converted to ordinals.\n\
8829If there are two arguments, they must be strings of equal length, and\n\
8830in the resulting dictionary, each character in x will be mapped to the\n\
8831character at the same position in y. If there is a third argument, it\n\
8832must be a string, whose characters will be mapped to None in the result.");
8833
8834static PyObject*
8835unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8836{
8837    PyObject *x, *y = NULL, *z = NULL;
8838    PyObject *new = NULL, *key, *value;
8839    Py_ssize_t i = 0;
8840    int res;
8841
8842    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8843        return NULL;
8844    new = PyDict_New();
8845    if (!new)
8846        return NULL;
8847    if (y != NULL) {
8848        /* x must be a string too, of equal length */
8849        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8850        if (!PyUnicode_Check(x)) {
8851            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8852                            "be a string if there is a second argument");
8853            goto err;
8854        }
8855        if (PyUnicode_GET_SIZE(x) != ylen) {
8856            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8857                            "arguments must have equal length");
8858            goto err;
8859        }
8860        /* create entries for translating chars in x to those in y */
8861        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8862            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8863            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8864            if (!key || !value)
8865                goto err;
8866            res = PyDict_SetItem(new, key, value);
8867            Py_DECREF(key);
8868            Py_DECREF(value);
8869            if (res < 0)
8870                goto err;
8871        }
8872        /* create entries for deleting chars in z */
8873        if (z != NULL) {
8874            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8875                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8876                if (!key)
8877                    goto err;
8878                res = PyDict_SetItem(new, key, Py_None);
8879                Py_DECREF(key);
8880                if (res < 0)
8881                    goto err;
8882            }
8883        }
8884    } else {
8885        /* x must be a dict */
8886        if (!PyDict_CheckExact(x)) {
8887            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8888                            "to maketrans it must be a dict");
8889            goto err;
8890        }
8891        /* copy entries into the new dict, converting string keys to int keys */
8892        while (PyDict_Next(x, &i, &key, &value)) {
8893            if (PyUnicode_Check(key)) {
8894                /* convert string keys to integer keys */
8895                PyObject *newkey;
8896                if (PyUnicode_GET_SIZE(key) != 1) {
8897                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8898                                    "table must be of length 1");
8899                    goto err;
8900                }
8901                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8902                if (!newkey)
8903                    goto err;
8904                res = PyDict_SetItem(new, newkey, value);
8905                Py_DECREF(newkey);
8906                if (res < 0)
8907                    goto err;
8908            } else if (PyLong_Check(key)) {
8909                /* just keep integer keys */
8910                if (PyDict_SetItem(new, key, value) < 0)
8911                    goto err;
8912            } else {
8913                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
8914                                "be strings or integers");
8915                goto err;
8916            }
8917        }
8918    }
8919    return new;
8920  err:
8921    Py_DECREF(new);
8922    return NULL;
8923}
8924
8925PyDoc_STRVAR(translate__doc__,
8926             "S.translate(table) -> str\n\
8927\n\
8928Return a copy of the string S, where all characters have been mapped\n\
8929through the given translation table, which must be a mapping of\n\
8930Unicode ordinals to Unicode ordinals, strings, or None.\n\
8931Unmapped characters are left untouched. Characters mapped to None\n\
8932are deleted.");
8933
8934static PyObject*
8935unicode_translate(PyUnicodeObject *self, PyObject *table)
8936{
8937    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
8938}
8939
8940PyDoc_STRVAR(upper__doc__,
8941             "S.upper() -> str\n\
8942\n\
8943Return a copy of S converted to uppercase.");
8944
8945static PyObject*
8946unicode_upper(PyUnicodeObject *self)
8947{
8948    return fixup(self, fixupper);
8949}
8950
8951PyDoc_STRVAR(zfill__doc__,
8952             "S.zfill(width) -> str\n\
8953\n\
8954Pad a numeric string S with zeros on the left, to fill a field\n\
8955of the specified width. The string S is never truncated.");
8956
8957static PyObject *
8958unicode_zfill(PyUnicodeObject *self, PyObject *args)
8959{
8960    Py_ssize_t fill;
8961    PyUnicodeObject *u;
8962
8963    Py_ssize_t width;
8964    if (!PyArg_ParseTuple(args, "n:zfill", &width))
8965        return NULL;
8966
8967    if (self->length >= width) {
8968        if (PyUnicode_CheckExact(self)) {
8969            Py_INCREF(self);
8970            return (PyObject*) self;
8971        }
8972        else
8973            return PyUnicode_FromUnicode(
8974                PyUnicode_AS_UNICODE(self),
8975                PyUnicode_GET_SIZE(self)
8976                );
8977    }
8978
8979    fill = width - self->length;
8980
8981    u = pad(self, fill, 0, '0');
8982
8983    if (u == NULL)
8984        return NULL;
8985
8986    if (u->str[fill] == '+' || u->str[fill] == '-') {
8987        /* move sign to beginning of string */
8988        u->str[0] = u->str[fill];
8989        u->str[fill] = '0';
8990    }
8991
8992    return (PyObject*) u;
8993}
8994
8995#if 0
8996static PyObject*
8997unicode_freelistsize(PyUnicodeObject *self)
8998{
8999    return PyLong_FromLong(numfree);
9000}
9001
9002static PyObject *
9003unicode__decimal2ascii(PyObject *self)
9004{
9005    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
9006                                             PyUnicode_GET_SIZE(self));
9007}
9008#endif
9009
9010PyDoc_STRVAR(startswith__doc__,
9011             "S.startswith(prefix[, start[, end]]) -> bool\n\
9012\n\
9013Return True if S starts with the specified prefix, False otherwise.\n\
9014With optional start, test S beginning at that position.\n\
9015With optional end, stop comparing S at that position.\n\
9016prefix can also be a tuple of strings to try.");
9017
9018static PyObject *
9019unicode_startswith(PyUnicodeObject *self,
9020                   PyObject *args)
9021{
9022    PyObject *subobj;
9023    PyUnicodeObject *substring;
9024    Py_ssize_t start = 0;
9025    Py_ssize_t end = PY_SSIZE_T_MAX;
9026    int result;
9027
9028    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
9029        return NULL;
9030    if (PyTuple_Check(subobj)) {
9031        Py_ssize_t i;
9032        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9033            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9034                PyTuple_GET_ITEM(subobj, i));
9035            if (substring == NULL)
9036                return NULL;
9037            result = tailmatch(self, substring, start, end, -1);
9038            Py_DECREF(substring);
9039            if (result) {
9040                Py_RETURN_TRUE;
9041            }
9042        }
9043        /* nothing matched */
9044        Py_RETURN_FALSE;
9045    }
9046    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9047    if (substring == NULL) {
9048        if (PyErr_ExceptionMatches(PyExc_TypeError))
9049            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9050                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9051        return NULL;
9052    }
9053    result = tailmatch(self, substring, start, end, -1);
9054    Py_DECREF(substring);
9055    return PyBool_FromLong(result);
9056}
9057
9058
9059PyDoc_STRVAR(endswith__doc__,
9060             "S.endswith(suffix[, start[, end]]) -> bool\n\
9061\n\
9062Return True if S ends with the specified suffix, False otherwise.\n\
9063With optional start, test S beginning at that position.\n\
9064With optional end, stop comparing S at that position.\n\
9065suffix can also be a tuple of strings to try.");
9066
9067static PyObject *
9068unicode_endswith(PyUnicodeObject *self,
9069                 PyObject *args)
9070{
9071    PyObject *subobj;
9072    PyUnicodeObject *substring;
9073    Py_ssize_t start = 0;
9074    Py_ssize_t end = PY_SSIZE_T_MAX;
9075    int result;
9076
9077    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
9078        return NULL;
9079    if (PyTuple_Check(subobj)) {
9080        Py_ssize_t i;
9081        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9082            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9083                PyTuple_GET_ITEM(subobj, i));
9084            if (substring == NULL)
9085                return NULL;
9086            result = tailmatch(self, substring, start, end, +1);
9087            Py_DECREF(substring);
9088            if (result) {
9089                Py_RETURN_TRUE;
9090            }
9091        }
9092        Py_RETURN_FALSE;
9093    }
9094    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9095    if (substring == NULL) {
9096        if (PyErr_ExceptionMatches(PyExc_TypeError))
9097            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9098                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9099        return NULL;
9100    }
9101    result = tailmatch(self, substring, start, end, +1);
9102    Py_DECREF(substring);
9103    return PyBool_FromLong(result);
9104}
9105
9106#include "stringlib/string_format.h"
9107
9108PyDoc_STRVAR(format__doc__,
9109             "S.format(*args, **kwargs) -> str\n\
9110\n\
9111Return a formatted version of S, using substitutions from args and kwargs.\n\
9112The substitutions are identified by braces ('{' and '}').");
9113
9114PyDoc_STRVAR(format_map__doc__,
9115             "S.format_map(mapping) -> str\n\
9116\n\
9117Return a formatted version of S, using substitutions from mapping.\n\
9118The substitutions are identified by braces ('{' and '}').");
9119
9120static PyObject *
9121unicode__format__(PyObject* self, PyObject* args)
9122{
9123    PyObject *format_spec;
9124
9125    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9126        return NULL;
9127
9128    return _PyUnicode_FormatAdvanced(self,
9129                                     PyUnicode_AS_UNICODE(format_spec),
9130                                     PyUnicode_GET_SIZE(format_spec));
9131}
9132
9133PyDoc_STRVAR(p_format__doc__,
9134             "S.__format__(format_spec) -> str\n\
9135\n\
9136Return a formatted version of S as described by format_spec.");
9137
9138static PyObject *
9139unicode__sizeof__(PyUnicodeObject *v)
9140{
9141    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9142                              sizeof(Py_UNICODE) * (v->length + 1));
9143}
9144
9145PyDoc_STRVAR(sizeof__doc__,
9146             "S.__sizeof__() -> size of S in memory, in bytes");
9147
9148static PyObject *
9149unicode_getnewargs(PyUnicodeObject *v)
9150{
9151    return Py_BuildValue("(u#)", v->str, v->length);
9152}
9153
9154static PyMethodDef unicode_methods[] = {
9155
9156    /* Order is according to common usage: often used methods should
9157       appear first, since lookup is done sequentially. */
9158
9159    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
9160    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
9161    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
9162    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
9163    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
9164    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
9165    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
9166    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
9167    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
9168    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
9169    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
9170    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
9171    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
9172    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
9173    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
9174    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
9175    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
9176    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
9177    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
9178    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
9179    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
9180    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
9181    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
9182    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
9183    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
9184    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
9185    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
9186    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
9187    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
9188    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
9189    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
9190    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
9191    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
9192    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
9193    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
9194    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
9195    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
9196    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
9197    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
9198    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
9199    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
9200    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
9201    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
9202    {"maketrans", (PyCFunction) unicode_maketrans,
9203     METH_VARARGS | METH_STATIC, maketrans__doc__},
9204    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
9205#if 0
9206    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
9207#endif
9208
9209#if 0
9210    /* These methods are just used for debugging the implementation. */
9211    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
9212    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
9213#endif
9214
9215    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
9216    {NULL, NULL}
9217};
9218
9219static PyObject *
9220unicode_mod(PyObject *v, PyObject *w)
9221{
9222    if (!PyUnicode_Check(v)) {
9223        Py_INCREF(Py_NotImplemented);
9224        return Py_NotImplemented;
9225    }
9226    return PyUnicode_Format(v, w);
9227}
9228
/* Number-protocol slot table for str.  Only nb_remainder is filled in
   (with unicode_mod, i.e. the `%` operator); the initializer stops after
   that entry, so every later slot is implicitly zero. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
9235
/* Sequence-protocol slot table for str.  The entries are positional, so
   the order below must follow the PySequenceMethods layout; slots set to
   0 are not provided (no slice or item assignment is wired up here). */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
9246
9247static PyObject*
9248unicode_subscript(PyUnicodeObject* self, PyObject* item)
9249{
9250    if (PyIndex_Check(item)) {
9251        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9252        if (i == -1 && PyErr_Occurred())
9253            return NULL;
9254        if (i < 0)
9255            i += PyUnicode_GET_SIZE(self);
9256        return unicode_getitem(self, i);
9257    } else if (PySlice_Check(item)) {
9258        Py_ssize_t start, stop, step, slicelength, cur, i;
9259        Py_UNICODE* source_buf;
9260        Py_UNICODE* result_buf;
9261        PyObject* result;
9262
9263        if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
9264                                 &start, &stop, &step, &slicelength) < 0) {
9265            return NULL;
9266        }
9267
9268        if (slicelength <= 0) {
9269            return PyUnicode_FromUnicode(NULL, 0);
9270        } else if (start == 0 && step == 1 && slicelength == self->length &&
9271                   PyUnicode_CheckExact(self)) {
9272            Py_INCREF(self);
9273            return (PyObject *)self;
9274        } else if (step == 1) {
9275            return PyUnicode_FromUnicode(self->str + start, slicelength);
9276        } else {
9277            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9278            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9279                                                       sizeof(Py_UNICODE));
9280
9281            if (result_buf == NULL)
9282                return PyErr_NoMemory();
9283
9284            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9285                result_buf[i] = source_buf[cur];
9286            }
9287
9288            result = PyUnicode_FromUnicode(result_buf, slicelength);
9289            PyObject_FREE(result_buf);
9290            return result;
9291        }
9292    } else {
9293        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9294        return NULL;
9295    }
9296}
9297
/* Mapping-protocol slot table for str: length and subscripting
   (unicode_subscript handles both integer indices and slices).  No
   subscript assignment is provided. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript */
    (objobjargproc)0,           /* mp_ass_subscript */
};
9303
9304
9305/* Helpers for PyUnicode_Format() */
9306
9307static PyObject *
9308getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9309{
9310    Py_ssize_t argidx = *p_argidx;
9311    if (argidx < arglen) {
9312        (*p_argidx)++;
9313        if (arglen < 0)
9314            return args;
9315        else
9316            return PyTuple_GetItem(args, argidx);
9317    }
9318    PyErr_SetString(PyExc_TypeError,
9319                    "not enough arguments for format string");
9320    return NULL;
9321}
9322
9323/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9324
9325static PyObject *
9326formatfloat(PyObject *v, int flags, int prec, int type)
9327{
9328    char *p;
9329    PyObject *result;
9330    double x;
9331
9332    x = PyFloat_AsDouble(v);
9333    if (x == -1.0 && PyErr_Occurred())
9334        return NULL;
9335
9336    if (prec < 0)
9337        prec = 6;
9338
9339    p = PyOS_double_to_string(x, type, prec,
9340                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9341    if (p == NULL)
9342        return NULL;
9343    result = PyUnicode_FromStringAndSize(p, strlen(p));
9344    PyMem_Free(p);
9345    return result;
9346}
9347
9348static PyObject*
9349formatlong(PyObject *val, int flags, int prec, int type)
9350{
9351    char *buf;
9352    int len;
9353    PyObject *str; /* temporary string object. */
9354    PyObject *result;
9355
9356    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9357    if (!str)
9358        return NULL;
9359    result = PyUnicode_FromStringAndSize(buf, len);
9360    Py_DECREF(str);
9361    return result;
9362}
9363
9364static int
9365formatchar(Py_UNICODE *buf,
9366           size_t buflen,
9367           PyObject *v)
9368{
9369    /* presume that the buffer is at least 3 characters long */
9370    if (PyUnicode_Check(v)) {
9371        if (PyUnicode_GET_SIZE(v) == 1) {
9372            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9373            buf[1] = '\0';
9374            return 1;
9375        }
9376#ifndef Py_UNICODE_WIDE
9377        if (PyUnicode_GET_SIZE(v) == 2) {
9378            /* Decode a valid surrogate pair */
9379            int c0 = PyUnicode_AS_UNICODE(v)[0];
9380            int c1 = PyUnicode_AS_UNICODE(v)[1];
9381            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9382                0xDC00 <= c1 && c1 <= 0xDFFF) {
9383                buf[0] = c0;
9384                buf[1] = c1;
9385                buf[2] = '\0';
9386                return 2;
9387            }
9388        }
9389#endif
9390        goto onError;
9391    }
9392    else {
9393        /* Integer input truncated to a character */
9394        long x;
9395        x = PyLong_AsLong(v);
9396        if (x == -1 && PyErr_Occurred())
9397            goto onError;
9398
9399        if (x < 0 || x > 0x10ffff) {
9400            PyErr_SetString(PyExc_OverflowError,
9401                            "%c arg not in range(0x110000)");
9402            return -1;
9403        }
9404
9405#ifndef Py_UNICODE_WIDE
9406        if (x > 0xffff) {
9407            x -= 0x10000;
9408            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9409            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9410            return 2;
9411        }
9412#endif
9413        buf[0] = (Py_UNICODE) x;
9414        buf[1] = '\0';
9415        return 1;
9416    }
9417
9418  onError:
9419    PyErr_SetString(PyExc_TypeError,
9420                    "%c requires int or char");
9421    return -1;
9422}
9423
9424/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9425   FORMATBUFLEN is the length of the buffer in which chars are formatted.
9426*/
9427#define FORMATBUFLEN (size_t)10
9428
9429PyObject *PyUnicode_Format(PyObject *format,
9430                           PyObject *args)
9431{
9432    Py_UNICODE *fmt, *res;
9433    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9434    int args_owned = 0;
9435    PyUnicodeObject *result = NULL;
9436    PyObject *dict = NULL;
9437    PyObject *uformat;
9438
9439    if (format == NULL || args == NULL) {
9440        PyErr_BadInternalCall();
9441        return NULL;
9442    }
9443    uformat = PyUnicode_FromObject(format);
9444    if (uformat == NULL)
9445        return NULL;
9446    fmt = PyUnicode_AS_UNICODE(uformat);
9447    fmtcnt = PyUnicode_GET_SIZE(uformat);
9448
9449    reslen = rescnt = fmtcnt + 100;
9450    result = _PyUnicode_New(reslen);
9451    if (result == NULL)
9452        goto onError;
9453    res = PyUnicode_AS_UNICODE(result);
9454
9455    if (PyTuple_Check(args)) {
9456        arglen = PyTuple_Size(args);
9457        argidx = 0;
9458    }
9459    else {
9460        arglen = -1;
9461        argidx = -2;
9462    }
9463    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9464        !PyUnicode_Check(args))
9465        dict = args;
9466
9467    while (--fmtcnt >= 0) {
9468        if (*fmt != '%') {
9469            if (--rescnt < 0) {
9470                rescnt = fmtcnt + 100;
9471                reslen += rescnt;
9472                if (_PyUnicode_Resize(&result, reslen) < 0)
9473                    goto onError;
9474                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9475                --rescnt;
9476            }
9477            *res++ = *fmt++;
9478        }
9479        else {
9480            /* Got a format specifier */
9481            int flags = 0;
9482            Py_ssize_t width = -1;
9483            int prec = -1;
9484            Py_UNICODE c = '\0';
9485            Py_UNICODE fill;
9486            int isnumok;
9487            PyObject *v = NULL;
9488            PyObject *temp = NULL;
9489            Py_UNICODE *pbuf;
9490            Py_UNICODE sign;
9491            Py_ssize_t len;
9492            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9493
9494            fmt++;
9495            if (*fmt == '(') {
9496                Py_UNICODE *keystart;
9497                Py_ssize_t keylen;
9498                PyObject *key;
9499                int pcount = 1;
9500
9501                if (dict == NULL) {
9502                    PyErr_SetString(PyExc_TypeError,
9503                                    "format requires a mapping");
9504                    goto onError;
9505                }
9506                ++fmt;
9507                --fmtcnt;
9508                keystart = fmt;
9509                /* Skip over balanced parentheses */
9510                while (pcount > 0 && --fmtcnt >= 0) {
9511                    if (*fmt == ')')
9512                        --pcount;
9513                    else if (*fmt == '(')
9514                        ++pcount;
9515                    fmt++;
9516                }
9517                keylen = fmt - keystart - 1;
9518                if (fmtcnt < 0 || pcount > 0) {
9519                    PyErr_SetString(PyExc_ValueError,
9520                                    "incomplete format key");
9521                    goto onError;
9522                }
9523#if 0
9524                /* keys are converted to strings using UTF-8 and
9525                   then looked up since Python uses strings to hold
9526                   variables names etc. in its namespaces and we
9527                   wouldn't want to break common idioms. */
9528                key = PyUnicode_EncodeUTF8(keystart,
9529                                           keylen,
9530                                           NULL);
9531#else
9532                key = PyUnicode_FromUnicode(keystart, keylen);
9533#endif
9534                if (key == NULL)
9535                    goto onError;
9536                if (args_owned) {
9537                    Py_DECREF(args);
9538                    args_owned = 0;
9539                }
9540                args = PyObject_GetItem(dict, key);
9541                Py_DECREF(key);
9542                if (args == NULL) {
9543                    goto onError;
9544                }
9545                args_owned = 1;
9546                arglen = -1;
9547                argidx = -2;
9548            }
9549            while (--fmtcnt >= 0) {
9550                switch (c = *fmt++) {
9551                case '-': flags |= F_LJUST; continue;
9552                case '+': flags |= F_SIGN; continue;
9553                case ' ': flags |= F_BLANK; continue;
9554                case '#': flags |= F_ALT; continue;
9555                case '0': flags |= F_ZERO; continue;
9556                }
9557                break;
9558            }
9559            if (c == '*') {
9560                v = getnextarg(args, arglen, &argidx);
9561                if (v == NULL)
9562                    goto onError;
9563                if (!PyLong_Check(v)) {
9564                    PyErr_SetString(PyExc_TypeError,
9565                                    "* wants int");
9566                    goto onError;
9567                }
9568                width = PyLong_AsLong(v);
9569                if (width == -1 && PyErr_Occurred())
9570                    goto onError;
9571                if (width < 0) {
9572                    flags |= F_LJUST;
9573                    width = -width;
9574                }
9575                if (--fmtcnt >= 0)
9576                    c = *fmt++;
9577            }
9578            else if (c >= '0' && c <= '9') {
9579                width = c - '0';
9580                while (--fmtcnt >= 0) {
9581                    c = *fmt++;
9582                    if (c < '0' || c > '9')
9583                        break;
9584                    if ((width*10) / 10 != width) {
9585                        PyErr_SetString(PyExc_ValueError,
9586                                        "width too big");
9587                        goto onError;
9588                    }
9589                    width = width*10 + (c - '0');
9590                }
9591            }
9592            if (c == '.') {
9593                prec = 0;
9594                if (--fmtcnt >= 0)
9595                    c = *fmt++;
9596                if (c == '*') {
9597                    v = getnextarg(args, arglen, &argidx);
9598                    if (v == NULL)
9599                        goto onError;
9600                    if (!PyLong_Check(v)) {
9601                        PyErr_SetString(PyExc_TypeError,
9602                                        "* wants int");
9603                        goto onError;
9604                    }
9605                    prec = PyLong_AsLong(v);
9606                    if (prec == -1 && PyErr_Occurred())
9607                        goto onError;
9608                    if (prec < 0)
9609                        prec = 0;
9610                    if (--fmtcnt >= 0)
9611                        c = *fmt++;
9612                }
9613                else if (c >= '0' && c <= '9') {
9614                    prec = c - '0';
9615                    while (--fmtcnt >= 0) {
9616                        c = *fmt++;
9617                        if (c < '0' || c > '9')
9618                            break;
9619                        if ((prec*10) / 10 != prec) {
9620                            PyErr_SetString(PyExc_ValueError,
9621                                            "prec too big");
9622                            goto onError;
9623                        }
9624                        prec = prec*10 + (c - '0');
9625                    }
9626                }
9627            } /* prec */
9628            if (fmtcnt >= 0) {
9629                if (c == 'h' || c == 'l' || c == 'L') {
9630                    if (--fmtcnt >= 0)
9631                        c = *fmt++;
9632                }
9633            }
9634            if (fmtcnt < 0) {
9635                PyErr_SetString(PyExc_ValueError,
9636                                "incomplete format");
9637                goto onError;
9638            }
9639            if (c != '%') {
9640                v = getnextarg(args, arglen, &argidx);
9641                if (v == NULL)
9642                    goto onError;
9643            }
9644            sign = 0;
9645            fill = ' ';
9646            switch (c) {
9647
9648            case '%':
9649                pbuf = formatbuf;
9650                /* presume that buffer length is at least 1 */
9651                pbuf[0] = '%';
9652                len = 1;
9653                break;
9654
9655            case 's':
9656            case 'r':
9657            case 'a':
9658                if (PyUnicode_CheckExact(v) && c == 's') {
9659                    temp = v;
9660                    Py_INCREF(temp);
9661                }
9662                else {
9663                    if (c == 's')
9664                        temp = PyObject_Str(v);
9665                    else if (c == 'r')
9666                        temp = PyObject_Repr(v);
9667                    else
9668                        temp = PyObject_ASCII(v);
9669                    if (temp == NULL)
9670                        goto onError;
9671                    if (PyUnicode_Check(temp))
9672                        /* nothing to do */;
9673                    else {
9674                        Py_DECREF(temp);
9675                        PyErr_SetString(PyExc_TypeError,
9676                                        "%s argument has non-string str()");
9677                        goto onError;
9678                    }
9679                }
9680                pbuf = PyUnicode_AS_UNICODE(temp);
9681                len = PyUnicode_GET_SIZE(temp);
9682                if (prec >= 0 && len > prec)
9683                    len = prec;
9684                break;
9685
9686            case 'i':
9687            case 'd':
9688            case 'u':
9689            case 'o':
9690            case 'x':
9691            case 'X':
9692                isnumok = 0;
9693                if (PyNumber_Check(v)) {
9694                    PyObject *iobj=NULL;
9695
9696                    if (PyLong_Check(v)) {
9697                        iobj = v;
9698                        Py_INCREF(iobj);
9699                    }
9700                    else {
9701                        iobj = PyNumber_Long(v);
9702                    }
9703                    if (iobj!=NULL) {
9704                        if (PyLong_Check(iobj)) {
9705                            isnumok = 1;
9706                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
9707                            Py_DECREF(iobj);
9708                            if (!temp)
9709                                goto onError;
9710                            pbuf = PyUnicode_AS_UNICODE(temp);
9711                            len = PyUnicode_GET_SIZE(temp);
9712                            sign = 1;
9713                        }
9714                        else {
9715                            Py_DECREF(iobj);
9716                        }
9717                    }
9718                }
9719                if (!isnumok) {
9720                    PyErr_Format(PyExc_TypeError,
9721                                 "%%%c format: a number is required, "
9722                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9723                    goto onError;
9724                }
9725                if (flags & F_ZERO)
9726                    fill = '0';
9727                break;
9728
9729            case 'e':
9730            case 'E':
9731            case 'f':
9732            case 'F':
9733            case 'g':
9734            case 'G':
9735                temp = formatfloat(v, flags, prec, c);
9736                if (!temp)
9737                    goto onError;
9738                pbuf = PyUnicode_AS_UNICODE(temp);
9739                len = PyUnicode_GET_SIZE(temp);
9740                sign = 1;
9741                if (flags & F_ZERO)
9742                    fill = '0';
9743                break;
9744
9745            case 'c':
9746                pbuf = formatbuf;
9747                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9748                if (len < 0)
9749                    goto onError;
9750                break;
9751
9752            default:
9753                PyErr_Format(PyExc_ValueError,
9754                             "unsupported format character '%c' (0x%x) "
9755                             "at index %zd",
9756                             (31<=c && c<=126) ? (char)c : '?',
9757                             (int)c,
9758                             (Py_ssize_t)(fmt - 1 -
9759                                          PyUnicode_AS_UNICODE(uformat)));
9760                goto onError;
9761            }
9762            if (sign) {
9763                if (*pbuf == '-' || *pbuf == '+') {
9764                    sign = *pbuf++;
9765                    len--;
9766                }
9767                else if (flags & F_SIGN)
9768                    sign = '+';
9769                else if (flags & F_BLANK)
9770                    sign = ' ';
9771                else
9772                    sign = 0;
9773            }
9774            if (width < len)
9775                width = len;
9776            if (rescnt - (sign != 0) < width) {
9777                reslen -= rescnt;
9778                rescnt = width + fmtcnt + 100;
9779                reslen += rescnt;
9780                if (reslen < 0) {
9781                    Py_XDECREF(temp);
9782                    PyErr_NoMemory();
9783                    goto onError;
9784                }
9785                if (_PyUnicode_Resize(&result, reslen) < 0) {
9786                    Py_XDECREF(temp);
9787                    goto onError;
9788                }
9789                res = PyUnicode_AS_UNICODE(result)
9790                    + reslen - rescnt;
9791            }
9792            if (sign) {
9793                if (fill != ' ')
9794                    *res++ = sign;
9795                rescnt--;
9796                if (width > len)
9797                    width--;
9798            }
9799            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9800                assert(pbuf[0] == '0');
9801                assert(pbuf[1] == c);
9802                if (fill != ' ') {
9803                    *res++ = *pbuf++;
9804                    *res++ = *pbuf++;
9805                }
9806                rescnt -= 2;
9807                width -= 2;
9808                if (width < 0)
9809                    width = 0;
9810                len -= 2;
9811            }
9812            if (width > len && !(flags & F_LJUST)) {
9813                do {
9814                    --rescnt;
9815                    *res++ = fill;
9816                } while (--width > len);
9817            }
9818            if (fill == ' ') {
9819                if (sign)
9820                    *res++ = sign;
9821                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9822                    assert(pbuf[0] == '0');
9823                    assert(pbuf[1] == c);
9824                    *res++ = *pbuf++;
9825                    *res++ = *pbuf++;
9826                }
9827            }
9828            Py_UNICODE_COPY(res, pbuf, len);
9829            res += len;
9830            rescnt -= len;
9831            while (--width >= len) {
9832                --rescnt;
9833                *res++ = ' ';
9834            }
9835            if (dict && (argidx < arglen) && c != '%') {
9836                PyErr_SetString(PyExc_TypeError,
9837                                "not all arguments converted during string formatting");
9838                Py_XDECREF(temp);
9839                goto onError;
9840            }
9841            Py_XDECREF(temp);
9842        } /* '%' */
9843    } /* until end */
9844    if (argidx < arglen && !dict) {
9845        PyErr_SetString(PyExc_TypeError,
9846                        "not all arguments converted during string formatting");
9847        goto onError;
9848    }
9849
9850    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9851        goto onError;
9852    if (args_owned) {
9853        Py_DECREF(args);
9854    }
9855    Py_DECREF(uformat);
9856    return (PyObject *)result;
9857
9858  onError:
9859    Py_XDECREF(result);
9860    Py_DECREF(uformat);
9861    if (args_owned) {
9862        Py_DECREF(args);
9863    }
9864    return NULL;
9865}
9866
9867static PyObject *
9868unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9869
9870static PyObject *
9871unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9872{
9873    PyObject *x = NULL;
9874    static char *kwlist[] = {"object", "encoding", "errors", 0};
9875    char *encoding = NULL;
9876    char *errors = NULL;
9877
9878    if (type != &PyUnicode_Type)
9879        return unicode_subtype_new(type, args, kwds);
9880    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9881                                     kwlist, &x, &encoding, &errors))
9882        return NULL;
9883    if (x == NULL)
9884        return (PyObject *)_PyUnicode_New(0);
9885    if (encoding == NULL && errors == NULL)
9886        return PyObject_Str(x);
9887    else
9888        return PyUnicode_FromEncodedObject(x, encoding, errors);
9889}
9890
9891static PyObject *
9892unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9893{
9894    PyUnicodeObject *tmp, *pnew;
9895    Py_ssize_t n;
9896
9897    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9898    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9899    if (tmp == NULL)
9900        return NULL;
9901    assert(PyUnicode_Check(tmp));
9902    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9903    if (pnew == NULL) {
9904        Py_DECREF(tmp);
9905        return NULL;
9906    }
9907    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9908    if (pnew->str == NULL) {
9909        _Py_ForgetReference((PyObject *)pnew);
9910        PyObject_Del(pnew);
9911        Py_DECREF(tmp);
9912        return PyErr_NoMemory();
9913    }
9914    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9915    pnew->length = n;
9916    pnew->hash = tmp->hash;
9917    Py_DECREF(tmp);
9918    return (PyObject *)pnew;
9919}
9920
/* Docstring of the str type; referenced from the tp_doc slot of
   PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
9927
9928static PyObject *unicode_iter(PyObject *seq);
9929
/* Type object for str.  The initializer is positional, so the entries
   must stay in PyTypeObject slot order; a 0 means the slot is not
   provided by this type.  The number, sequence and mapping tables and
   the method table referenced here are the static tables defined
   earlier in this file. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
9973
9974/* Initialize the Unicode implementation */
9975
9976void _PyUnicode_Init(void)
9977{
9978    int i;
9979
9980    /* XXX - move this array to unicodectype.c ? */
9981    Py_UNICODE linebreak[] = {
9982        0x000A, /* LINE FEED */
9983        0x000D, /* CARRIAGE RETURN */
9984        0x001C, /* FILE SEPARATOR */
9985        0x001D, /* GROUP SEPARATOR */
9986        0x001E, /* RECORD SEPARATOR */
9987        0x0085, /* NEXT LINE */
9988        0x2028, /* LINE SEPARATOR */
9989        0x2029, /* PARAGRAPH SEPARATOR */
9990    };
9991
9992    /* Init the implementation */
9993    free_list = NULL;
9994    numfree = 0;
9995    unicode_empty = _PyUnicode_New(0);
9996    if (!unicode_empty)
9997        return;
9998
9999    for (i = 0; i < 256; i++)
10000        unicode_latin1[i] = NULL;
10001    if (PyType_Ready(&PyUnicode_Type) < 0)
10002        Py_FatalError("Can't initialize 'unicode'");
10003
10004    /* initialize the linebreak bloom filter */
10005    bloom_linebreak = make_bloom_mask(
10006        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10007        );
10008
10009    PyType_Ready(&EncodingMapType);
10010}
10011
10012/* Finalize the Unicode implementation */
10013
10014int
10015PyUnicode_ClearFreeList(void)
10016{
10017    int freelist_size = numfree;
10018    PyUnicodeObject *u;
10019
10020    for (u = free_list; u != NULL;) {
10021        PyUnicodeObject *v = u;
10022        u = *(PyUnicodeObject **)u;
10023        if (v->str)
10024            PyObject_DEL(v->str);
10025        Py_XDECREF(v->defenc);
10026        PyObject_Del(v);
10027        numfree--;
10028    }
10029    free_list = NULL;
10030    assert(numfree == 0);
10031    return freelist_size;
10032}
10033
10034void
10035_PyUnicode_Fini(void)
10036{
10037    int i;
10038
10039    Py_XDECREF(unicode_empty);
10040    unicode_empty = NULL;
10041
10042    for (i = 0; i < 256; i++) {
10043        if (unicode_latin1[i]) {
10044            Py_DECREF(unicode_latin1[i]);
10045            unicode_latin1[i] = NULL;
10046        }
10047    }
10048    (void)PyUnicode_ClearFreeList();
10049}
10050
10051void
10052PyUnicode_InternInPlace(PyObject **p)
10053{
10054    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
10055    PyObject *t;
10056    if (s == NULL || !PyUnicode_Check(s))
10057        Py_FatalError(
10058            "PyUnicode_InternInPlace: unicode strings only please!");
10059    /* If it's a subclass, we don't really know what putting
10060       it in the interned dict might do. */
10061    if (!PyUnicode_CheckExact(s))
10062        return;
10063    if (PyUnicode_CHECK_INTERNED(s))
10064        return;
10065    if (interned == NULL) {
10066        interned = PyDict_New();
10067        if (interned == NULL) {
10068            PyErr_Clear(); /* Don't leave an exception */
10069            return;
10070        }
10071    }
10072    /* It might be that the GetItem call fails even
10073       though the key is present in the dictionary,
10074       namely when this happens during a stack overflow. */
10075    Py_ALLOW_RECURSION
10076        t = PyDict_GetItem(interned, (PyObject *)s);
10077    Py_END_ALLOW_RECURSION
10078
10079        if (t) {
10080            Py_INCREF(t);
10081            Py_DECREF(*p);
10082            *p = t;
10083            return;
10084        }
10085
10086    PyThreadState_GET()->recursion_critical = 1;
10087    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
10088        PyErr_Clear();
10089        PyThreadState_GET()->recursion_critical = 0;
10090        return;
10091    }
10092    PyThreadState_GET()->recursion_critical = 0;
10093    /* The two references in interned are not counted by refcnt.
10094       The deallocator will take care of this */
10095    Py_REFCNT(s) -= 2;
10096    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
10097}
10098
10099void
10100PyUnicode_InternImmortal(PyObject **p)
10101{
10102    PyUnicode_InternInPlace(p);
10103    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10104        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10105        Py_INCREF(*p);
10106    }
10107}
10108
10109PyObject *
10110PyUnicode_InternFromString(const char *cp)
10111{
10112    PyObject *s = PyUnicode_FromString(cp);
10113    if (s == NULL)
10114        return NULL;
10115    PyUnicode_InternInPlace(&s);
10116    return s;
10117}
10118
10119void _Py_ReleaseInternedUnicodeStrings(void)
10120{
10121    PyObject *keys;
10122    PyUnicodeObject *s;
10123    Py_ssize_t i, n;
10124    Py_ssize_t immortal_size = 0, mortal_size = 0;
10125
10126    if (interned == NULL || !PyDict_Check(interned))
10127        return;
10128    keys = PyDict_Keys(interned);
10129    if (keys == NULL || !PyList_Check(keys)) {
10130        PyErr_Clear();
10131        return;
10132    }
10133
10134    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10135       detector, interned unicode strings are not forcibly deallocated;
10136       rather, we give them their stolen references back, and then clear
10137       and DECREF the interned dict. */
10138
10139    n = PyList_GET_SIZE(keys);
10140    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
10141            n);
10142    for (i = 0; i < n; i++) {
10143        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10144        switch (s->state) {
10145        case SSTATE_NOT_INTERNED:
10146            /* XXX Shouldn't happen */
10147            break;
10148        case SSTATE_INTERNED_IMMORTAL:
10149            Py_REFCNT(s) += 1;
10150            immortal_size += s->length;
10151            break;
10152        case SSTATE_INTERNED_MORTAL:
10153            Py_REFCNT(s) += 2;
10154            mortal_size += s->length;
10155            break;
10156        default:
10157            Py_FatalError("Inconsistent interned string state.");
10158        }
10159        s->state = SSTATE_NOT_INTERNED;
10160    }
10161    fprintf(stderr, "total size of all interned strings: "
10162            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10163            "mortal/immortal\n", mortal_size, immortal_size);
10164    Py_DECREF(keys);
10165    PyDict_Clear(interned);
10166    Py_DECREF(interned);
10167    interned = NULL;
10168}
10169
10170
10171/********************* Unicode Iterator **************************/
10172
10173typedef struct {
10174    PyObject_HEAD
10175    Py_ssize_t it_index;
10176    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
10177} unicodeiterobject;
10178
10179static void
10180unicodeiter_dealloc(unicodeiterobject *it)
10181{
10182    _PyObject_GC_UNTRACK(it);
10183    Py_XDECREF(it->it_seq);
10184    PyObject_GC_Del(it);
10185}
10186
10187static int
10188unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10189{
10190    Py_VISIT(it->it_seq);
10191    return 0;
10192}
10193
10194static PyObject *
10195unicodeiter_next(unicodeiterobject *it)
10196{
10197    PyUnicodeObject *seq;
10198    PyObject *item;
10199
10200    assert(it != NULL);
10201    seq = it->it_seq;
10202    if (seq == NULL)
10203        return NULL;
10204    assert(PyUnicode_Check(seq));
10205
10206    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10207        item = PyUnicode_FromUnicode(
10208            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10209        if (item != NULL)
10210            ++it->it_index;
10211        return item;
10212    }
10213
10214    Py_DECREF(seq);
10215    it->it_seq = NULL;
10216    return NULL;
10217}
10218
10219static PyObject *
10220unicodeiter_len(unicodeiterobject *it)
10221{
10222    Py_ssize_t len = 0;
10223    if (it->it_seq)
10224        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10225    return PyLong_FromSsize_t(len);
10226}
10227
10228PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10229
10230static PyMethodDef unicodeiter_methods[] = {
10231    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
10232     length_hint_doc},
10233    {NULL,      NULL}       /* sentinel */
10234};
10235
10236PyTypeObject PyUnicodeIter_Type = {
10237    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10238    "str_iterator",         /* tp_name */
10239    sizeof(unicodeiterobject),      /* tp_basicsize */
10240    0,                  /* tp_itemsize */
10241    /* methods */
10242    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
10243    0,                  /* tp_print */
10244    0,                  /* tp_getattr */
10245    0,                  /* tp_setattr */
10246    0,                  /* tp_reserved */
10247    0,                  /* tp_repr */
10248    0,                  /* tp_as_number */
10249    0,                  /* tp_as_sequence */
10250    0,                  /* tp_as_mapping */
10251    0,                  /* tp_hash */
10252    0,                  /* tp_call */
10253    0,                  /* tp_str */
10254    PyObject_GenericGetAttr,        /* tp_getattro */
10255    0,                  /* tp_setattro */
10256    0,                  /* tp_as_buffer */
10257    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10258    0,                  /* tp_doc */
10259    (traverseproc)unicodeiter_traverse, /* tp_traverse */
10260    0,                  /* tp_clear */
10261    0,                  /* tp_richcompare */
10262    0,                  /* tp_weaklistoffset */
10263    PyObject_SelfIter,          /* tp_iter */
10264    (iternextfunc)unicodeiter_next,     /* tp_iternext */
10265    unicodeiter_methods,            /* tp_methods */
10266    0,
10267};
10268
10269static PyObject *
10270unicode_iter(PyObject *seq)
10271{
10272    unicodeiterobject *it;
10273
10274    if (!PyUnicode_Check(seq)) {
10275        PyErr_BadInternalCall();
10276        return NULL;
10277    }
10278    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10279    if (it == NULL)
10280        return NULL;
10281    it->it_index = 0;
10282    Py_INCREF(seq);
10283    it->it_seq = (PyUnicodeObject *)seq;
10284    _PyObject_GC_TRACK(it);
10285    return (PyObject *)it;
10286}
10287
10288size_t
10289Py_UNICODE_strlen(const Py_UNICODE *u)
10290{
10291    int res = 0;
10292    while(*u++)
10293        res++;
10294    return res;
10295}
10296
10297Py_UNICODE*
10298Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10299{
10300    Py_UNICODE *u = s1;
10301    while ((*u++ = *s2++));
10302    return s1;
10303}
10304
10305Py_UNICODE*
10306Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10307{
10308    Py_UNICODE *u = s1;
10309    while ((*u++ = *s2++))
10310        if (n-- == 0)
10311            break;
10312    return s1;
10313}
10314
10315Py_UNICODE*
10316Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10317{
10318    Py_UNICODE *u1 = s1;
10319    u1 += Py_UNICODE_strlen(u1);
10320    Py_UNICODE_strcpy(u1, s2);
10321    return s1;
10322}
10323
10324int
10325Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10326{
10327    while (*s1 && *s2 && *s1 == *s2)
10328        s1++, s2++;
10329    if (*s1 && *s2)
10330        return (*s1 < *s2) ? -1 : +1;
10331    if (*s1)
10332        return 1;
10333    if (*s2)
10334        return -1;
10335    return 0;
10336}
10337
10338int
10339Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10340{
10341    register Py_UNICODE u1, u2;
10342    for (; n != 0; n--) {
10343        u1 = *s1;
10344        u2 = *s2;
10345        if (u1 != u2)
10346            return (u1 < u2) ? -1 : +1;
10347        if (u1 == '\0')
10348            return 0;
10349        s1++;
10350        s2++;
10351    }
10352    return 0;
10353}
10354
10355Py_UNICODE*
10356Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10357{
10358    const Py_UNICODE *p;
10359    for (p = s; *p; p++)
10360        if (*p == c)
10361            return (Py_UNICODE*)p;
10362    return NULL;
10363}
10364
10365Py_UNICODE*
10366Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10367{
10368    const Py_UNICODE *p;
10369    p = s + Py_UNICODE_strlen(s);
10370    while (p != s) {
10371        p--;
10372        if (*p == c)
10373            return (Py_UNICODE*)p;
10374    }
10375    return NULL;
10376}
10377
10378Py_UNICODE*
10379PyUnicode_AsUnicodeCopy(PyObject *object)
10380{
10381    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10382    Py_UNICODE *copy;
10383    Py_ssize_t size;
10384
10385    /* Ensure we won't overflow the size. */
10386    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10387        PyErr_NoMemory();
10388        return NULL;
10389    }
10390    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10391    size *= sizeof(Py_UNICODE);
10392    copy = PyMem_Malloc(size);
10393    if (copy == NULL) {
10394        PyErr_NoMemory();
10395        return NULL;
10396    }
10397    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10398    return copy;
10399}
10400
10401/* A _string module, to export formatter_parser and formatter_field_name_split
10402   to the string.Formatter class implemented in Python. */
10403
10404static PyMethodDef _string_methods[] = {
10405    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10406     METH_O, PyDoc_STR("split the argument as a field name")},
10407    {"formatter_parser", (PyCFunction) formatter_parser,
10408     METH_O, PyDoc_STR("parse the argument as a format string")},
10409    {NULL, NULL}
10410};
10411
10412static struct PyModuleDef _string_module = {
10413    PyModuleDef_HEAD_INIT,
10414    "_string",
10415    PyDoc_STR("string helper module"),
10416    0,
10417    _string_methods,
10418    NULL,
10419    NULL,
10420    NULL,
10421    NULL
10422};
10423
10424PyMODINIT_FUNC
10425PyInit__string(void)
10426{
10427    return PyModule_Create(&_string_module);
10428}
10429
10430
10431#ifdef __cplusplus
10432}
10433#endif
10434