unicodeobject.c revision 020340f2841ec2b70b9e09921850d16019d0667e
1/*
2
3Unicode implementation based on original code by Fredrik Lundh,
4modified by Marc-Andre Lemburg <mal@lemburg.com> according to the
5Unicode Integration Proposal (see file Misc/unicode.txt).
6
7Major speed upgrades to the method implementations at the Reykjavik
8NeedForSpeed sprint, by Fredrik Lundh and Andrew Dalke.
9
10Copyright (c) Corporation for National Research Initiatives.
11
12--------------------------------------------------------------------
13The original string type implementation is:
14
15  Copyright (c) 1999 by Secret Labs AB
16  Copyright (c) 1999 by Fredrik Lundh
17
18By obtaining, using, and/or copying this software and/or its
19associated documentation, you agree that you have read, understood,
20and will comply with the following terms and conditions:
21
22Permission to use, copy, modify, and distribute this software and its
23associated documentation for any purpose and without fee is hereby
24granted, provided that the above copyright notice appears in all
25copies, and that both that copyright notice and this permission notice
26appear in supporting documentation, and that the name of Secret Labs
27AB or the author not be used in advertising or publicity pertaining to
28distribution of the software without specific, written prior
29permission.
30
31SECRET LABS AB AND THE AUTHOR DISCLAIMS ALL WARRANTIES WITH REGARD TO
32THIS SOFTWARE, INCLUDING ALL IMPLIED WARRANTIES OF MERCHANTABILITY AND
33FITNESS.  IN NO EVENT SHALL SECRET LABS AB OR THE AUTHOR BE LIABLE FOR
34ANY SPECIAL, INDIRECT OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
35WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
36ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
37OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
38--------------------------------------------------------------------
39
40*/
41
42#define PY_SSIZE_T_CLEAN
43#include "Python.h"
44#include "ucnhash.h"
45
46#ifdef MS_WINDOWS
47#include <windows.h>
48#endif
49
50/* Limit for the Unicode object free list */
51
52#define PyUnicode_MAXFREELIST       1024
53
54/* Limit for the Unicode object free list stay alive optimization.
55
56   The implementation will keep allocated Unicode memory intact for
57   all objects on the free list having a size less than this
58   limit. This reduces malloc() overhead for small Unicode objects.
59
60   At worst this will result in PyUnicode_MAXFREELIST *
61   (sizeof(PyUnicodeObject) + KEEPALIVE_SIZE_LIMIT +
62   malloc()-overhead) bytes of unused garbage.
63
64   Setting the limit to 0 effectively turns the feature off.
65
66   Note: This is an experimental feature ! If you get core dumps when
67   using Unicode objects, turn this feature off.
68
69*/
70
71#define KEEPALIVE_SIZE_LIMIT       9
72
73/* Endianness switches; defaults to little endian */
74
75#ifdef WORDS_BIGENDIAN
76# define BYTEORDER_IS_BIG_ENDIAN
77#else
78# define BYTEORDER_IS_LITTLE_ENDIAN
79#endif
80
81/* --- Globals ------------------------------------------------------------
82
83   The globals are initialized by the _PyUnicode_Init() API and should
84   not be used before calling that API.
85
86*/
87
88
89#ifdef __cplusplus
90extern "C" {
91#endif
92
93/* This dictionary holds all interned unicode strings.  Note that references
94   to strings in this dictionary are *not* counted in the string's ob_refcnt.
95   When the interned string reaches a refcnt of 0 the string deallocation
96   function will delete the reference from this dictionary.
97
   Another way to look at this is to say that the actual reference
   count of a string is:  s->ob_refcnt + (s->state ? 2 : 0)
100*/
static PyObject *interned;

/* Free list for Unicode objects.  The list is singly linked: the first
   pointer-sized slot of each parked object stores the next entry (see
   _PyUnicode_New() and unicode_dealloc()).  numfree counts the parked
   objects; unicode_dealloc() only parks an object while numfree is
   below PyUnicode_MAXFREELIST. */
static PyUnicodeObject *free_list;
static int numfree;

/* The empty Unicode object is shared to improve performance.  While this
   is still NULL the constructors below allocate a fresh zero-length
   object instead of returning the shared one. */
static PyUnicodeObject *unicode_empty;

/* Single character Unicode strings in the Latin-1 range are being
   shared as well.  The array is indexed by code point; entries start out
   NULL and are filled in lazily by PyUnicode_FromUnicode() and
   PyUnicode_FromStringAndSize(). */
static PyUnicodeObject *unicode_latin1[256];
113
/* Lookup table covering the ASCII range (128 entries): entry i is 1 when
   code point i is one of the frequently used whitespace characters and 0
   otherwise.  Each row below holds 16 consecutive code points. */
const unsigned char _Py_ascii_whitespace[] = {
    /* 0x00-0x0F: 0x09 CHARACTER TABULATION, 0x0A LINE FEED,
                  0x0B LINE TABULATION, 0x0C FORM FEED,
                  0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 1, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR, 0x1F UNIT SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 1,
    /* 0x20-0x2F: 0x20 SPACE */
    1, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    /* 0x30-0x7F: no whitespace characters */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
144
/* Forward declaration.  The function is static, so its definition lives
   elsewhere in this translation unit; the prototype lets code that
   precedes the definition call it.
   NOTE(review): judging by the name and parameters this invokes the
   encode error handler selected by `errors` for the input range
   [startpos, endpos) and reports the resume position through `newpos`
   -- confirm against the definition. */
static PyObject *
unicode_encode_call_errorhandler(const char *errors,
       PyObject **errorHandler,const char *encoding, const char *reason,
       const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
       Py_ssize_t startpos, Py_ssize_t endpos, Py_ssize_t *newpos);

/* Forward declaration of another static helper defined elsewhere in this
   translation unit.
   NOTE(review): presumably builds and raises the encode error for the
   range [startpos, endpos) of `unicode` -- confirm against the
   definition. */
static void
raise_encode_exception(PyObject **exceptionObject,
		       const char *encoding,
		       const Py_UNICODE *unicode, Py_ssize_t size,
		       Py_ssize_t startpos, Py_ssize_t endpos,
		       const char *reason);
157
/* Lookup table covering the ASCII range (128 entries): entry i is 1 when
   code point i is treated as a line break and 0 otherwise.  Each row
   below holds 16 consecutive code points. */
static unsigned char ascii_linebreak[] = {
    /* 0x00-0x0F: 0x0A LINE FEED, 0x0B LINE TABULATION,
                  0x0C FORM FEED, 0x0D CARRIAGE RETURN */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 1, 1, 1, 1, 0, 0,
    /* 0x10-0x1F: 0x1C FILE SEPARATOR, 0x1D GROUP SEPARATOR,
                  0x1E RECORD SEPARATOR */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 1, 1, 1, 0,
    /* 0x20-0x7F: no line break characters */
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0,   0, 0, 0, 0, 0, 0, 0, 0
};
185
186
187Py_UNICODE
188PyUnicode_GetMax(void)
189{
190#ifdef Py_UNICODE_WIDE
191    return 0x10FFFF;
192#else
193    /* This is actually an illegal character, so it should
194       not be passed to unichr. */
195    return 0xFFFF;
196#endif
197}
198
199/* --- Bloom Filters ----------------------------------------------------- */
200
/* Helpers implementing simple "bloom filters" for Unicode characters.
   To keep things simple, we use a single bitmask; the low
   log2(BLOOM_WIDTH) bits of each Unicode character select the bit. */
204
205/* the linebreak mask is set up by Unicode_Init below */
206
207#if LONG_BIT >= 128
208#define BLOOM_WIDTH 128
209#elif LONG_BIT >= 64
210#define BLOOM_WIDTH 64
211#elif LONG_BIT >= 32
212#define BLOOM_WIDTH 32
213#else
214#error "LONG_BIT is smaller than 32"
215#endif
216
217#define BLOOM_MASK unsigned long
218
219static BLOOM_MASK bloom_linebreak;
220
221#define BLOOM_ADD(mask, ch) ((mask |= (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
222#define BLOOM(mask, ch)     ((mask &  (1UL << ((ch) & (BLOOM_WIDTH - 1)))))
223
224#define BLOOM_LINEBREAK(ch)                                             \
225    ((ch) < 128U ? ascii_linebreak[(ch)] :                              \
226     (BLOOM(bloom_linebreak, (ch)) && Py_UNICODE_ISLINEBREAK(ch)))
227
228Py_LOCAL_INLINE(BLOOM_MASK)
229make_bloom_mask(Py_UNICODE* ptr, Py_ssize_t len)
230{
231    /* calculate simple bloom-style bitmask for a given unicode string */
232
233    BLOOM_MASK mask;
234    Py_ssize_t i;
235
236    mask = 0;
237    for (i = 0; i < len; i++)
238        BLOOM_ADD(mask, ptr[i]);
239
240    return mask;
241}
242
243Py_LOCAL_INLINE(int)
244unicode_member(Py_UNICODE chr, Py_UNICODE* set, Py_ssize_t setlen)
245{
246    Py_ssize_t i;
247
248    for (i = 0; i < setlen; i++)
249        if (set[i] == chr)
250            return 1;
251
252    return 0;
253}
254
/* Nonzero when `chr` occurs in the character array `set` of length
   `setlen`.  The bloom `mask` is checked first so that most non-members
   are rejected without scanning the array.
   The arguments and the whole expansion are parenthesized so the macro
   behaves like a single expression (e.g. it can be negated safely).
   Note: `chr` may be evaluated more than once. */
#define BLOOM_MEMBER(mask, chr, set, setlen)                    \
    (BLOOM((mask), (chr)) && unicode_member((chr), (set), (setlen)))
257
258/* --- Unicode Object ----------------------------------------------------- */
259
260static int
261unicode_resize(register PyUnicodeObject *unicode,
262	       Py_ssize_t length)
263{
264    void *oldstr;
265
266    /* Shortcut if there's nothing much to do. */
267    if (unicode->length == length)
268        goto reset;
269
270    /* Resizing shared object (unicode_empty or single character
271       objects) in-place is not allowed. Use PyUnicode_Resize()
272       instead ! */
273
274    if (unicode == unicode_empty ||
275        (unicode->length == 1 &&
276         unicode->str[0] < 256U &&
277         unicode_latin1[unicode->str[0]] == unicode)) {
278        PyErr_SetString(PyExc_SystemError,
279                        "can't resize shared str objects");
280        return -1;
281    }
282
283    /* We allocate one more byte to make sure the string is Ux0000 terminated.
284       The overallocation is also used by fastsearch, which assumes that it's
285       safe to look at str[length] (without making any assumptions about what
286       it contains). */
287
288    oldstr = unicode->str;
289    unicode->str = PyObject_REALLOC(unicode->str,
290                                    sizeof(Py_UNICODE) * (length + 1));
291    if (!unicode->str) {
292        unicode->str = (Py_UNICODE *)oldstr;
293        PyErr_NoMemory();
294        return -1;
295    }
296    unicode->str[length] = 0;
297    unicode->length = length;
298
299  reset:
300    /* Reset the object caches */
301    if (unicode->defenc) {
302        Py_CLEAR(unicode->defenc);
303    }
304    unicode->hash = -1;
305
306    return 0;
307}
308
/* We allocate one more Py_UNICODE element to make sure the string is
   Ux0000 terminated; some code (e.g. new_identifier)
   relies on that.
312
313   XXX This allocator could further be enhanced by assuring that the
314   free list never reduces its size below 1.
315
316*/
317
318static PyUnicodeObject *
319_PyUnicode_New(Py_ssize_t length)
320{
321    register PyUnicodeObject *unicode;
322
323    /* Optimization for empty strings */
324    if (length == 0 && unicode_empty != NULL) {
325        Py_INCREF(unicode_empty);
326        return unicode_empty;
327    }
328
329    /* Ensure we won't overflow the size. */
330    if (length > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
331        return (PyUnicodeObject *)PyErr_NoMemory();
332    }
333
334    /* Unicode freelist & memory allocation */
335    if (free_list) {
336        unicode = free_list;
337        free_list = *(PyUnicodeObject **)unicode;
338        numfree--;
339        if (unicode->str) {
340            /* Keep-Alive optimization: we only upsize the buffer,
341               never downsize it. */
342            if ((unicode->length < length) &&
343                unicode_resize(unicode, length) < 0) {
344                PyObject_DEL(unicode->str);
345                unicode->str = NULL;
346            }
347        }
348        else {
349            size_t new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
350            unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
351        }
352        PyObject_INIT(unicode, &PyUnicode_Type);
353    }
354    else {
355        size_t new_size;
356        unicode = PyObject_New(PyUnicodeObject, &PyUnicode_Type);
357        if (unicode == NULL)
358            return NULL;
359        new_size = sizeof(Py_UNICODE) * ((size_t)length + 1);
360        unicode->str = (Py_UNICODE*) PyObject_MALLOC(new_size);
361    }
362
363    if (!unicode->str) {
364        PyErr_NoMemory();
365        goto onError;
366    }
367    /* Initialize the first element to guard against cases where
368     * the caller fails before initializing str -- unicode_resize()
369     * reads str[0], and the Keep-Alive optimization can keep memory
370     * allocated for str alive across a call to unicode_dealloc(unicode).
371     * We don't want unicode_resize to read uninitialized memory in
372     * that case.
373     */
374    unicode->str[0] = 0;
375    unicode->str[length] = 0;
376    unicode->length = length;
377    unicode->hash = -1;
378    unicode->state = 0;
379    unicode->defenc = NULL;
380    return unicode;
381
382  onError:
383    /* XXX UNREF/NEWREF interface should be more symmetrical */
384    _Py_DEC_REFTOTAL;
385    _Py_ForgetReference((PyObject *)unicode);
386    PyObject_Del(unicode);
387    return NULL;
388}
389
390static void
391unicode_dealloc(register PyUnicodeObject *unicode)
392{
393    switch (PyUnicode_CHECK_INTERNED(unicode)) {
394    case SSTATE_NOT_INTERNED:
395        break;
396
397    case SSTATE_INTERNED_MORTAL:
398        /* revive dead object temporarily for DelItem */
399        Py_REFCNT(unicode) = 3;
400        if (PyDict_DelItem(interned, (PyObject *)unicode) != 0)
401            Py_FatalError(
402                "deletion of interned string failed");
403        break;
404
405    case SSTATE_INTERNED_IMMORTAL:
406        Py_FatalError("Immortal interned string died.");
407
408    default:
409        Py_FatalError("Inconsistent interned string state.");
410    }
411
412    if (PyUnicode_CheckExact(unicode) &&
413        numfree < PyUnicode_MAXFREELIST) {
414        /* Keep-Alive optimization */
415        if (unicode->length >= KEEPALIVE_SIZE_LIMIT) {
416            PyObject_DEL(unicode->str);
417            unicode->str = NULL;
418            unicode->length = 0;
419        }
420        if (unicode->defenc) {
421            Py_CLEAR(unicode->defenc);
422        }
423        /* Add to free list */
424        *(PyUnicodeObject **)unicode = free_list;
425        free_list = unicode;
426        numfree++;
427    }
428    else {
429        PyObject_DEL(unicode->str);
430        Py_XDECREF(unicode->defenc);
431        Py_TYPE(unicode)->tp_free((PyObject *)unicode);
432    }
433}
434
435static int
436_PyUnicode_Resize(PyUnicodeObject **unicode, Py_ssize_t length)
437{
438    register PyUnicodeObject *v;
439
440    /* Argument checks */
441    if (unicode == NULL) {
442        PyErr_BadInternalCall();
443        return -1;
444    }
445    v = *unicode;
446    if (v == NULL || !PyUnicode_Check(v) || Py_REFCNT(v) != 1 || length < 0) {
447        PyErr_BadInternalCall();
448        return -1;
449    }
450
451    /* Resizing unicode_empty and single character objects is not
452       possible since these are being shared. We simply return a fresh
453       copy with the same Unicode content. */
454    if (v->length != length &&
455        (v == unicode_empty || v->length == 1)) {
456        PyUnicodeObject *w = _PyUnicode_New(length);
457        if (w == NULL)
458            return -1;
459        Py_UNICODE_COPY(w->str, v->str,
460                        length < v->length ? length : v->length);
461        Py_DECREF(*unicode);
462        *unicode = w;
463        return 0;
464    }
465
466    /* Note that we don't have to modify *unicode for unshared Unicode
467       objects, since we can modify them in-place. */
468    return unicode_resize(v, length);
469}
470
471int
472PyUnicode_Resize(PyObject **unicode, Py_ssize_t length)
473{
474    return _PyUnicode_Resize((PyUnicodeObject **)unicode, length);
475}
476
477PyObject *
478PyUnicode_FromUnicode(const Py_UNICODE *u, Py_ssize_t size)
479{
480    PyUnicodeObject *unicode;
481
482    /* If the Unicode data is known at construction time, we can apply
483       some optimizations which share commonly used objects. */
484    if (u != NULL) {
485
486        /* Optimization for empty strings */
487        if (size == 0 && unicode_empty != NULL) {
488            Py_INCREF(unicode_empty);
489            return (PyObject *)unicode_empty;
490        }
491
492        /* Single character Unicode objects in the Latin-1 range are
493           shared when using this constructor */
494        if (size == 1 && *u < 256) {
495            unicode = unicode_latin1[*u];
496            if (!unicode) {
497                unicode = _PyUnicode_New(1);
498                if (!unicode)
499                    return NULL;
500                unicode->str[0] = *u;
501                unicode_latin1[*u] = unicode;
502            }
503            Py_INCREF(unicode);
504            return (PyObject *)unicode;
505        }
506    }
507
508    unicode = _PyUnicode_New(size);
509    if (!unicode)
510        return NULL;
511
512    /* Copy the Unicode data into the new object */
513    if (u != NULL)
514        Py_UNICODE_COPY(unicode->str, u, size);
515
516    return (PyObject *)unicode;
517}
518
519PyObject *
520PyUnicode_FromStringAndSize(const char *u, Py_ssize_t size)
521{
522    PyUnicodeObject *unicode;
523
524    if (size < 0) {
525        PyErr_SetString(PyExc_SystemError,
526                        "Negative size passed to PyUnicode_FromStringAndSize");
527        return NULL;
528    }
529
530    /* If the Unicode data is known at construction time, we can apply
531       some optimizations which share commonly used objects.
532       Also, this means the input must be UTF-8, so fall back to the
533       UTF-8 decoder at the end. */
534    if (u != NULL) {
535
536        /* Optimization for empty strings */
537        if (size == 0 && unicode_empty != NULL) {
538            Py_INCREF(unicode_empty);
539            return (PyObject *)unicode_empty;
540        }
541
542        /* Single characters are shared when using this constructor.
543           Restrict to ASCII, since the input must be UTF-8. */
544        if (size == 1 && Py_CHARMASK(*u) < 128) {
545            unicode = unicode_latin1[Py_CHARMASK(*u)];
546            if (!unicode) {
547                unicode = _PyUnicode_New(1);
548                if (!unicode)
549                    return NULL;
550                unicode->str[0] = Py_CHARMASK(*u);
551                unicode_latin1[Py_CHARMASK(*u)] = unicode;
552            }
553            Py_INCREF(unicode);
554            return (PyObject *)unicode;
555        }
556
557        return PyUnicode_DecodeUTF8(u, size, NULL);
558    }
559
560    unicode = _PyUnicode_New(size);
561    if (!unicode)
562        return NULL;
563
564    return (PyObject *)unicode;
565}
566
567PyObject *
568PyUnicode_FromString(const char *u)
569{
570    size_t size = strlen(u);
571    if (size > PY_SSIZE_T_MAX) {
572        PyErr_SetString(PyExc_OverflowError, "input too long");
573        return NULL;
574    }
575
576    return PyUnicode_FromStringAndSize(u, size);
577}
578
579#ifdef HAVE_WCHAR_H
580
581#if (Py_UNICODE_SIZE == 2) && defined(SIZEOF_WCHAR_T) && (SIZEOF_WCHAR_T == 4)
582# define CONVERT_WCHAR_TO_SURROGATES
583#endif
584
585#ifdef CONVERT_WCHAR_TO_SURROGATES
586
587/* Here sizeof(wchar_t) is 4 but Py_UNICODE_SIZE == 2, so we need
588   to convert from UTF32 to UTF16. */
589
590PyObject *
591PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
592{
593    PyUnicodeObject *unicode;
594    register Py_ssize_t i;
595    Py_ssize_t alloc;
596    const wchar_t *orig_w;
597
598    if (w == NULL) {
599        if (size == 0)
600            return PyUnicode_FromStringAndSize(NULL, 0);
601        PyErr_BadInternalCall();
602        return NULL;
603    }
604
605    if (size == -1) {
606        size = wcslen(w);
607    }
608
609    alloc = size;
610    orig_w = w;
611    for (i = size; i > 0; i--) {
612        if (*w > 0xFFFF)
613            alloc++;
614        w++;
615    }
616    w = orig_w;
617    unicode = _PyUnicode_New(alloc);
618    if (!unicode)
619        return NULL;
620
621    /* Copy the wchar_t data into the new object */
622    {
623        register Py_UNICODE *u;
624        u = PyUnicode_AS_UNICODE(unicode);
625        for (i = size; i > 0; i--) {
626            if (*w > 0xFFFF) {
627                wchar_t ordinal = *w++;
628                ordinal -= 0x10000;
629                *u++ = 0xD800 | (ordinal >> 10);
630                *u++ = 0xDC00 | (ordinal & 0x3FF);
631            }
632            else
633                *u++ = *w++;
634        }
635    }
636    return (PyObject *)unicode;
637}
638
639#else
640
641PyObject *
642PyUnicode_FromWideChar(register const wchar_t *w, Py_ssize_t size)
643{
644    PyUnicodeObject *unicode;
645
646    if (w == NULL) {
647        if (size == 0)
648            return PyUnicode_FromStringAndSize(NULL, 0);
649        PyErr_BadInternalCall();
650        return NULL;
651    }
652
653    if (size == -1) {
654        size = wcslen(w);
655    }
656
657    unicode = _PyUnicode_New(size);
658    if (!unicode)
659        return NULL;
660
661    /* Copy the wchar_t data into the new object */
662#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
663    memcpy(unicode->str, w, size * sizeof(wchar_t));
664#else
665    {
666        register Py_UNICODE *u;
667        register Py_ssize_t i;
668        u = PyUnicode_AS_UNICODE(unicode);
669        for (i = size; i > 0; i--)
670            *u++ = *w++;
671    }
672#endif
673
674    return (PyObject *)unicode;
675}
676
677#endif /* CONVERT_WCHAR_TO_SURROGATES */
678
679#undef CONVERT_WCHAR_TO_SURROGATES
680
681static void
682makefmt(char *fmt, int longflag, int longlongflag, int size_tflag,
683        int zeropad, int width, int precision, char c)
684{
685    *fmt++ = '%';
686    if (width) {
687        if (zeropad)
688            *fmt++ = '0';
689        fmt += sprintf(fmt, "%d", width);
690    }
691    if (precision)
692        fmt += sprintf(fmt, ".%d", precision);
693    if (longflag)
694        *fmt++ = 'l';
695    else if (longlongflag) {
696        /* longlongflag should only ever be nonzero on machines with
697           HAVE_LONG_LONG defined */
698#ifdef HAVE_LONG_LONG
699        char *f = PY_FORMAT_LONG_LONG;
700        while (*f)
701            *fmt++ = *f++;
702#else
703        /* we shouldn't ever get here */
704        assert(0);
705        *fmt++ = 'l';
706#endif
707    }
708    else if (size_tflag) {
709        char *f = PY_FORMAT_SIZE_T;
710        while (*f)
711            *fmt++ = *f++;
712    }
713    *fmt++ = c;
714    *fmt = '\0';
715}
716
717/* helper for PyUnicode_FromFormatV() */
718
/* Parse the part of a format specification that follows '%'.

   `f` must point at the '%'.  The optional width, ".precision" and
   length modifier ("l", "ll" when HAVE_LONG_LONG is defined, or "z";
   each only when followed by 'd', 'u' or 'i') are consumed.  Every
   output pointer may be NULL, in which case that value is not stored.

   Returns a pointer to the character after the consumed flags, normally
   the conversion character.  For bogus formats that end early the
   pointer is moved back one character so the caller never steps past
   the terminating NUL. */
static const char*
parse_format_flags(const char *f,
                   int *p_width, int *p_precision,
                   int *p_longflag, int *p_longlongflag, int *p_size_tflag)
{
    const char *cur = f + 1;    /* step over the '%' */
    int field_width = 0;
    int field_precision = 0;
    int is_long = 0;
    int is_longlong = 0;
    int is_size_t = 0;

    /* parse the width.precision part, e.g. "%2.5s" => width=2, precision=5 */
    for (; Py_ISDIGIT((unsigned)*cur); cur++)
        field_width = (field_width*10) + *cur - '0';

    if (*cur == '.') {
        for (cur++; Py_ISDIGIT((unsigned)*cur); cur++)
            field_precision = (field_precision*10) + *cur - '0';
        if (*cur == '%') {
            /* "%.3%s" => cur points to "3" */
            cur--;
        }
    }
    if (*cur == '\0') {
        /* bogus format "%.1" => go backward, cur points to "1" */
        cur--;
    }

    if (p_width != NULL)
        *p_width = field_width;
    if (p_precision != NULL)
        *p_precision = field_precision;

    /* Length modifiers: %ld, %lu, %li, %lld, %llu, %lli and the size_t
       forms %zd, %zu, %zi.  A modifier that is not followed by one of
       these conversions is left unconsumed. */
    switch (*cur) {
    case 'l':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            is_long = 1;
            cur += 1;
        }
#ifdef HAVE_LONG_LONG
        else if (cur[1] == 'l' &&
                 (cur[2] == 'd' || cur[2] == 'u' || cur[2] == 'i')) {
            is_longlong = 1;
            cur += 2;
        }
#endif
        break;
    case 'z':
        if (cur[1] == 'd' || cur[1] == 'u' || cur[1] == 'i') {
            is_size_t = 1;
            cur += 1;
        }
        break;
    default:
        break;
    }

    if (p_longflag != NULL)
        *p_longflag = is_long;
    if (p_longlongflag != NULL)
        *p_longlongflag = is_longlong;
    if (p_size_tflag != NULL)
        *p_size_tflag = is_size_t;
    return cur;
}
781
/* Append the NUL-terminated C string `string` to the output, one
   character per output element.  The macro relies on two variables from
   the calling scope: `copy` (used as the read cursor) and `s` (the write
   cursor, which is advanced).  It expands to a braced block and performs
   no bounds checking of its own. */
#define appendstring(string) {for (copy = string;*copy;) *s++ = *copy++;}

/* size of fixed-size buffer for formatting single arguments */
#define ITEM_BUFFER_LEN 21
/* maximum number of characters required for output of %ld.  21 characters
   allows for 64-bit integers (in decimal) and an optional sign. */
#define MAX_LONG_CHARS 21
/* maximum number of characters required for output of %lld.
   We need at most ceil(log10(256)*SIZEOF_LONG_LONG) digits,
   plus 1 for the sign.  53/22 is an upper bound for log10(256). */
#define MAX_LONG_LONG_CHARS (2 + (SIZEOF_LONG_LONG*53-1) / 22)
793
794PyObject *
795PyUnicode_FromFormatV(const char *format, va_list vargs)
796{
797    va_list count;
798    Py_ssize_t callcount = 0;
799    PyObject **callresults = NULL;
800    PyObject **callresult = NULL;
801    Py_ssize_t n = 0;
802    int width = 0;
803    int precision = 0;
804    int zeropad;
805    const char* f;
806    Py_UNICODE *s;
807    PyObject *string;
808    /* used by sprintf */
809    char buffer[ITEM_BUFFER_LEN+1];
810    /* use abuffer instead of buffer, if we need more space
811     * (which can happen if there's a format specifier with width). */
812    char *abuffer = NULL;
813    char *realbuffer;
814    Py_ssize_t abuffersize = 0;
815    char fmt[61]; /* should be enough for %0width.precisionlld */
816    const char *copy;
817
818    Py_VA_COPY(count, vargs);
819    /* step 1: count the number of %S/%R/%A/%s format specifications
820     * (we call PyObject_Str()/PyObject_Repr()/PyObject_ASCII()/
821     * PyUnicode_DecodeUTF8() for these objects once during step 3 and put the
822     * result in an array) */
823    for (f = format; *f; f++) {
824         if (*f == '%') {
825             /* skip width or width.precision (eg. "1.2" of "%1.2f") */
826             f = parse_format_flags(f, NULL, NULL, NULL, NULL, NULL);
827             if (*f == 's' || *f=='S' || *f=='R' || *f=='A' || *f=='V')
828                 ++callcount;
829         }
830         else if (128 <= (unsigned char)*f) {
831             PyErr_Format(PyExc_ValueError,
832                "PyUnicode_FromFormatV() expects an ASCII-encoded format "
833                "string, got a non-ASCII byte: 0x%02x",
834                (unsigned char)*f);
835             return NULL;
836         }
837    }
838    /* step 2: allocate memory for the results of
839     * PyObject_Str()/PyObject_Repr()/PyUnicode_DecodeUTF8() calls */
840    if (callcount) {
841        callresults = PyObject_Malloc(sizeof(PyObject *)*callcount);
842        if (!callresults) {
843            PyErr_NoMemory();
844            return NULL;
845        }
846        callresult = callresults;
847    }
848    /* step 3: figure out how large a buffer we need */
849    for (f = format; *f; f++) {
850        if (*f == '%') {
851#ifdef HAVE_LONG_LONG
852            int longlongflag;
853#endif
854            const char* p;
855
856            p = f;
857            f = parse_format_flags(f, &width, NULL,
858                                   NULL, &longlongflag, NULL);
859
860            switch (*f) {
861            case 'c':
862            {
863#ifndef Py_UNICODE_WIDE
864                int ordinal = va_arg(count, int);
865                if (ordinal > 0xffff)
866                    n += 2;
867                else
868                    n++;
869#else
870                (void)va_arg(count, int);
871                n++;
872#endif
873                break;
874            }
875            case '%':
876                n++;
877                break;
878            case 'd': case 'u': case 'i': case 'x':
879                (void) va_arg(count, int);
880#ifdef HAVE_LONG_LONG
881                if (longlongflag) {
882                    if (width < MAX_LONG_LONG_CHARS)
883                        width = MAX_LONG_LONG_CHARS;
884                }
885                else
886#endif
887                    /* MAX_LONG_CHARS is enough to hold a 64-bit integer,
888                       including sign.  Decimal takes the most space.  This
889                       isn't enough for octal.  If a width is specified we
890                       need more (which we allocate later). */
891                    if (width < MAX_LONG_CHARS)
892                        width = MAX_LONG_CHARS;
893                n += width;
894                /* XXX should allow for large precision here too. */
895                if (abuffersize < width)
896                    abuffersize = width;
897                break;
898            case 's':
899            {
900                /* UTF-8 */
901                const char *s = va_arg(count, const char*);
902                PyObject *str = PyUnicode_DecodeUTF8(s, strlen(s), "replace");
903                if (!str)
904                    goto fail;
905                n += PyUnicode_GET_SIZE(str);
906                /* Remember the str and switch to the next slot */
907                *callresult++ = str;
908                break;
909            }
910            case 'U':
911            {
912                PyObject *obj = va_arg(count, PyObject *);
913                assert(obj && PyUnicode_Check(obj));
914                n += PyUnicode_GET_SIZE(obj);
915                break;
916            }
917            case 'V':
918            {
919                PyObject *obj = va_arg(count, PyObject *);
920                const char *str = va_arg(count, const char *);
921                PyObject *str_obj;
922                assert(obj || str);
923                assert(!obj || PyUnicode_Check(obj));
924                if (obj) {
925                    n += PyUnicode_GET_SIZE(obj);
926                    *callresult++ = NULL;
927                }
928                else {
929                    str_obj = PyUnicode_DecodeUTF8(str, strlen(str), "replace");
930                    if (!str_obj)
931                        goto fail;
932                    n += PyUnicode_GET_SIZE(str_obj);
933                    *callresult++ = str_obj;
934                }
935                break;
936            }
937            case 'S':
938            {
939                PyObject *obj = va_arg(count, PyObject *);
940                PyObject *str;
941                assert(obj);
942                str = PyObject_Str(obj);
943                if (!str)
944                    goto fail;
945                n += PyUnicode_GET_SIZE(str);
946                /* Remember the str and switch to the next slot */
947                *callresult++ = str;
948                break;
949            }
950            case 'R':
951            {
952                PyObject *obj = va_arg(count, PyObject *);
953                PyObject *repr;
954                assert(obj);
955                repr = PyObject_Repr(obj);
956                if (!repr)
957                    goto fail;
958                n += PyUnicode_GET_SIZE(repr);
959                /* Remember the repr and switch to the next slot */
960                *callresult++ = repr;
961                break;
962            }
963            case 'A':
964            {
965                PyObject *obj = va_arg(count, PyObject *);
966                PyObject *ascii;
967                assert(obj);
968                ascii = PyObject_ASCII(obj);
969                if (!ascii)
970                    goto fail;
971                n += PyUnicode_GET_SIZE(ascii);
972                /* Remember the repr and switch to the next slot */
973                *callresult++ = ascii;
974                break;
975            }
976            case 'p':
977                (void) va_arg(count, int);
978                /* maximum 64-bit pointer representation:
979                 * 0xffffffffffffffff
980                 * so 19 characters is enough.
981                 * XXX I count 18 -- what's the extra for?
982                 */
983                n += 19;
984                break;
985            default:
986                /* if we stumble upon an unknown
987                   formatting code, copy the rest of
988                   the format string to the output
989                   string. (we cannot just skip the
990                   code, since there's no way to know
991                   what's in the argument list) */
992                n += strlen(p);
993                goto expand;
994            }
995        } else
996            n++;
997    }
998  expand:
999    if (abuffersize > ITEM_BUFFER_LEN) {
1000        /* add 1 for sprintf's trailing null byte */
1001        abuffer = PyObject_Malloc(abuffersize + 1);
1002        if (!abuffer) {
1003            PyErr_NoMemory();
1004            goto fail;
1005        }
1006        realbuffer = abuffer;
1007    }
1008    else
1009        realbuffer = buffer;
1010    /* step 4: fill the buffer */
1011    /* Since we've analyzed how much space we need for the worst case,
1012       we don't have to resize the string.
1013       There can be no errors beyond this point. */
1014    string = PyUnicode_FromUnicode(NULL, n);
1015    if (!string)
1016        goto fail;
1017
1018    s = PyUnicode_AS_UNICODE(string);
1019    callresult = callresults;
1020
1021    for (f = format; *f; f++) {
1022        if (*f == '%') {
1023            const char* p;
1024            int longflag;
1025            int longlongflag;
1026            int size_tflag;
1027
1028            p = f;
1029            zeropad = (f[1] == '0');
1030            f = parse_format_flags(f, &width, &precision,
1031                                   &longflag, &longlongflag, &size_tflag);
1032
1033            switch (*f) {
1034            case 'c':
1035            {
1036                int ordinal = va_arg(vargs, int);
1037#ifndef Py_UNICODE_WIDE
1038                if (ordinal > 0xffff) {
1039                    ordinal -= 0x10000;
1040                    *s++ = 0xD800 | (ordinal >> 10);
1041                    *s++ = 0xDC00 | (ordinal & 0x3FF);
1042                } else
1043#endif
1044                *s++ = ordinal;
1045                break;
1046            }
1047            case 'i':
1048            case 'd':
1049                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1050                        width, precision, *f);
1051                if (longflag)
1052                    sprintf(realbuffer, fmt, va_arg(vargs, long));
1053#ifdef HAVE_LONG_LONG
1054                else if (longlongflag)
1055                    sprintf(realbuffer, fmt, va_arg(vargs, PY_LONG_LONG));
1056#endif
1057                else if (size_tflag)
1058                    sprintf(realbuffer, fmt, va_arg(vargs, Py_ssize_t));
1059                else
1060                    sprintf(realbuffer, fmt, va_arg(vargs, int));
1061                appendstring(realbuffer);
1062                break;
1063            case 'u':
1064                makefmt(fmt, longflag, longlongflag, size_tflag, zeropad,
1065                        width, precision, 'u');
1066                if (longflag)
1067                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned long));
1068#ifdef HAVE_LONG_LONG
1069                else if (longlongflag)
1070                    sprintf(realbuffer, fmt, va_arg(vargs,
1071                                                    unsigned PY_LONG_LONG));
1072#endif
1073                else if (size_tflag)
1074                    sprintf(realbuffer, fmt, va_arg(vargs, size_t));
1075                else
1076                    sprintf(realbuffer, fmt, va_arg(vargs, unsigned int));
1077                appendstring(realbuffer);
1078                break;
1079            case 'x':
1080                makefmt(fmt, 0, 0, 0, zeropad, width, precision, 'x');
1081                sprintf(realbuffer, fmt, va_arg(vargs, int));
1082                appendstring(realbuffer);
1083                break;
1084            case 's':
1085            {
1086                /* unused, since we already have the result */
1087                (void) va_arg(vargs, char *);
1088                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1089                                PyUnicode_GET_SIZE(*callresult));
1090                s += PyUnicode_GET_SIZE(*callresult);
1091                /* We're done with the unicode()/repr() => forget it */
1092                Py_DECREF(*callresult);
1093                /* switch to next unicode()/repr() result */
1094                ++callresult;
1095                break;
1096            }
1097            case 'U':
1098            {
1099                PyObject *obj = va_arg(vargs, PyObject *);
1100                Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1101                Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1102                s += size;
1103                break;
1104            }
1105            case 'V':
1106            {
1107                PyObject *obj = va_arg(vargs, PyObject *);
1108                va_arg(vargs, const char *);
1109                if (obj) {
1110                    Py_ssize_t size = PyUnicode_GET_SIZE(obj);
1111                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(obj), size);
1112                    s += size;
1113                } else {
1114                    Py_UNICODE_COPY(s, PyUnicode_AS_UNICODE(*callresult),
1115                                    PyUnicode_GET_SIZE(*callresult));
1116                    s += PyUnicode_GET_SIZE(*callresult);
1117                    Py_DECREF(*callresult);
1118                }
1119                ++callresult;
1120                break;
1121            }
1122            case 'S':
1123            case 'R':
1124            case 'A':
1125            {
1126                Py_UNICODE *ucopy;
1127                Py_ssize_t usize;
1128                Py_ssize_t upos;
1129                /* unused, since we already have the result */
1130                (void) va_arg(vargs, PyObject *);
1131                ucopy = PyUnicode_AS_UNICODE(*callresult);
1132                usize = PyUnicode_GET_SIZE(*callresult);
1133                for (upos = 0; upos<usize;)
1134                    *s++ = ucopy[upos++];
1135                /* We're done with the unicode()/repr() => forget it */
1136                Py_DECREF(*callresult);
1137                /* switch to next unicode()/repr() result */
1138                ++callresult;
1139                break;
1140            }
1141            case 'p':
1142                sprintf(buffer, "%p", va_arg(vargs, void*));
1143                /* %p is ill-defined:  ensure leading 0x. */
1144                if (buffer[1] == 'X')
1145                    buffer[1] = 'x';
1146                else if (buffer[1] != 'x') {
1147                    memmove(buffer+2, buffer, strlen(buffer)+1);
1148                    buffer[0] = '0';
1149                    buffer[1] = 'x';
1150                }
1151                appendstring(buffer);
1152                break;
1153            case '%':
1154                *s++ = '%';
1155                break;
1156            default:
1157                appendstring(p);
1158                goto end;
1159            }
1160        }
1161        else
1162            *s++ = *f;
1163    }
1164
1165  end:
1166    if (callresults)
1167        PyObject_Free(callresults);
1168    if (abuffer)
1169        PyObject_Free(abuffer);
1170    PyUnicode_Resize(&string, s - PyUnicode_AS_UNICODE(string));
1171    return string;
1172  fail:
1173    if (callresults) {
1174        PyObject **callresult2 = callresults;
1175        while (callresult2 < callresult) {
1176            Py_XDECREF(*callresult2);
1177            ++callresult2;
1178        }
1179        PyObject_Free(callresults);
1180    }
1181    if (abuffer)
1182        PyObject_Free(abuffer);
1183    return NULL;
1184}
1185
1186#undef appendstring
1187
1188PyObject *
1189PyUnicode_FromFormat(const char *format, ...)
1190{
1191    PyObject* ret;
1192    va_list vargs;
1193
1194#ifdef HAVE_STDARG_PROTOTYPES
1195    va_start(vargs, format);
1196#else
1197    va_start(vargs);
1198#endif
1199    ret = PyUnicode_FromFormatV(format, vargs);
1200    va_end(vargs);
1201    return ret;
1202}
1203
1204/* Helper function for PyUnicode_AsWideChar() and PyUnicode_AsWideCharString():
1205   convert a Unicode object to a wide character string.
1206
1207   - If w is NULL: return the number of wide characters (including the nul
1208     character) required to convert the unicode object. Ignore size argument.
1209
1210   - Otherwise: return the number of wide characters (excluding the nul
1211     character) written into w. Write at most size wide characters (including
1212     the nul character). */
1213static Py_ssize_t
1214unicode_aswidechar(PyUnicodeObject *unicode,
1215                   wchar_t *w,
1216                   Py_ssize_t size)
1217{
1218#if Py_UNICODE_SIZE == SIZEOF_WCHAR_T
1219    Py_ssize_t res;
1220    if (w != NULL) {
1221        res = PyUnicode_GET_SIZE(unicode);
1222        if (size > res)
1223            size = res + 1;
1224        else
1225            res = size;
1226        memcpy(w, unicode->str, size * sizeof(wchar_t));
1227        return res;
1228    }
1229    else
1230        return PyUnicode_GET_SIZE(unicode) + 1;
1231#elif Py_UNICODE_SIZE == 2 && SIZEOF_WCHAR_T == 4
1232    register const Py_UNICODE *u;
1233    const Py_UNICODE *uend;
1234    const wchar_t *worig, *wend;
1235    Py_ssize_t nchar;
1236
1237    u = PyUnicode_AS_UNICODE(unicode);
1238    uend = u + PyUnicode_GET_SIZE(unicode);
1239    if (w != NULL) {
1240        worig = w;
1241        wend = w + size;
1242        while (u != uend && w != wend) {
1243            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1244                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1245            {
1246                *w = (((u[0] & 0x3FF) << 10) | (u[1] & 0x3FF)) + 0x10000;
1247                u += 2;
1248            }
1249            else {
1250                *w = *u;
1251                u++;
1252            }
1253            w++;
1254        }
1255        if (w != wend)
1256            *w = L'\0';
1257        return w - worig;
1258    }
1259    else {
1260        nchar = 1; /* nul character at the end */
1261        while (u != uend) {
1262            if (0xD800 <= u[0] && u[0] <= 0xDBFF
1263                && 0xDC00 <= u[1] && u[1] <= 0xDFFF)
1264                u += 2;
1265            else
1266                u++;
1267            nchar++;
1268        }
1269    }
1270    return nchar;
1271#elif Py_UNICODE_SIZE == 4 && SIZEOF_WCHAR_T == 2
1272    register Py_UNICODE *u, *uend, ordinal;
1273    register Py_ssize_t i;
1274    wchar_t *worig, *wend;
1275    Py_ssize_t nchar;
1276
1277    u = PyUnicode_AS_UNICODE(unicode);
1278    uend = u + PyUnicode_GET_SIZE(u);
1279    if (w != NULL) {
1280        worig = w;
1281        wend = w + size;
1282        while (u != uend && w != wend) {
1283            ordinal = *u;
1284            if (ordinal > 0xffff) {
1285                ordinal -= 0x10000;
1286                *w++ = 0xD800 | (ordinal >> 10);
1287                *w++ = 0xDC00 | (ordinal & 0x3FF);
1288            }
1289            else
1290                *w++ = ordinal;
1291            u++;
1292        }
1293        if (w != wend)
1294            *w = 0;
1295        return w - worig;
1296    }
1297    else {
1298        nchar = 1; /* nul character */
1299        while (u != uend) {
1300            if (*u > 0xffff)
1301                nchar += 2;
1302            else
1303                nchar++;
1304            u++;
1305        }
1306        return nchar;
1307    }
1308#else
1309#  error "unsupported wchar_t and Py_UNICODE sizes, see issue #8670"
1310#endif
1311}
1312
1313Py_ssize_t
1314PyUnicode_AsWideChar(PyObject *unicode,
1315                     wchar_t *w,
1316                     Py_ssize_t size)
1317{
1318    if (unicode == NULL) {
1319        PyErr_BadInternalCall();
1320        return -1;
1321    }
1322    return unicode_aswidechar((PyUnicodeObject*)unicode, w, size);
1323}
1324
1325wchar_t*
1326PyUnicode_AsWideCharString(PyObject *unicode,
1327                           Py_ssize_t *size)
1328{
1329    wchar_t* buffer;
1330    Py_ssize_t buflen;
1331
1332    if (unicode == NULL) {
1333        PyErr_BadInternalCall();
1334        return NULL;
1335    }
1336
1337    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, NULL, 0);
1338    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < buflen) {
1339        PyErr_NoMemory();
1340        return NULL;
1341    }
1342
1343    buffer = PyMem_MALLOC(buflen * sizeof(wchar_t));
1344    if (buffer == NULL) {
1345        PyErr_NoMemory();
1346        return NULL;
1347    }
1348    buflen = unicode_aswidechar((PyUnicodeObject *)unicode, buffer, buflen);
1349    if (size != NULL)
1350        *size = buflen;
1351    return buffer;
1352}
1353
1354#endif
1355
1356PyObject *
1357PyUnicode_FromOrdinal(int ordinal)
1358{
1359    Py_UNICODE s[2];
1360
1361    if (ordinal < 0 || ordinal > 0x10ffff) {
1362        PyErr_SetString(PyExc_ValueError,
1363                        "chr() arg not in range(0x110000)");
1364        return NULL;
1365    }
1366
1367#ifndef Py_UNICODE_WIDE
1368    if (ordinal > 0xffff) {
1369        ordinal -= 0x10000;
1370        s[0] = 0xD800 | (ordinal >> 10);
1371        s[1] = 0xDC00 | (ordinal & 0x3FF);
1372        return PyUnicode_FromUnicode(s, 2);
1373    }
1374#endif
1375
1376    s[0] = (Py_UNICODE)ordinal;
1377    return PyUnicode_FromUnicode(s, 1);
1378}
1379
1380PyObject *
1381PyUnicode_FromObject(register PyObject *obj)
1382{
1383    /* XXX Perhaps we should make this API an alias of
1384       PyObject_Str() instead ?! */
1385    if (PyUnicode_CheckExact(obj)) {
1386        Py_INCREF(obj);
1387        return obj;
1388    }
1389    if (PyUnicode_Check(obj)) {
1390        /* For a Unicode subtype that's not a Unicode object,
1391           return a true Unicode object with the same data. */
1392        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(obj),
1393                                     PyUnicode_GET_SIZE(obj));
1394    }
1395    PyErr_Format(PyExc_TypeError,
1396                 "Can't convert '%.100s' object to str implicitly",
1397                 Py_TYPE(obj)->tp_name);
1398    return NULL;
1399}
1400
1401PyObject *
1402PyUnicode_FromEncodedObject(register PyObject *obj,
1403			    const char *encoding,
1404			    const char *errors)
1405{
1406    Py_buffer buffer;
1407    PyObject *v;
1408
1409    if (obj == NULL) {
1410        PyErr_BadInternalCall();
1411        return NULL;
1412    }
1413
1414    /* Decoding bytes objects is the most common case and should be fast */
1415    if (PyBytes_Check(obj)) {
1416        if (PyBytes_GET_SIZE(obj) == 0) {
1417            Py_INCREF(unicode_empty);
1418            v = (PyObject *) unicode_empty;
1419        }
1420        else {
1421            v = PyUnicode_Decode(
1422                    PyBytes_AS_STRING(obj), PyBytes_GET_SIZE(obj),
1423                    encoding, errors);
1424        }
1425        return v;
1426    }
1427
1428    if (PyUnicode_Check(obj)) {
1429        PyErr_SetString(PyExc_TypeError,
1430                        "decoding str is not supported");
1431        return NULL;
1432    }
1433
1434    /* Retrieve a bytes buffer view through the PEP 3118 buffer interface */
1435    if (PyObject_GetBuffer(obj, &buffer, PyBUF_SIMPLE) < 0) {
1436        PyErr_Format(PyExc_TypeError,
1437                     "coercing to str: need bytes, bytearray "
1438                     "or buffer-like object, %.80s found",
1439                     Py_TYPE(obj)->tp_name);
1440        return NULL;
1441    }
1442
1443    if (buffer.len == 0) {
1444        Py_INCREF(unicode_empty);
1445        v = (PyObject *) unicode_empty;
1446    }
1447    else
1448        v = PyUnicode_Decode((char*) buffer.buf, buffer.len, encoding, errors);
1449
1450    PyBuffer_Release(&buffer);
1451    return v;
1452}
1453
/* Write a normalized copy of `encoding` into `lower`: upper case letters are
   converted to lower case and '_' is replaced with '-', so that e.g. UTF_8
   is recognized.  `lower_len` is the capacity of `lower` in bytes.

   Return 1 on success (the result is nul-terminated), 0 if the encoding
   name is longer than lower_len-1 characters (the contents of `lower` are
   then incomplete and not terminated). */
static int
normalize_encoding(const char *encoding,
                   char *lower,
                   size_t lower_len)
{
    const size_t limit = lower_len - 1;   /* index reserved for the nul */
    size_t pos;

    for (pos = 0; encoding[pos] != '\0'; pos++) {
        const char ch = encoding[pos];

        if (pos == limit)
            return 0;

        if (Py_ISUPPER(ch))
            lower[pos] = Py_TOLOWER(ch);
        else if (ch == '_')
            lower[pos] = '-';
        else
            lower[pos] = ch;
    }
    lower[pos] = '\0';
    return 1;
}
1486
1487PyObject *
1488PyUnicode_Decode(const char *s,
1489		 Py_ssize_t size,
1490		 const char *encoding,
1491		 const char *errors)
1492{
1493    PyObject *buffer = NULL, *unicode;
1494    Py_buffer info;
1495    char lower[11];  /* Enough for any encoding shortcut */
1496
1497    if (encoding == NULL)
1498        return PyUnicode_DecodeUTF8(s, size, errors);
1499
1500    /* Shortcuts for common default encodings */
1501    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1502        if ((strcmp(lower, "utf-8") == 0) ||
1503            (strcmp(lower, "utf8") == 0))
1504            return PyUnicode_DecodeUTF8(s, size, errors);
1505        else if ((strcmp(lower, "latin-1") == 0) ||
1506                 (strcmp(lower, "latin1") == 0) ||
1507                 (strcmp(lower, "iso-8859-1") == 0))
1508            return PyUnicode_DecodeLatin1(s, size, errors);
1509#ifdef HAVE_MBCS
1510        else if (strcmp(lower, "mbcs") == 0)
1511            return PyUnicode_DecodeMBCS(s, size, errors);
1512#endif
1513        else if (strcmp(lower, "ascii") == 0)
1514            return PyUnicode_DecodeASCII(s, size, errors);
1515        else if (strcmp(lower, "utf-16") == 0)
1516            return PyUnicode_DecodeUTF16(s, size, errors, 0);
1517        else if (strcmp(lower, "utf-32") == 0)
1518            return PyUnicode_DecodeUTF32(s, size, errors, 0);
1519    }
1520
1521    /* Decode via the codec registry */
1522    buffer = NULL;
1523    if (PyBuffer_FillInfo(&info, NULL, (void *)s, size, 1, PyBUF_FULL_RO) < 0)
1524        goto onError;
1525    buffer = PyMemoryView_FromBuffer(&info);
1526    if (buffer == NULL)
1527        goto onError;
1528    unicode = PyCodec_Decode(buffer, encoding, errors);
1529    if (unicode == NULL)
1530        goto onError;
1531    if (!PyUnicode_Check(unicode)) {
1532        PyErr_Format(PyExc_TypeError,
1533                     "decoder did not return a str object (type=%.400s)",
1534                     Py_TYPE(unicode)->tp_name);
1535        Py_DECREF(unicode);
1536        goto onError;
1537    }
1538    Py_DECREF(buffer);
1539    return unicode;
1540
1541  onError:
1542    Py_XDECREF(buffer);
1543    return NULL;
1544}
1545
1546PyObject *
1547PyUnicode_AsDecodedObject(PyObject *unicode,
1548			  const char *encoding,
1549			  const char *errors)
1550{
1551    PyObject *v;
1552
1553    if (!PyUnicode_Check(unicode)) {
1554        PyErr_BadArgument();
1555        goto onError;
1556    }
1557
1558    if (encoding == NULL)
1559        encoding = PyUnicode_GetDefaultEncoding();
1560
1561    /* Decode via the codec registry */
1562    v = PyCodec_Decode(unicode, encoding, errors);
1563    if (v == NULL)
1564        goto onError;
1565    return v;
1566
1567  onError:
1568    return NULL;
1569}
1570
1571PyObject *
1572PyUnicode_AsDecodedUnicode(PyObject *unicode,
1573			   const char *encoding,
1574			   const char *errors)
1575{
1576    PyObject *v;
1577
1578    if (!PyUnicode_Check(unicode)) {
1579        PyErr_BadArgument();
1580        goto onError;
1581    }
1582
1583    if (encoding == NULL)
1584        encoding = PyUnicode_GetDefaultEncoding();
1585
1586    /* Decode via the codec registry */
1587    v = PyCodec_Decode(unicode, encoding, errors);
1588    if (v == NULL)
1589        goto onError;
1590    if (!PyUnicode_Check(v)) {
1591        PyErr_Format(PyExc_TypeError,
1592                     "decoder did not return a str object (type=%.400s)",
1593                     Py_TYPE(v)->tp_name);
1594        Py_DECREF(v);
1595        goto onError;
1596    }
1597    return v;
1598
1599  onError:
1600    return NULL;
1601}
1602
1603PyObject *
1604PyUnicode_Encode(const Py_UNICODE *s,
1605		 Py_ssize_t size,
1606		 const char *encoding,
1607		 const char *errors)
1608{
1609    PyObject *v, *unicode;
1610
1611    unicode = PyUnicode_FromUnicode(s, size);
1612    if (unicode == NULL)
1613        return NULL;
1614    v = PyUnicode_AsEncodedString(unicode, encoding, errors);
1615    Py_DECREF(unicode);
1616    return v;
1617}
1618
1619PyObject *
1620PyUnicode_AsEncodedObject(PyObject *unicode,
1621			  const char *encoding,
1622			  const char *errors)
1623{
1624    PyObject *v;
1625
1626    if (!PyUnicode_Check(unicode)) {
1627        PyErr_BadArgument();
1628        goto onError;
1629    }
1630
1631    if (encoding == NULL)
1632        encoding = PyUnicode_GetDefaultEncoding();
1633
1634    /* Encode via the codec registry */
1635    v = PyCodec_Encode(unicode, encoding, errors);
1636    if (v == NULL)
1637        goto onError;
1638    return v;
1639
1640  onError:
1641    return NULL;
1642}
1643
/* Encode a Unicode object to bytes for use as a filesystem name.

   The strategy is chosen at compile time:
   - HAVE_MBCS: PyUnicode_EncodeMBCS() with NULL errors;
   - __APPLE__: PyUnicode_EncodeUTF8() with "surrogateescape";
   - otherwise: Py_FileSystemDefaultEncoding with "surrogateescape" once the
     interpreter's filesystem codec is initialized, and before that a
     conversion through wchar_t and _Py_wchar2char().

   Returns the encoded object, or NULL on failure. */
PyObject *
PyUnicode_EncodeFSDefault(PyObject *unicode)
{
#ifdef HAVE_MBCS
    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                NULL);
#elif defined(__APPLE__)
    return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
                                PyUnicode_GET_SIZE(unicode),
                                "surrogateescape");
#else
    PyInterpreterState *interp = PyThreadState_GET()->interp;
    /* Bootstrap check: if the filesystem codec is implemented in Python, we
       cannot use it to encode and decode filenames before it is loaded.
       Loading the Python codec requires encoding at least its own filename.
       Use the C version of the locale codec until the codec registry is
       initialized and the Python codec is loaded.

       Py_FileSystemDefaultEncoding is shared between all interpreters, so we
       cannot rely on it alone: also check interp->fscodec_initialized for
       subinterpreters. */
    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
        return PyUnicode_AsEncodedString(unicode,
                                         Py_FileSystemDefaultEncoding,
                                         "surrogateescape");
    }
    else {
        /* locale encoding with surrogateescape */
        wchar_t *wchar;
        char *bytes;
        PyObject *bytes_obj;
        size_t error_pos;

        /* Intermediate wide string; freed with PyMem_Free() on every path
           below. */
        wchar = PyUnicode_AsWideCharString(unicode, NULL);
        if (wchar == NULL)
            return NULL;
        bytes = _Py_wchar2char(wchar, &error_pos);
        if (bytes == NULL) {
            /* error_pos == (size_t)-1 is handled as an out-of-memory
               condition; any other value is used as the index of the
               offending character in the encode error. */
            if (error_pos != (size_t)-1) {
                /* Read errno right away, before any other call can
                   overwrite it. */
                char *errmsg = strerror(errno);
                PyObject *exc = NULL;
                if (errmsg == NULL)
                    errmsg = "Py_wchar2char() failed";
                raise_encode_exception(&exc,
                    "filesystemencoding",
                    PyUnicode_AS_UNICODE(unicode), PyUnicode_GET_SIZE(unicode),
                    error_pos, error_pos+1,
                    errmsg);
                Py_XDECREF(exc);
            }
            else
                PyErr_NoMemory();
            PyMem_Free(wchar);
            return NULL;
        }
        PyMem_Free(wchar);

        /* Copy the C string into a bytes object, then drop the C string;
           bytes_obj is NULL if the copy failed. */
        bytes_obj = PyBytes_FromString(bytes);
        PyMem_Free(bytes);
        return bytes_obj;
    }
#endif
}
1708
1709PyObject *
1710PyUnicode_AsEncodedString(PyObject *unicode,
1711			  const char *encoding,
1712			  const char *errors)
1713{
1714    PyObject *v;
1715    char lower[11];  /* Enough for any encoding shortcut */
1716
1717    if (!PyUnicode_Check(unicode)) {
1718        PyErr_BadArgument();
1719        return NULL;
1720    }
1721
1722    if (encoding == NULL) {
1723        if (errors == NULL || strcmp(errors, "strict") == 0)
1724            return PyUnicode_AsUTF8String(unicode);
1725        else
1726            return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1727                                        PyUnicode_GET_SIZE(unicode),
1728                                        errors);
1729    }
1730
1731    /* Shortcuts for common default encodings */
1732    if (normalize_encoding(encoding, lower, sizeof(lower))) {
1733        if ((strcmp(lower, "utf-8") == 0) ||
1734            (strcmp(lower, "utf8") == 0))
1735        {
1736            if (errors == NULL || strcmp(errors, "strict") == 0)
1737                return PyUnicode_AsUTF8String(unicode);
1738            else
1739                return PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1740                                            PyUnicode_GET_SIZE(unicode),
1741                                            errors);
1742        }
1743        else if ((strcmp(lower, "latin-1") == 0) ||
1744                 (strcmp(lower, "latin1") == 0) ||
1745                 (strcmp(lower, "iso-8859-1") == 0))
1746            return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
1747                                          PyUnicode_GET_SIZE(unicode),
1748                                          errors);
1749#ifdef HAVE_MBCS
1750        else if (strcmp(lower, "mbcs") == 0)
1751            return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
1752                                        PyUnicode_GET_SIZE(unicode),
1753                                        errors);
1754#endif
1755        else if (strcmp(lower, "ascii") == 0)
1756            return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
1757                                         PyUnicode_GET_SIZE(unicode),
1758                                         errors);
1759    }
1760
1761    /* Encode via the codec registry */
1762    v = PyCodec_Encode(unicode, encoding, errors);
1763    if (v == NULL)
1764        return NULL;
1765
1766    /* The normal path */
1767    if (PyBytes_Check(v))
1768        return v;
1769
1770    /* If the codec returns a buffer, raise a warning and convert to bytes */
1771    if (PyByteArray_Check(v)) {
1772        int error;
1773        PyObject *b;
1774
1775        error = PyErr_WarnFormat(PyExc_RuntimeWarning, 1,
1776            "encoder %s returned bytearray instead of bytes",
1777            encoding);
1778        if (error) {
1779            Py_DECREF(v);
1780            return NULL;
1781        }
1782
1783        b = PyBytes_FromStringAndSize(PyByteArray_AS_STRING(v), Py_SIZE(v));
1784        Py_DECREF(v);
1785        return b;
1786    }
1787
1788    PyErr_Format(PyExc_TypeError,
1789                 "encoder did not return a bytes object (type=%.400s)",
1790                 Py_TYPE(v)->tp_name);
1791    Py_DECREF(v);
1792    return NULL;
1793}
1794
1795PyObject *
1796PyUnicode_AsEncodedUnicode(PyObject *unicode,
1797			   const char *encoding,
1798			   const char *errors)
1799{
1800    PyObject *v;
1801
1802    if (!PyUnicode_Check(unicode)) {
1803        PyErr_BadArgument();
1804        goto onError;
1805    }
1806
1807    if (encoding == NULL)
1808        encoding = PyUnicode_GetDefaultEncoding();
1809
1810    /* Encode via the codec registry */
1811    v = PyCodec_Encode(unicode, encoding, errors);
1812    if (v == NULL)
1813        goto onError;
1814    if (!PyUnicode_Check(v)) {
1815        PyErr_Format(PyExc_TypeError,
1816                     "encoder did not return an str object (type=%.400s)",
1817                     Py_TYPE(v)->tp_name);
1818        Py_DECREF(v);
1819        goto onError;
1820    }
1821    return v;
1822
1823  onError:
1824    return NULL;
1825}
1826
1827PyObject *
1828_PyUnicode_AsDefaultEncodedString(PyObject *unicode)
1829{
1830    PyObject *v = ((PyUnicodeObject *)unicode)->defenc;
1831    if (v)
1832        return v;
1833    v = PyUnicode_EncodeUTF8(PyUnicode_AS_UNICODE(unicode),
1834                             PyUnicode_GET_SIZE(unicode),
1835                             NULL);
1836    if (!v)
1837        return NULL;
1838    ((PyUnicodeObject *)unicode)->defenc = v;
1839    return v;
1840}
1841
1842PyObject*
1843PyUnicode_DecodeFSDefault(const char *s) {
1844    Py_ssize_t size = (Py_ssize_t)strlen(s);
1845    return PyUnicode_DecodeFSDefaultAndSize(s, size);
1846}
1847
1848PyObject*
1849PyUnicode_DecodeFSDefaultAndSize(const char *s, Py_ssize_t size)
1850{
1851#ifdef HAVE_MBCS
1852    return PyUnicode_DecodeMBCS(s, size, NULL);
1853#elif defined(__APPLE__)
1854    return PyUnicode_DecodeUTF8(s, size, "surrogateescape");
1855#else
1856    PyInterpreterState *interp = PyThreadState_GET()->interp;
1857    /* Bootstrap check: if the filesystem codec is implemented in Python, we
1858       cannot use it to encode and decode filenames before it is loaded. Load
1859       the Python codec requires to encode at least its own filename. Use the C
1860       version of the locale codec until the codec registry is initialized and
1861       the Python codec is loaded.
1862
1863       Py_FileSystemDefaultEncoding is shared between all interpreters, we
1864       cannot only rely on it: check also interp->fscodec_initialized for
1865       subinterpreters. */
1866    if (Py_FileSystemDefaultEncoding && interp->fscodec_initialized) {
1867        return PyUnicode_Decode(s, size,
1868                                Py_FileSystemDefaultEncoding,
1869                                "surrogateescape");
1870    }
1871    else {
1872        /* locale encoding with surrogateescape */
1873        wchar_t *wchar;
1874        PyObject *unicode;
1875        size_t len;
1876
1877        if (s[size] != '\0' || size != strlen(s)) {
1878            PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1879            return NULL;
1880        }
1881
1882        wchar = _Py_char2wchar(s, &len);
1883        if (wchar == NULL)
1884            return PyErr_NoMemory();
1885
1886        unicode = PyUnicode_FromWideChar(wchar, len);
1887        PyMem_Free(wchar);
1888        return unicode;
1889    }
1890#endif
1891}
1892
1893
1894int
1895PyUnicode_FSConverter(PyObject* arg, void* addr)
1896{
1897    PyObject *output = NULL;
1898    Py_ssize_t size;
1899    void *data;
1900    if (arg == NULL) {
1901        Py_DECREF(*(PyObject**)addr);
1902        return 1;
1903    }
1904    if (PyBytes_Check(arg)) {
1905        output = arg;
1906        Py_INCREF(output);
1907    }
1908    else {
1909        arg = PyUnicode_FromObject(arg);
1910        if (!arg)
1911            return 0;
1912        output = PyUnicode_EncodeFSDefault(arg);
1913        Py_DECREF(arg);
1914        if (!output)
1915            return 0;
1916        if (!PyBytes_Check(output)) {
1917            Py_DECREF(output);
1918            PyErr_SetString(PyExc_TypeError, "encoder failed to return bytes");
1919            return 0;
1920        }
1921    }
1922    size = PyBytes_GET_SIZE(output);
1923    data = PyBytes_AS_STRING(output);
1924    if (size != strlen(data)) {
1925        PyErr_SetString(PyExc_TypeError, "embedded NULL character");
1926        Py_DECREF(output);
1927        return 0;
1928    }
1929    *(PyObject**)addr = output;
1930    return Py_CLEANUP_SUPPORTED;
1931}
1932
1933
1934int
1935PyUnicode_FSDecoder(PyObject* arg, void* addr)
1936{
1937    PyObject *output = NULL;
1938    Py_ssize_t size;
1939    void *data;
1940    if (arg == NULL) {
1941        Py_DECREF(*(PyObject**)addr);
1942        return 1;
1943    }
1944    if (PyUnicode_Check(arg)) {
1945        output = arg;
1946        Py_INCREF(output);
1947    }
1948    else {
1949        arg = PyBytes_FromObject(arg);
1950        if (!arg)
1951            return 0;
1952        output = PyUnicode_DecodeFSDefaultAndSize(PyBytes_AS_STRING(arg),
1953                                                  PyBytes_GET_SIZE(arg));
1954        Py_DECREF(arg);
1955        if (!output)
1956            return 0;
1957        if (!PyUnicode_Check(output)) {
1958            Py_DECREF(output);
1959            PyErr_SetString(PyExc_TypeError, "decoder failed to return unicode");
1960            return 0;
1961        }
1962    }
1963    size = PyUnicode_GET_SIZE(output);
1964    data = PyUnicode_AS_UNICODE(output);
1965    if (size != Py_UNICODE_strlen(data)) {
1966        PyErr_SetString(PyExc_TypeError, "embedded NUL character");
1967        Py_DECREF(output);
1968        return 0;
1969    }
1970    *(PyObject**)addr = output;
1971    return Py_CLEANUP_SUPPORTED;
1972}
1973
1974
1975char*
1976_PyUnicode_AsStringAndSize(PyObject *unicode, Py_ssize_t *psize)
1977{
1978    PyObject *bytes;
1979    if (!PyUnicode_Check(unicode)) {
1980        PyErr_BadArgument();
1981        return NULL;
1982    }
1983    bytes = _PyUnicode_AsDefaultEncodedString(unicode);
1984    if (bytes == NULL)
1985        return NULL;
1986    if (psize != NULL)
1987        *psize = PyBytes_GET_SIZE(bytes);
1988    return PyBytes_AS_STRING(bytes);
1989}
1990
1991char*
1992_PyUnicode_AsString(PyObject *unicode)
1993{
1994    return _PyUnicode_AsStringAndSize(unicode, NULL);
1995}
1996
1997Py_UNICODE *
1998PyUnicode_AsUnicode(PyObject *unicode)
1999{
2000    if (!PyUnicode_Check(unicode)) {
2001        PyErr_BadArgument();
2002        goto onError;
2003    }
2004    return PyUnicode_AS_UNICODE(unicode);
2005
2006  onError:
2007    return NULL;
2008}
2009
2010Py_ssize_t
2011PyUnicode_GetSize(PyObject *unicode)
2012{
2013    if (!PyUnicode_Check(unicode)) {
2014        PyErr_BadArgument();
2015        goto onError;
2016    }
2017    return PyUnicode_GET_SIZE(unicode);
2018
2019  onError:
2020    return -1;
2021}
2022
/* Return the name of the default encoding; it is fixed to "utf-8". */
const char *
PyUnicode_GetDefaultEncoding(void)
{
    static const char default_encoding[] = "utf-8";

    return default_encoding;
}
2028
2029/* create or adjust a UnicodeDecodeError */
2030static void
2031make_decode_exception(PyObject **exceptionObject,
2032                      const char *encoding,
2033                      const char *input, Py_ssize_t length,
2034                      Py_ssize_t startpos, Py_ssize_t endpos,
2035                      const char *reason)
2036{
2037    if (*exceptionObject == NULL) {
2038        *exceptionObject = PyUnicodeDecodeError_Create(
2039            encoding, input, length, startpos, endpos, reason);
2040    }
2041    else {
2042        if (PyUnicodeDecodeError_SetStart(*exceptionObject, startpos))
2043            goto onError;
2044        if (PyUnicodeDecodeError_SetEnd(*exceptionObject, endpos))
2045            goto onError;
2046        if (PyUnicodeDecodeError_SetReason(*exceptionObject, reason))
2047            goto onError;
2048    }
2049    return;
2050
2051onError:
2052    Py_DECREF(*exceptionObject);
2053    *exceptionObject = NULL;
2054}
2055
2056/* error handling callback helper:
2057   build arguments, call the callback and check the arguments,
2058   if no exception occurred, copy the replacement to the output
2059   and adjust various state variables.
2060   return 0 on success, -1 on error
2061*/
2062
2063static int
2064unicode_decode_call_errorhandler(const char *errors, PyObject **errorHandler,
2065				 const char *encoding, const char *reason,
2066				 const char **input, const char **inend, Py_ssize_t *startinpos,
2067				 Py_ssize_t *endinpos, PyObject **exceptionObject, const char **inptr,
2068				 PyUnicodeObject **output, Py_ssize_t *outpos, Py_UNICODE **outptr)
2069{
2070    static char *argparse = "O!n;decoding error handler must return (str, int) tuple";
2071
2072    PyObject *restuple = NULL;
2073    PyObject *repunicode = NULL;
2074    Py_ssize_t outsize = PyUnicode_GET_SIZE(*output);
2075    Py_ssize_t insize;
2076    Py_ssize_t requiredsize;
2077    Py_ssize_t newpos;
2078    Py_UNICODE *repptr;
2079    PyObject *inputobj = NULL;
2080    Py_ssize_t repsize;
2081    int res = -1;
2082
2083    if (*errorHandler == NULL) {
2084        *errorHandler = PyCodec_LookupError(errors);
2085        if (*errorHandler == NULL)
2086            goto onError;
2087    }
2088
2089    make_decode_exception(exceptionObject,
2090        encoding,
2091        *input, *inend - *input,
2092        *startinpos, *endinpos,
2093        reason);
2094    if (*exceptionObject == NULL)
2095        goto onError;
2096
2097    restuple = PyObject_CallFunctionObjArgs(*errorHandler, *exceptionObject, NULL);
2098    if (restuple == NULL)
2099        goto onError;
2100    if (!PyTuple_Check(restuple)) {
2101        PyErr_SetString(PyExc_TypeError, &argparse[4]);
2102        goto onError;
2103    }
2104    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type, &repunicode, &newpos))
2105        goto onError;
2106
2107    /* Copy back the bytes variables, which might have been modified by the
2108       callback */
2109    inputobj = PyUnicodeDecodeError_GetObject(*exceptionObject);
2110    if (!inputobj)
2111        goto onError;
2112    if (!PyBytes_Check(inputobj)) {
2113        PyErr_Format(PyExc_TypeError, "exception attribute object must be bytes");
2114    }
2115    *input = PyBytes_AS_STRING(inputobj);
2116    insize = PyBytes_GET_SIZE(inputobj);
2117    *inend = *input + insize;
2118    /* we can DECREF safely, as the exception has another reference,
2119       so the object won't go away. */
2120    Py_DECREF(inputobj);
2121
2122    if (newpos<0)
2123        newpos = insize+newpos;
2124    if (newpos<0 || newpos>insize) {
2125        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", newpos);
2126        goto onError;
2127    }
2128
2129    /* need more space? (at least enough for what we
2130       have+the replacement+the rest of the string (starting
2131       at the new input position), so we won't have to check space
2132       when there are no errors in the rest of the string) */
2133    repptr = PyUnicode_AS_UNICODE(repunicode);
2134    repsize = PyUnicode_GET_SIZE(repunicode);
2135    requiredsize = *outpos + repsize + insize-newpos;
2136    if (requiredsize > outsize) {
2137        if (requiredsize<2*outsize)
2138            requiredsize = 2*outsize;
2139        if (_PyUnicode_Resize(output, requiredsize) < 0)
2140            goto onError;
2141        *outptr = PyUnicode_AS_UNICODE(*output) + *outpos;
2142    }
2143    *endinpos = newpos;
2144    *inptr = *input + newpos;
2145    Py_UNICODE_COPY(*outptr, repptr, repsize);
2146    *outptr += repsize;
2147    *outpos += repsize;
2148
2149    /* we made it! */
2150    res = 0;
2151
2152  onError:
2153    Py_XDECREF(restuple);
2154    return res;
2155}
2156
2157/* --- UTF-7 Codec -------------------------------------------------------- */
2158
2159/* See RFC2152 for details.  We encode conservatively and decode liberally. */
2160
/* Base-64 helper macros for the UTF-7 codec.  Each macro may evaluate its
   argument more than once, so pass side-effect-free expressions only. */

/* True if c is one of the 64 base-64 alphabet characters. */
#define IS_BASE64(c)                    \
    (((c) >= 'A' && (c) <= 'Z') ||      \
     ((c) >= 'a' && (c) <= 'z') ||      \
     ((c) >= '0' && (c) <= '9') ||      \
     (c) == '+' ||                      \
     (c) == '/')

/* Value (0..63) of the base-64 character c.  The result is only
   meaningful when IS_BASE64(c) holds; any other input maps to 63. */
#define FROM_BASE64(c)                                  \
    (((c) >= 'A' && (c) <= 'Z') ? (c) - 'A' :           \
     ((c) >= 'a' && (c) <= 'z') ? (c) - 'a' + 26 :      \
     ((c) >= '0' && (c) <= '9') ? (c) - '0' + 52 :      \
     (c) == '+' ? 62 : 63)

/* Base-64 character encoding the low 6 bits of n. */
#define TO_BASE64(n)  \
    ("ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"[(n) & 0x3f])

/* DECODE_DIRECT: true if byte c, met outside a shift sequence, decodes
 * as itself.  Decoding is permissive: every ASCII byte qualifies except
 * '+', which opens a base-64 section. */
#define DECODE_DIRECT(c)                \
    ((c) <= 127 && (c) != '+')
2191
/* The UTF-7 encoder treats ASCII characters differently according to
 * whether they are Set D, Set O, Whitespace, or special (i.e. none of
 * the above).  See RFC2152.  This array identifies these different
 * sets:
 * 0 : "Set D"
 *     alphanumeric and '(),-./:?
 * 1 : "Set O"
 *     !"#$%&*;<=>@[]^_`{|}
 * 2 : "whitespace"
 *     ht nl cr sp
 * 3 : special (must be base64 encoded)
 *     everything else (i.e. +\~ and non-printing codes 0-8 11-12 14-31 127)
 *
 * The table is indexed by ASCII code (0..127) and is never written to,
 * hence const.
 */

static
const char utf7_category[128] = {
/* nul soh stx etx eot enq ack bel bs  ht  nl  vt  np  cr  so  si  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  2,  2,  3,  3,  2,  3,  3,
/* dle dc1 dc2 dc3 dc4 nak syn etb can em  sub esc fs  gs  rs  us  */
    3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,  3,
/* sp   !   "   #   $   %   &   '   (   )   *   +   ,   -   .   /  */
    2,  1,  1,  1,  1,  1,  1,  0,  0,  0,  1,  3,  0,  0,  0,  0,
/*  0   1   2   3   4   5   6   7   8   9   :   ;   <   =   >   ?  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  1,  0,
/*  @   A   B   C   D   E   F   G   H   I   J   K   L   M   N   O  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  P   Q   R   S   T   U   V   W   X   Y   Z   [   \   ]   ^   _  */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  3,  1,  1,  1,
/*  `   a   b   c   d   e   f   g   h   i   j   k   l   m   n   o  */
    1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
/*  p   q   r   s   t   u   v   w   x   y   z   {   |   }   ~  del */
    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,  1,  1,  3,  3,
};
2225
/* ENCODE_DIRECT: this character should be encoded as itself.  The
 * answer depends on whether we are encoding set O as itself, and also
 * on whether we are encoding whitespace as itself.  RFC2152 makes it
 * clear that the answers to these questions vary between
 * applications, so this code needs to be flexible.
 *
 * c is looked up in utf7_category only when 0 < c < 128.  directO and
 * directWS are parenthesized in the expansion so that compound flag
 * expressions (e.g. `a || b`) are evaluated as a unit.  c may be
 * evaluated several times. */

#define ENCODE_DIRECT(c, directO, directWS)             \
    ((c) < 128 && (c) > 0 &&                            \
     ((utf7_category[(c)] == 0) ||                      \
      ((directWS) && (utf7_category[(c)] == 2)) ||      \
      ((directO) && (utf7_category[(c)] == 1))))
2237
2238PyObject *
2239PyUnicode_DecodeUTF7(const char *s,
2240		     Py_ssize_t size,
2241		     const char *errors)
2242{
2243    return PyUnicode_DecodeUTF7Stateful(s, size, errors, NULL);
2244}
2245
2246/* The decoder.  The only state we preserve is our read position,
2247 * i.e. how many characters we have consumed.  So if we end in the
2248 * middle of a shift sequence we have to back off the read position
2249 * and the output to the beginning of the sequence, otherwise we lose
2250 * all the shift state (seen bits, number of bits seen, high
2251 * surrogate). */
2252
2253PyObject *
2254PyUnicode_DecodeUTF7Stateful(const char *s,
2255			     Py_ssize_t size,
2256			     const char *errors,
2257			     Py_ssize_t *consumed)
2258{
2259    const char *starts = s;
2260    Py_ssize_t startinpos;
2261    Py_ssize_t endinpos;
2262    Py_ssize_t outpos;
2263    const char *e;
2264    PyUnicodeObject *unicode;
2265    Py_UNICODE *p;
2266    const char *errmsg = "";
2267    int inShift = 0;
2268    Py_UNICODE *shiftOutStart;
2269    unsigned int base64bits = 0;
2270    unsigned long base64buffer = 0;
2271    Py_UNICODE surrogate = 0;
2272    PyObject *errorHandler = NULL;
2273    PyObject *exc = NULL;
2274
2275    unicode = _PyUnicode_New(size);
2276    if (!unicode)
2277        return NULL;
2278    if (size == 0) {
2279        if (consumed)
2280            *consumed = 0;
2281        return (PyObject *)unicode;
2282    }
2283
2284    p = unicode->str;
2285    shiftOutStart = p;
2286    e = s + size;
2287
2288    while (s < e) {
2289        Py_UNICODE ch;
2290      restart:
2291        ch = (unsigned char) *s;
2292
2293        if (inShift) { /* in a base-64 section */
2294            if (IS_BASE64(ch)) { /* consume a base-64 character */
2295                base64buffer = (base64buffer << 6) | FROM_BASE64(ch);
2296                base64bits += 6;
2297                s++;
2298                if (base64bits >= 16) {
2299                    /* we have enough bits for a UTF-16 value */
2300                    Py_UNICODE outCh = (Py_UNICODE)
2301                                       (base64buffer >> (base64bits-16));
2302                    base64bits -= 16;
2303                    base64buffer &= (1 << base64bits) - 1; /* clear high bits */
2304                    if (surrogate) {
2305                        /* expecting a second surrogate */
2306                        if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2307#ifdef Py_UNICODE_WIDE
2308                            *p++ = (((surrogate & 0x3FF)<<10)
2309                                    | (outCh & 0x3FF)) + 0x10000;
2310#else
2311                            *p++ = surrogate;
2312                            *p++ = outCh;
2313#endif
2314                            surrogate = 0;
2315                        }
2316                        else {
2317                            surrogate = 0;
2318                            errmsg = "second surrogate missing";
2319                            goto utf7Error;
2320                        }
2321                    }
2322                    else if (outCh >= 0xD800 && outCh <= 0xDBFF) {
2323                        /* first surrogate */
2324                        surrogate = outCh;
2325                    }
2326                    else if (outCh >= 0xDC00 && outCh <= 0xDFFF) {
2327                        errmsg = "unexpected second surrogate";
2328                        goto utf7Error;
2329                    }
2330                    else {
2331                        *p++ = outCh;
2332                    }
2333                }
2334            }
2335            else { /* now leaving a base-64 section */
2336                inShift = 0;
2337                s++;
2338                if (surrogate) {
2339                    errmsg = "second surrogate missing at end of shift sequence";
2340                    goto utf7Error;
2341                }
2342                if (base64bits > 0) { /* left-over bits */
2343                    if (base64bits >= 6) {
2344                        /* We've seen at least one base-64 character */
2345                        errmsg = "partial character in shift sequence";
2346                        goto utf7Error;
2347                    }
2348                    else {
2349                        /* Some bits remain; they should be zero */
2350                        if (base64buffer != 0) {
2351                            errmsg = "non-zero padding bits in shift sequence";
2352                            goto utf7Error;
2353                        }
2354                    }
2355                }
2356                if (ch != '-') {
2357                    /* '-' is absorbed; other terminating
2358                       characters are preserved */
2359                    *p++ = ch;
2360                }
2361            }
2362        }
2363        else if ( ch == '+' ) {
2364            startinpos = s-starts;
2365            s++; /* consume '+' */
2366            if (s < e && *s == '-') { /* '+-' encodes '+' */
2367                s++;
2368                *p++ = '+';
2369            }
2370            else { /* begin base64-encoded section */
2371                inShift = 1;
2372                shiftOutStart = p;
2373                base64bits = 0;
2374            }
2375        }
2376        else if (DECODE_DIRECT(ch)) { /* character decodes as itself */
2377            *p++ = ch;
2378            s++;
2379        }
2380        else {
2381            startinpos = s-starts;
2382            s++;
2383            errmsg = "unexpected special character";
2384            goto utf7Error;
2385        }
2386        continue;
2387utf7Error:
2388        outpos = p-PyUnicode_AS_UNICODE(unicode);
2389        endinpos = s-starts;
2390        if (unicode_decode_call_errorhandler(
2391                errors, &errorHandler,
2392                "utf7", errmsg,
2393                &starts, &e, &startinpos, &endinpos, &exc, &s,
2394                &unicode, &outpos, &p))
2395            goto onError;
2396    }
2397
2398    /* end of string */
2399
2400    if (inShift && !consumed) { /* in shift sequence, no more to follow */
2401        /* if we're in an inconsistent state, that's an error */
2402        if (surrogate ||
2403                (base64bits >= 6) ||
2404                (base64bits > 0 && base64buffer != 0)) {
2405            outpos = p-PyUnicode_AS_UNICODE(unicode);
2406            endinpos = size;
2407            if (unicode_decode_call_errorhandler(
2408                    errors, &errorHandler,
2409                    "utf7", "unterminated shift sequence",
2410                    &starts, &e, &startinpos, &endinpos, &exc, &s,
2411                    &unicode, &outpos, &p))
2412                goto onError;
2413            if (s < e)
2414                goto restart;
2415        }
2416    }
2417
2418    /* return state */
2419    if (consumed) {
2420        if (inShift) {
2421            p = shiftOutStart; /* back off output */
2422            *consumed = startinpos;
2423        }
2424        else {
2425            *consumed = s-starts;
2426        }
2427    }
2428
2429    if (_PyUnicode_Resize(&unicode, p - PyUnicode_AS_UNICODE(unicode)) < 0)
2430        goto onError;
2431
2432    Py_XDECREF(errorHandler);
2433    Py_XDECREF(exc);
2434    return (PyObject *)unicode;
2435
2436  onError:
2437    Py_XDECREF(errorHandler);
2438    Py_XDECREF(exc);
2439    Py_DECREF(unicode);
2440    return NULL;
2441}
2442
2443
2444PyObject *
2445PyUnicode_EncodeUTF7(const Py_UNICODE *s,
2446		     Py_ssize_t size,
2447		     int base64SetO,
2448		     int base64WhiteSpace,
2449		     const char *errors)
2450{
2451    PyObject *v;
2452    /* It might be possible to tighten this worst case */
2453    Py_ssize_t allocated = 8 * size;
2454    int inShift = 0;
2455    Py_ssize_t i = 0;
2456    unsigned int base64bits = 0;
2457    unsigned long base64buffer = 0;
2458    char * out;
2459    char * start;
2460
2461    if (size == 0)
2462        return PyBytes_FromStringAndSize(NULL, 0);
2463
2464    if (allocated / 8 != size)
2465        return PyErr_NoMemory();
2466
2467    v = PyBytes_FromStringAndSize(NULL, allocated);
2468    if (v == NULL)
2469        return NULL;
2470
2471    start = out = PyBytes_AS_STRING(v);
2472    for (;i < size; ++i) {
2473        Py_UNICODE ch = s[i];
2474
2475        if (inShift) {
2476            if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2477                /* shifting out */
2478                if (base64bits) { /* output remaining bits */
2479                    *out++ = TO_BASE64(base64buffer << (6-base64bits));
2480                    base64buffer = 0;
2481                    base64bits = 0;
2482                }
2483                inShift = 0;
2484                /* Characters not in the BASE64 set implicitly unshift the sequence
2485                   so no '-' is required, except if the character is itself a '-' */
2486                if (IS_BASE64(ch) || ch == '-') {
2487                    *out++ = '-';
2488                }
2489                *out++ = (char) ch;
2490            }
2491            else {
2492                goto encode_char;
2493            }
2494        }
2495        else { /* not in a shift sequence */
2496            if (ch == '+') {
2497                *out++ = '+';
2498                        *out++ = '-';
2499            }
2500            else if (ENCODE_DIRECT(ch, !base64SetO, !base64WhiteSpace)) {
2501                *out++ = (char) ch;
2502            }
2503            else {
2504                *out++ = '+';
2505                inShift = 1;
2506                goto encode_char;
2507            }
2508        }
2509        continue;
2510encode_char:
2511#ifdef Py_UNICODE_WIDE
2512        if (ch >= 0x10000) {
2513            /* code first surrogate */
2514            base64bits += 16;
2515            base64buffer = (base64buffer << 16) | 0xd800 | ((ch-0x10000) >> 10);
2516            while (base64bits >= 6) {
2517                *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2518                base64bits -= 6;
2519            }
2520            /* prepare second surrogate */
2521            ch =  0xDC00 | ((ch-0x10000) & 0x3FF);
2522        }
2523#endif
2524        base64bits += 16;
2525        base64buffer = (base64buffer << 16) | ch;
2526        while (base64bits >= 6) {
2527            *out++ = TO_BASE64(base64buffer >> (base64bits-6));
2528            base64bits -= 6;
2529        }
2530    }
2531    if (base64bits)
2532        *out++= TO_BASE64(base64buffer << (6-base64bits) );
2533    if (inShift)
2534        *out++ = '-';
2535    if (_PyBytes_Resize(&v, out - start) < 0)
2536        return NULL;
2537    return v;
2538}
2539
2540#undef IS_BASE64
2541#undef FROM_BASE64
2542#undef TO_BASE64
2543#undef DECODE_DIRECT
2544#undef ENCODE_DIRECT
2545
2546/* --- UTF-8 Codec -------------------------------------------------------- */
2547
/* Map UTF-8 encoded prefix byte to sequence length.  Zero means
   illegal prefix.  See RFC 3629 for details.  The table is never
   written to, hence const. */
static
const char utf8_code_length[256] = {
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 00-0F */
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
    1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, /* 70-7F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* 80-8F */
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, /* B0-BF */
    0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* C0-C1 + C2-CF */
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, /* D0-DF */
    3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, /* E0-EF */
    4, 4, 4, 4, 4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0  /* F0-F4 + F5-FF */
};
2569
2570PyObject *
2571PyUnicode_DecodeUTF8(const char *s,
2572		     Py_ssize_t size,
2573		     const char *errors)
2574{
2575    return PyUnicode_DecodeUTF8Stateful(s, size, errors, NULL);
2576}
2577
2578/* Mask to check or force alignment of a pointer to C 'long' boundaries */
2579#define LONG_PTR_MASK (size_t) (SIZEOF_LONG - 1)
2580
2581/* Mask to quickly check whether a C 'long' contains a
2582   non-ASCII, UTF8-encoded char. */
2583#if (SIZEOF_LONG == 8)
2584# define ASCII_CHAR_MASK 0x8080808080808080L
2585#elif (SIZEOF_LONG == 4)
2586# define ASCII_CHAR_MASK 0x80808080L
2587#else
2588# error C 'long' size should be either 4 or 8!
2589#endif
2590
2591PyObject *
2592PyUnicode_DecodeUTF8Stateful(const char *s,
2593			     Py_ssize_t size,
2594			     const char *errors,
2595			     Py_ssize_t *consumed)
2596{
2597    const char *starts = s;
2598    int n;
2599    int k;
2600    Py_ssize_t startinpos;
2601    Py_ssize_t endinpos;
2602    Py_ssize_t outpos;
2603    const char *e, *aligned_end;
2604    PyUnicodeObject *unicode;
2605    Py_UNICODE *p;
2606    const char *errmsg = "";
2607    PyObject *errorHandler = NULL;
2608    PyObject *exc = NULL;
2609
2610    /* Note: size will always be longer than the resulting Unicode
2611       character count */
2612    unicode = _PyUnicode_New(size);
2613    if (!unicode)
2614        return NULL;
2615    if (size == 0) {
2616        if (consumed)
2617            *consumed = 0;
2618        return (PyObject *)unicode;
2619    }
2620
2621    /* Unpack UTF-8 encoded data */
2622    p = unicode->str;
2623    e = s + size;
2624    aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2625
2626    while (s < e) {
2627        Py_UCS4 ch = (unsigned char)*s;
2628
2629        if (ch < 0x80) {
2630            /* Fast path for runs of ASCII characters. Given that common UTF-8
2631               input will consist of an overwhelming majority of ASCII
2632               characters, we try to optimize for this case by checking
2633               as many characters as a C 'long' can contain.
2634               First, check if we can do an aligned read, as most CPUs have
2635               a penalty for unaligned reads.
2636            */
2637            if (!((size_t) s & LONG_PTR_MASK)) {
2638                /* Help register allocation */
2639                register const char *_s = s;
2640                register Py_UNICODE *_p = p;
2641                while (_s < aligned_end) {
2642                    /* Read a whole long at a time (either 4 or 8 bytes),
2643                       and do a fast unrolled copy if it only contains ASCII
2644                       characters. */
2645                    unsigned long data = *(unsigned long *) _s;
2646                    if (data & ASCII_CHAR_MASK)
2647                        break;
2648                    _p[0] = (unsigned char) _s[0];
2649                    _p[1] = (unsigned char) _s[1];
2650                    _p[2] = (unsigned char) _s[2];
2651                    _p[3] = (unsigned char) _s[3];
2652#if (SIZEOF_LONG == 8)
2653                    _p[4] = (unsigned char) _s[4];
2654                    _p[5] = (unsigned char) _s[5];
2655                    _p[6] = (unsigned char) _s[6];
2656                    _p[7] = (unsigned char) _s[7];
2657#endif
2658                    _s += SIZEOF_LONG;
2659                    _p += SIZEOF_LONG;
2660                }
2661                s = _s;
2662                p = _p;
2663                if (s == e)
2664                    break;
2665                ch = (unsigned char)*s;
2666            }
2667        }
2668
2669        if (ch < 0x80) {
2670            *p++ = (Py_UNICODE)ch;
2671            s++;
2672            continue;
2673        }
2674
2675        n = utf8_code_length[ch];
2676
2677        if (s + n > e) {
2678            if (consumed)
2679                break;
2680            else {
2681                errmsg = "unexpected end of data";
2682                startinpos = s-starts;
2683                endinpos = startinpos+1;
2684                for (k=1; (k < size-startinpos) && ((s[k]&0xC0) == 0x80); k++)
2685                    endinpos++;
2686                goto utf8Error;
2687            }
2688        }
2689
2690        switch (n) {
2691
2692        case 0:
2693            errmsg = "invalid start byte";
2694            startinpos = s-starts;
2695            endinpos = startinpos+1;
2696            goto utf8Error;
2697
2698        case 1:
2699            errmsg = "internal error";
2700            startinpos = s-starts;
2701            endinpos = startinpos+1;
2702            goto utf8Error;
2703
2704        case 2:
2705            if ((s[1] & 0xc0) != 0x80) {
2706                errmsg = "invalid continuation byte";
2707                startinpos = s-starts;
2708                endinpos = startinpos + 1;
2709                goto utf8Error;
2710            }
2711            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2712            assert ((ch > 0x007F) && (ch <= 0x07FF));
2713            *p++ = (Py_UNICODE)ch;
2714            break;
2715
2716        case 3:
2717            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2718               will result in surrogates in range d800-dfff. Surrogates are
2719               not valid UTF-8 so they are rejected.
2720               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2721               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2722            if ((s[1] & 0xc0) != 0x80 ||
2723                (s[2] & 0xc0) != 0x80 ||
2724                ((unsigned char)s[0] == 0xE0 &&
2725                 (unsigned char)s[1] < 0xA0) ||
2726                ((unsigned char)s[0] == 0xED &&
2727                 (unsigned char)s[1] > 0x9F)) {
2728                errmsg = "invalid continuation byte";
2729                startinpos = s-starts;
2730                endinpos = startinpos + 1;
2731
2732                /* if s[1] first two bits are 1 and 0, then the invalid
2733                   continuation byte is s[2], so increment endinpos by 1,
2734                   if not, s[1] is invalid and endinpos doesn't need to
2735                   be incremented. */
2736                if ((s[1] & 0xC0) == 0x80)
2737                    endinpos++;
2738                goto utf8Error;
2739            }
2740            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2741            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2742            *p++ = (Py_UNICODE)ch;
2743            break;
2744
2745        case 4:
2746            if ((s[1] & 0xc0) != 0x80 ||
2747                (s[2] & 0xc0) != 0x80 ||
2748                (s[3] & 0xc0) != 0x80 ||
2749                ((unsigned char)s[0] == 0xF0 &&
2750                 (unsigned char)s[1] < 0x90) ||
2751                ((unsigned char)s[0] == 0xF4 &&
2752                 (unsigned char)s[1] > 0x8F)) {
2753                errmsg = "invalid continuation byte";
2754                startinpos = s-starts;
2755                endinpos = startinpos + 1;
2756                if ((s[1] & 0xC0) == 0x80) {
2757                    endinpos++;
2758                    if ((s[2] & 0xC0) == 0x80)
2759                        endinpos++;
2760                }
2761                goto utf8Error;
2762            }
2763            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2764                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2765            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2766
2767#ifdef Py_UNICODE_WIDE
2768            *p++ = (Py_UNICODE)ch;
2769#else
2770            /*  compute and append the two surrogates: */
2771
2772            /*  translate from 10000..10FFFF to 0..FFFF */
2773            ch -= 0x10000;
2774
2775            /*  high surrogate = top 10 bits added to D800 */
2776            *p++ = (Py_UNICODE)(0xD800 + (ch >> 10));
2777
2778            /*  low surrogate = bottom 10 bits added to DC00 */
2779            *p++ = (Py_UNICODE)(0xDC00 + (ch & 0x03FF));
2780#endif
2781            break;
2782        }
2783        s += n;
2784        continue;
2785
2786      utf8Error:
2787        outpos = p-PyUnicode_AS_UNICODE(unicode);
2788        if (unicode_decode_call_errorhandler(
2789                errors, &errorHandler,
2790                "utf8", errmsg,
2791                &starts, &e, &startinpos, &endinpos, &exc, &s,
2792                &unicode, &outpos, &p))
2793            goto onError;
2794        aligned_end = (const char *) ((size_t) e & ~LONG_PTR_MASK);
2795    }
2796    if (consumed)
2797        *consumed = s-starts;
2798
2799    /* Adjust length */
2800    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
2801        goto onError;
2802
2803    Py_XDECREF(errorHandler);
2804    Py_XDECREF(exc);
2805    return (PyObject *)unicode;
2806
2807  onError:
2808    Py_XDECREF(errorHandler);
2809    Py_XDECREF(exc);
2810    Py_DECREF(unicode);
2811    return NULL;
2812}
2813
2814#undef ASCII_CHAR_MASK
2815
2816#ifdef __APPLE__
2817
2818/* Simplified UTF-8 decoder using surrogateescape error handler,
2819   used to decode the command line arguments on Mac OS X. */
2820
2821wchar_t*
2822_Py_DecodeUTF8_surrogateescape(const char *s, Py_ssize_t size)
2823{
2824    int n;
2825    const char *e;
2826    wchar_t *unicode, *p;
2827
2828    /* Note: size will always be longer than the resulting Unicode
2829       character count */
2830    if (PY_SSIZE_T_MAX / sizeof(wchar_t) < (size + 1)) {
2831        PyErr_NoMemory();
2832        return NULL;
2833    }
2834    unicode = PyMem_Malloc((size + 1) * sizeof(wchar_t));
2835    if (!unicode)
2836        return NULL;
2837
2838    /* Unpack UTF-8 encoded data */
2839    p = unicode;
2840    e = s + size;
2841    while (s < e) {
2842        Py_UCS4 ch = (unsigned char)*s;
2843
2844        if (ch < 0x80) {
2845            *p++ = (wchar_t)ch;
2846            s++;
2847            continue;
2848        }
2849
2850        n = utf8_code_length[ch];
2851        if (s + n > e) {
2852            goto surrogateescape;
2853        }
2854
2855        switch (n) {
2856        case 0:
2857        case 1:
2858            goto surrogateescape;
2859
2860        case 2:
2861            if ((s[1] & 0xc0) != 0x80)
2862                goto surrogateescape;
2863            ch = ((s[0] & 0x1f) << 6) + (s[1] & 0x3f);
2864            assert ((ch > 0x007F) && (ch <= 0x07FF));
2865            *p++ = (wchar_t)ch;
2866            break;
2867
2868        case 3:
2869            /* Decoding UTF-8 sequences in range \xed\xa0\x80-\xed\xbf\xbf
2870               will result in surrogates in range d800-dfff. Surrogates are
2871               not valid UTF-8 so they are rejected.
2872               See http://www.unicode.org/versions/Unicode5.2.0/ch03.pdf
2873               (table 3-7) and http://www.rfc-editor.org/rfc/rfc3629.txt */
2874            if ((s[1] & 0xc0) != 0x80 ||
2875                (s[2] & 0xc0) != 0x80 ||
2876                ((unsigned char)s[0] == 0xE0 &&
2877                 (unsigned char)s[1] < 0xA0) ||
2878                ((unsigned char)s[0] == 0xED &&
2879                 (unsigned char)s[1] > 0x9F)) {
2880
2881                goto surrogateescape;
2882            }
2883            ch = ((s[0] & 0x0f) << 12) + ((s[1] & 0x3f) << 6) + (s[2] & 0x3f);
2884            assert ((ch > 0x07FF) && (ch <= 0xFFFF));
2885            *p++ = (Py_UNICODE)ch;
2886            break;
2887
2888        case 4:
2889            if ((s[1] & 0xc0) != 0x80 ||
2890                (s[2] & 0xc0) != 0x80 ||
2891                (s[3] & 0xc0) != 0x80 ||
2892                ((unsigned char)s[0] == 0xF0 &&
2893                 (unsigned char)s[1] < 0x90) ||
2894                ((unsigned char)s[0] == 0xF4 &&
2895                 (unsigned char)s[1] > 0x8F)) {
2896                goto surrogateescape;
2897            }
2898            ch = ((s[0] & 0x7) << 18) + ((s[1] & 0x3f) << 12) +
2899                 ((s[2] & 0x3f) << 6) + (s[3] & 0x3f);
2900            assert ((ch > 0xFFFF) && (ch <= 0x10ffff));
2901
2902#if SIZEOF_WCHAR_T == 4
2903            *p++ = (wchar_t)ch;
2904#else
2905            /*  compute and append the two surrogates: */
2906
2907            /*  translate from 10000..10FFFF to 0..FFFF */
2908            ch -= 0x10000;
2909
2910            /*  high surrogate = top 10 bits added to D800 */
2911            *p++ = (wchar_t)(0xD800 + (ch >> 10));
2912
2913            /*  low surrogate = bottom 10 bits added to DC00 */
2914            *p++ = (wchar_t)(0xDC00 + (ch & 0x03FF));
2915#endif
2916            break;
2917        }
2918        s += n;
2919        continue;
2920
2921      surrogateescape:
2922        *p++ = 0xDC00 + ch;
2923        s++;
2924    }
2925    *p = L'\0';
2926    return unicode;
2927}
2928
2929#endif /* __APPLE__ */
2930
2931/* Allocation strategy:  if the string is short, convert into a stack buffer
2932   and allocate exactly as much space needed at the end.  Else allocate the
2933   maximum possible needed (4 result bytes per Unicode character), and return
2934   the excess memory at the end.
2935*/
2936PyObject *
2937PyUnicode_EncodeUTF8(const Py_UNICODE *s,
2938                     Py_ssize_t size,
2939                     const char *errors)
2940{
2941#define MAX_SHORT_UNICHARS 300  /* largest size we'll do on the stack */
2942
2943    Py_ssize_t i;                /* index into s of next input byte */
2944    PyObject *result;            /* result string object */
2945    char *p;                     /* next free byte in output buffer */
2946    Py_ssize_t nallocated;      /* number of result bytes allocated */
2947    Py_ssize_t nneeded;            /* number of result bytes needed */
2948    char stackbuf[MAX_SHORT_UNICHARS * 4];
2949    PyObject *errorHandler = NULL;
2950    PyObject *exc = NULL;
2951
2952    assert(s != NULL);
2953    assert(size >= 0);
2954
2955    if (size <= MAX_SHORT_UNICHARS) {
2956        /* Write into the stack buffer; nallocated can't overflow.
2957         * At the end, we'll allocate exactly as much heap space as it
2958         * turns out we need.
2959         */
2960        nallocated = Py_SAFE_DOWNCAST(sizeof(stackbuf), size_t, int);
2961        result = NULL;   /* will allocate after we're done */
2962        p = stackbuf;
2963    }
2964    else {
2965        /* Overallocate on the heap, and give the excess back at the end. */
2966        nallocated = size * 4;
2967        if (nallocated / 4 != size)  /* overflow! */
2968            return PyErr_NoMemory();
2969        result = PyBytes_FromStringAndSize(NULL, nallocated);
2970        if (result == NULL)
2971            return NULL;
2972        p = PyBytes_AS_STRING(result);
2973    }
2974
2975    for (i = 0; i < size;) {
2976        Py_UCS4 ch = s[i++];
2977
2978        if (ch < 0x80)
2979            /* Encode ASCII */
2980            *p++ = (char) ch;
2981
2982        else if (ch < 0x0800) {
2983            /* Encode Latin-1 */
2984            *p++ = (char)(0xc0 | (ch >> 6));
2985            *p++ = (char)(0x80 | (ch & 0x3f));
2986        } else if (0xD800 <= ch && ch <= 0xDFFF) {
2987#ifndef Py_UNICODE_WIDE
2988            /* Special case: check for high and low surrogate */
2989            if (ch <= 0xDBFF && i != size && 0xDC00 <= s[i] && s[i] <= 0xDFFF) {
2990                Py_UCS4 ch2 = s[i];
2991                /* Combine the two surrogates to form a UCS4 value */
2992                ch = ((ch - 0xD800) << 10 | (ch2 - 0xDC00)) + 0x10000;
2993                i++;
2994
2995                /* Encode UCS4 Unicode ordinals */
2996                *p++ = (char)(0xf0 | (ch >> 18));
2997                *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
2998                *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
2999                *p++ = (char)(0x80 | (ch & 0x3f));
3000            } else {
3001#endif
3002                Py_ssize_t newpos;
3003                PyObject *rep;
3004                Py_ssize_t repsize, k;
3005                rep = unicode_encode_call_errorhandler
3006                    (errors, &errorHandler, "utf-8", "surrogates not allowed",
3007                     s, size, &exc, i-1, i, &newpos);
3008                if (!rep)
3009                    goto error;
3010
3011                if (PyBytes_Check(rep))
3012                    repsize = PyBytes_GET_SIZE(rep);
3013                else
3014                    repsize = PyUnicode_GET_SIZE(rep);
3015
3016                if (repsize > 4) {
3017                    Py_ssize_t offset;
3018
3019                    if (result == NULL)
3020                        offset = p - stackbuf;
3021                    else
3022                        offset = p - PyBytes_AS_STRING(result);
3023
3024                    if (nallocated > PY_SSIZE_T_MAX - repsize + 4) {
3025                        /* integer overflow */
3026                        PyErr_NoMemory();
3027                        goto error;
3028                    }
3029                    nallocated += repsize - 4;
3030                    if (result != NULL) {
3031                        if (_PyBytes_Resize(&result, nallocated) < 0)
3032                            goto error;
3033                    } else {
3034                        result = PyBytes_FromStringAndSize(NULL, nallocated);
3035                        if (result == NULL)
3036                            goto error;
3037                        Py_MEMCPY(PyBytes_AS_STRING(result), stackbuf, offset);
3038                    }
3039                    p = PyBytes_AS_STRING(result) + offset;
3040                }
3041
3042                if (PyBytes_Check(rep)) {
3043                    char *prep = PyBytes_AS_STRING(rep);
3044                    for(k = repsize; k > 0; k--)
3045                        *p++ = *prep++;
3046                } else /* rep is unicode */ {
3047                    Py_UNICODE *prep = PyUnicode_AS_UNICODE(rep);
3048                    Py_UNICODE c;
3049
3050                    for(k=0; k<repsize; k++) {
3051                        c = prep[k];
3052                        if (0x80 <= c) {
3053                            raise_encode_exception(&exc, "utf-8", s, size,
3054                                                   i-1, i, "surrogates not allowed");
3055                            goto error;
3056                        }
3057                        *p++ = (char)prep[k];
3058                    }
3059                }
3060                Py_DECREF(rep);
3061#ifndef Py_UNICODE_WIDE
3062            }
3063#endif
3064        } else if (ch < 0x10000) {
3065            *p++ = (char)(0xe0 | (ch >> 12));
3066            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3067            *p++ = (char)(0x80 | (ch & 0x3f));
3068        } else /* ch >= 0x10000 */ {
3069            /* Encode UCS4 Unicode ordinals */
3070            *p++ = (char)(0xf0 | (ch >> 18));
3071            *p++ = (char)(0x80 | ((ch >> 12) & 0x3f));
3072            *p++ = (char)(0x80 | ((ch >> 6) & 0x3f));
3073            *p++ = (char)(0x80 | (ch & 0x3f));
3074        }
3075    }
3076
3077    if (result == NULL) {
3078        /* This was stack allocated. */
3079        nneeded = p - stackbuf;
3080        assert(nneeded <= nallocated);
3081        result = PyBytes_FromStringAndSize(stackbuf, nneeded);
3082    }
3083    else {
3084        /* Cut back to size actually needed. */
3085        nneeded = p - PyBytes_AS_STRING(result);
3086        assert(nneeded <= nallocated);
3087        _PyBytes_Resize(&result, nneeded);
3088    }
3089    Py_XDECREF(errorHandler);
3090    Py_XDECREF(exc);
3091    return result;
3092 error:
3093    Py_XDECREF(errorHandler);
3094    Py_XDECREF(exc);
3095    Py_XDECREF(result);
3096    return NULL;
3097
3098#undef MAX_SHORT_UNICHARS
3099}
3100
3101PyObject *
3102PyUnicode_AsUTF8String(PyObject *unicode)
3103{
3104    PyObject *utf8;
3105    if (!PyUnicode_Check(unicode)) {
3106        PyErr_BadArgument();
3107        return NULL;
3108    }
3109    utf8 = _PyUnicode_AsDefaultEncodedString(unicode);
3110    if (utf8 == NULL)
3111        return NULL;
3112    Py_INCREF(utf8);
3113    return utf8;
3114}
3115
3116/* --- UTF-32 Codec ------------------------------------------------------- */
3117
3118PyObject *
3119PyUnicode_DecodeUTF32(const char *s,
3120                      Py_ssize_t size,
3121                      const char *errors,
3122                      int *byteorder)
3123{
3124    return PyUnicode_DecodeUTF32Stateful(s, size, errors, byteorder, NULL);
3125}
3126
3127PyObject *
3128PyUnicode_DecodeUTF32Stateful(const char *s,
3129                              Py_ssize_t size,
3130                              const char *errors,
3131                              int *byteorder,
3132                              Py_ssize_t *consumed)
3133{
3134    const char *starts = s;
3135    Py_ssize_t startinpos;
3136    Py_ssize_t endinpos;
3137    Py_ssize_t outpos;
3138    PyUnicodeObject *unicode;
3139    Py_UNICODE *p;
3140#ifndef Py_UNICODE_WIDE
3141    int pairs = 0;
3142    const unsigned char *qq;
3143#else
3144    const int pairs = 0;
3145#endif
3146    const unsigned char *q, *e;
3147    int bo = 0;       /* assume native ordering by default */
3148    const char *errmsg = "";
3149    /* Offsets from q for retrieving bytes in the right order. */
3150#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3151    int iorder[] = {0, 1, 2, 3};
3152#else
3153    int iorder[] = {3, 2, 1, 0};
3154#endif
3155    PyObject *errorHandler = NULL;
3156    PyObject *exc = NULL;
3157
3158    q = (unsigned char *)s;
3159    e = q + size;
3160
3161    if (byteorder)
3162        bo = *byteorder;
3163
3164    /* Check for BOM marks (U+FEFF) in the input and adjust current
3165       byte order setting accordingly. In native mode, the leading BOM
3166       mark is skipped, in all other modes, it is copied to the output
3167       stream as-is (giving a ZWNBSP character). */
3168    if (bo == 0) {
3169        if (size >= 4) {
3170            const Py_UCS4 bom = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3171                (q[iorder[1]] << 8) | q[iorder[0]];
3172#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3173            if (bom == 0x0000FEFF) {
3174                q += 4;
3175                bo = -1;
3176            }
3177            else if (bom == 0xFFFE0000) {
3178                q += 4;
3179                bo = 1;
3180            }
3181#else
3182            if (bom == 0x0000FEFF) {
3183                q += 4;
3184                bo = 1;
3185            }
3186            else if (bom == 0xFFFE0000) {
3187                q += 4;
3188                bo = -1;
3189            }
3190#endif
3191        }
3192    }
3193
3194    if (bo == -1) {
3195        /* force LE */
3196        iorder[0] = 0;
3197        iorder[1] = 1;
3198        iorder[2] = 2;
3199        iorder[3] = 3;
3200    }
3201    else if (bo == 1) {
3202        /* force BE */
3203        iorder[0] = 3;
3204        iorder[1] = 2;
3205        iorder[2] = 1;
3206        iorder[3] = 0;
3207    }
3208
3209    /* On narrow builds we split characters outside the BMP into two
3210       codepoints => count how much extra space we need. */
3211#ifndef Py_UNICODE_WIDE
3212    for (qq = q; qq < e; qq += 4)
3213        if (qq[iorder[2]] != 0 || qq[iorder[3]] != 0)
3214            pairs++;
3215#endif
3216
3217    /* This might be one to much, because of a BOM */
3218    unicode = _PyUnicode_New((size+3)/4+pairs);
3219    if (!unicode)
3220        return NULL;
3221    if (size == 0)
3222        return (PyObject *)unicode;
3223
3224    /* Unpack UTF-32 encoded data */
3225    p = unicode->str;
3226
3227    while (q < e) {
3228        Py_UCS4 ch;
3229        /* remaining bytes at the end? (size should be divisible by 4) */
3230        if (e-q<4) {
3231            if (consumed)
3232                break;
3233            errmsg = "truncated data";
3234            startinpos = ((const char *)q)-starts;
3235            endinpos = ((const char *)e)-starts;
3236            goto utf32Error;
3237            /* The remaining input chars are ignored if the callback
3238               chooses to skip the input */
3239        }
3240        ch = (q[iorder[3]] << 24) | (q[iorder[2]] << 16) |
3241            (q[iorder[1]] << 8) | q[iorder[0]];
3242
3243        if (ch >= 0x110000)
3244        {
3245            errmsg = "codepoint not in range(0x110000)";
3246            startinpos = ((const char *)q)-starts;
3247            endinpos = startinpos+4;
3248            goto utf32Error;
3249        }
3250#ifndef Py_UNICODE_WIDE
3251        if (ch >= 0x10000)
3252        {
3253            *p++ = 0xD800 | ((ch-0x10000) >> 10);
3254            *p++ = 0xDC00 | ((ch-0x10000) & 0x3FF);
3255        }
3256        else
3257#endif
3258            *p++ = ch;
3259        q += 4;
3260        continue;
3261      utf32Error:
3262        outpos = p-PyUnicode_AS_UNICODE(unicode);
3263        if (unicode_decode_call_errorhandler(
3264                errors, &errorHandler,
3265                "utf32", errmsg,
3266                &starts, (const char **)&e, &startinpos, &endinpos, &exc, (const char **)&q,
3267                &unicode, &outpos, &p))
3268            goto onError;
3269    }
3270
3271    if (byteorder)
3272        *byteorder = bo;
3273
3274    if (consumed)
3275        *consumed = (const char *)q-starts;
3276
3277    /* Adjust length */
3278    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3279        goto onError;
3280
3281    Py_XDECREF(errorHandler);
3282    Py_XDECREF(exc);
3283    return (PyObject *)unicode;
3284
3285  onError:
3286    Py_DECREF(unicode);
3287    Py_XDECREF(errorHandler);
3288    Py_XDECREF(exc);
3289    return NULL;
3290}
3291
3292PyObject *
3293PyUnicode_EncodeUTF32(const Py_UNICODE *s,
3294                      Py_ssize_t size,
3295                      const char *errors,
3296                      int byteorder)
3297{
3298    PyObject *v;
3299    unsigned char *p;
3300    Py_ssize_t nsize, bytesize;
3301#ifndef Py_UNICODE_WIDE
3302    Py_ssize_t i, pairs;
3303#else
3304    const int pairs = 0;
3305#endif
3306    /* Offsets from p for storing byte pairs in the right order. */
3307#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3308    int iorder[] = {0, 1, 2, 3};
3309#else
3310    int iorder[] = {3, 2, 1, 0};
3311#endif
3312
3313#define STORECHAR(CH)                           \
3314    do {                                        \
3315        p[iorder[3]] = ((CH) >> 24) & 0xff;     \
3316        p[iorder[2]] = ((CH) >> 16) & 0xff;     \
3317        p[iorder[1]] = ((CH) >> 8) & 0xff;      \
3318        p[iorder[0]] = (CH) & 0xff;             \
3319        p += 4;                                 \
3320    } while(0)
3321
3322    /* In narrow builds we can output surrogate pairs as one codepoint,
3323       so we need less space. */
3324#ifndef Py_UNICODE_WIDE
3325    for (i = pairs = 0; i < size-1; i++)
3326        if (0xD800 <= s[i] && s[i] <= 0xDBFF &&
3327            0xDC00 <= s[i+1] && s[i+1] <= 0xDFFF)
3328            pairs++;
3329#endif
3330    nsize = (size - pairs + (byteorder == 0));
3331    bytesize = nsize * 4;
3332    if (bytesize / 4 != nsize)
3333        return PyErr_NoMemory();
3334    v = PyBytes_FromStringAndSize(NULL, bytesize);
3335    if (v == NULL)
3336        return NULL;
3337
3338    p = (unsigned char *)PyBytes_AS_STRING(v);
3339    if (byteorder == 0)
3340        STORECHAR(0xFEFF);
3341    if (size == 0)
3342        goto done;
3343
3344    if (byteorder == -1) {
3345        /* force LE */
3346        iorder[0] = 0;
3347        iorder[1] = 1;
3348        iorder[2] = 2;
3349        iorder[3] = 3;
3350    }
3351    else if (byteorder == 1) {
3352        /* force BE */
3353        iorder[0] = 3;
3354        iorder[1] = 2;
3355        iorder[2] = 1;
3356        iorder[3] = 0;
3357    }
3358
3359    while (size-- > 0) {
3360        Py_UCS4 ch = *s++;
3361#ifndef Py_UNICODE_WIDE
3362        if (0xD800 <= ch && ch <= 0xDBFF && size > 0) {
3363            Py_UCS4 ch2 = *s;
3364            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3365                ch = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3366                s++;
3367                size--;
3368            }
3369        }
3370#endif
3371        STORECHAR(ch);
3372    }
3373
3374  done:
3375    return v;
3376#undef STORECHAR
3377}
3378
3379PyObject *
3380PyUnicode_AsUTF32String(PyObject *unicode)
3381{
3382    if (!PyUnicode_Check(unicode)) {
3383        PyErr_BadArgument();
3384        return NULL;
3385    }
3386    return PyUnicode_EncodeUTF32(PyUnicode_AS_UNICODE(unicode),
3387                                 PyUnicode_GET_SIZE(unicode),
3388                                 NULL,
3389                                 0);
3390}
3391
3392/* --- UTF-16 Codec ------------------------------------------------------- */
3393
3394PyObject *
3395PyUnicode_DecodeUTF16(const char *s,
3396                      Py_ssize_t size,
3397                      const char *errors,
3398                      int *byteorder)
3399{
3400    return PyUnicode_DecodeUTF16Stateful(s, size, errors, byteorder, NULL);
3401}
3402
3403/* Two masks for fast checking of whether a C 'long' may contain
3404   UTF16-encoded surrogate characters. This is an efficient heuristic,
3405   assuming that non-surrogate characters with a code point >= 0x8000 are
3406   rare in most input.
3407   FAST_CHAR_MASK is used when the input is in native byte ordering,
3408   SWAPPED_FAST_CHAR_MASK when the input is in byteswapped ordering.
3409*/
3410#if (SIZEOF_LONG == 8)
3411# define FAST_CHAR_MASK         0x8000800080008000L
3412# define SWAPPED_FAST_CHAR_MASK 0x0080008000800080L
3413#elif (SIZEOF_LONG == 4)
3414# define FAST_CHAR_MASK         0x80008000L
3415# define SWAPPED_FAST_CHAR_MASK 0x00800080L
3416#else
3417# error C 'long' size should be either 4 or 8!
3418#endif
3419
3420PyObject *
3421PyUnicode_DecodeUTF16Stateful(const char *s,
3422                              Py_ssize_t size,
3423                              const char *errors,
3424                              int *byteorder,
3425                              Py_ssize_t *consumed)
3426{
3427    const char *starts = s;
3428    Py_ssize_t startinpos;
3429    Py_ssize_t endinpos;
3430    Py_ssize_t outpos;
3431    PyUnicodeObject *unicode;
3432    Py_UNICODE *p;
3433    const unsigned char *q, *e, *aligned_end;
3434    int bo = 0;       /* assume native ordering by default */
3435    int native_ordering = 0;
3436    const char *errmsg = "";
3437    /* Offsets from q for retrieving byte pairs in the right order. */
3438#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3439    int ihi = 1, ilo = 0;
3440#else
3441    int ihi = 0, ilo = 1;
3442#endif
3443    PyObject *errorHandler = NULL;
3444    PyObject *exc = NULL;
3445
3446    /* Note: size will always be longer than the resulting Unicode
3447       character count */
3448    unicode = _PyUnicode_New(size);
3449    if (!unicode)
3450        return NULL;
3451    if (size == 0)
3452        return (PyObject *)unicode;
3453
3454    /* Unpack UTF-16 encoded data */
3455    p = unicode->str;
3456    q = (unsigned char *)s;
3457    e = q + size - 1;
3458
3459    if (byteorder)
3460        bo = *byteorder;
3461
3462    /* Check for BOM marks (U+FEFF) in the input and adjust current
3463       byte order setting accordingly. In native mode, the leading BOM
3464       mark is skipped, in all other modes, it is copied to the output
3465       stream as-is (giving a ZWNBSP character). */
3466    if (bo == 0) {
3467        if (size >= 2) {
3468            const Py_UNICODE bom = (q[ihi] << 8) | q[ilo];
3469#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3470            if (bom == 0xFEFF) {
3471                q += 2;
3472                bo = -1;
3473            }
3474            else if (bom == 0xFFFE) {
3475                q += 2;
3476                bo = 1;
3477            }
3478#else
3479            if (bom == 0xFEFF) {
3480                q += 2;
3481                bo = 1;
3482            }
3483            else if (bom == 0xFFFE) {
3484                q += 2;
3485                bo = -1;
3486            }
3487#endif
3488        }
3489    }
3490
3491    if (bo == -1) {
3492        /* force LE */
3493        ihi = 1;
3494        ilo = 0;
3495    }
3496    else if (bo == 1) {
3497        /* force BE */
3498        ihi = 0;
3499        ilo = 1;
3500    }
3501#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3502    native_ordering = ilo < ihi;
3503#else
3504    native_ordering = ilo > ihi;
3505#endif
3506
3507    aligned_end = (const unsigned char *) ((size_t) e & ~LONG_PTR_MASK);
3508    while (q < e) {
3509        Py_UNICODE ch;
3510        /* First check for possible aligned read of a C 'long'. Unaligned
3511           reads are more expensive, better to defer to another iteration. */
3512        if (!((size_t) q & LONG_PTR_MASK)) {
3513            /* Fast path for runs of non-surrogate chars. */
3514            register const unsigned char *_q = q;
3515            Py_UNICODE *_p = p;
3516            if (native_ordering) {
3517                /* Native ordering is simple: as long as the input cannot
3518                   possibly contain a surrogate char, do an unrolled copy
3519                   of several 16-bit code points to the target object.
3520                   The non-surrogate check is done on several input bytes
3521                   at a time (as many as a C 'long' can contain). */
3522                while (_q < aligned_end) {
3523                    unsigned long data = * (unsigned long *) _q;
3524                    if (data & FAST_CHAR_MASK)
3525                        break;
3526                    _p[0] = ((unsigned short *) _q)[0];
3527                    _p[1] = ((unsigned short *) _q)[1];
3528#if (SIZEOF_LONG == 8)
3529                    _p[2] = ((unsigned short *) _q)[2];
3530                    _p[3] = ((unsigned short *) _q)[3];
3531#endif
3532                    _q += SIZEOF_LONG;
3533                    _p += SIZEOF_LONG / 2;
3534                }
3535            }
3536            else {
3537                /* Byteswapped ordering is similar, but we must decompose
3538                   the copy bytewise, and take care of zero'ing out the
3539                   upper bytes if the target object is in 32-bit units
3540                   (that is, in UCS-4 builds). */
3541                while (_q < aligned_end) {
3542                    unsigned long data = * (unsigned long *) _q;
3543                    if (data & SWAPPED_FAST_CHAR_MASK)
3544                        break;
3545                    /* Zero upper bytes in UCS-4 builds */
3546#if (Py_UNICODE_SIZE > 2)
3547                    _p[0] = 0;
3548                    _p[1] = 0;
3549#if (SIZEOF_LONG == 8)
3550                    _p[2] = 0;
3551                    _p[3] = 0;
3552#endif
3553#endif
3554                    /* Issue #4916; UCS-4 builds on big endian machines must
3555                       fill the two last bytes of each 4-byte unit. */
3556#if (!defined(BYTEORDER_IS_LITTLE_ENDIAN) && Py_UNICODE_SIZE > 2)
3557# define OFF 2
3558#else
3559# define OFF 0
3560#endif
3561                    ((unsigned char *) _p)[OFF + 1] = _q[0];
3562                    ((unsigned char *) _p)[OFF + 0] = _q[1];
3563                    ((unsigned char *) _p)[OFF + 1 + Py_UNICODE_SIZE] = _q[2];
3564                    ((unsigned char *) _p)[OFF + 0 + Py_UNICODE_SIZE] = _q[3];
3565#if (SIZEOF_LONG == 8)
3566                    ((unsigned char *) _p)[OFF + 1 + 2 * Py_UNICODE_SIZE] = _q[4];
3567                    ((unsigned char *) _p)[OFF + 0 + 2 * Py_UNICODE_SIZE] = _q[5];
3568                    ((unsigned char *) _p)[OFF + 1 + 3 * Py_UNICODE_SIZE] = _q[6];
3569                    ((unsigned char *) _p)[OFF + 0 + 3 * Py_UNICODE_SIZE] = _q[7];
3570#endif
3571#undef OFF
3572                    _q += SIZEOF_LONG;
3573                    _p += SIZEOF_LONG / 2;
3574                }
3575            }
3576            p = _p;
3577            q = _q;
3578            if (q >= e)
3579                break;
3580        }
3581        ch = (q[ihi] << 8) | q[ilo];
3582
3583        q += 2;
3584
3585        if (ch < 0xD800 || ch > 0xDFFF) {
3586            *p++ = ch;
3587            continue;
3588        }
3589
3590        /* UTF-16 code pair: */
3591        if (q > e) {
3592            errmsg = "unexpected end of data";
3593            startinpos = (((const char *)q) - 2) - starts;
3594            endinpos = ((const char *)e) + 1 - starts;
3595            goto utf16Error;
3596        }
3597        if (0xD800 <= ch && ch <= 0xDBFF) {
3598            Py_UNICODE ch2 = (q[ihi] << 8) | q[ilo];
3599            q += 2;
3600            if (0xDC00 <= ch2 && ch2 <= 0xDFFF) {
3601#ifndef Py_UNICODE_WIDE
3602                *p++ = ch;
3603                *p++ = ch2;
3604#else
3605                *p++ = (((ch & 0x3FF)<<10) | (ch2 & 0x3FF)) + 0x10000;
3606#endif
3607                continue;
3608            }
3609            else {
3610                errmsg = "illegal UTF-16 surrogate";
3611                startinpos = (((const char *)q)-4)-starts;
3612                endinpos = startinpos+2;
3613                goto utf16Error;
3614            }
3615
3616        }
3617        errmsg = "illegal encoding";
3618        startinpos = (((const char *)q)-2)-starts;
3619        endinpos = startinpos+2;
3620        /* Fall through to report the error */
3621
3622      utf16Error:
3623        outpos = p - PyUnicode_AS_UNICODE(unicode);
3624        if (unicode_decode_call_errorhandler(
3625                errors,
3626                &errorHandler,
3627                "utf16", errmsg,
3628                &starts,
3629                (const char **)&e,
3630                &startinpos,
3631                &endinpos,
3632                &exc,
3633                (const char **)&q,
3634                &unicode,
3635                &outpos,
3636                &p))
3637            goto onError;
3638    }
3639    /* remaining byte at the end? (size should be even) */
3640    if (e == q) {
3641        if (!consumed) {
3642            errmsg = "truncated data";
3643            startinpos = ((const char *)q) - starts;
3644            endinpos = ((const char *)e) + 1 - starts;
3645            outpos = p - PyUnicode_AS_UNICODE(unicode);
3646            if (unicode_decode_call_errorhandler(
3647                    errors,
3648                    &errorHandler,
3649                    "utf16", errmsg,
3650                    &starts,
3651                    (const char **)&e,
3652                    &startinpos,
3653                    &endinpos,
3654                    &exc,
3655                    (const char **)&q,
3656                    &unicode,
3657                    &outpos,
3658                    &p))
3659                goto onError;
3660            /* The remaining input chars are ignored if the callback
3661               chooses to skip the input */
3662        }
3663    }
3664
3665    if (byteorder)
3666        *byteorder = bo;
3667
3668    if (consumed)
3669        *consumed = (const char *)q-starts;
3670
3671    /* Adjust length */
3672    if (_PyUnicode_Resize(&unicode, p - unicode->str) < 0)
3673        goto onError;
3674
3675    Py_XDECREF(errorHandler);
3676    Py_XDECREF(exc);
3677    return (PyObject *)unicode;
3678
3679  onError:
3680    Py_DECREF(unicode);
3681    Py_XDECREF(errorHandler);
3682    Py_XDECREF(exc);
3683    return NULL;
3684}
3685
3686#undef FAST_CHAR_MASK
3687#undef SWAPPED_FAST_CHAR_MASK
3688
3689PyObject *
3690PyUnicode_EncodeUTF16(const Py_UNICODE *s,
3691                      Py_ssize_t size,
3692                      const char *errors,
3693                      int byteorder)
3694{
3695    PyObject *v;
3696    unsigned char *p;
3697    Py_ssize_t nsize, bytesize;
3698#ifdef Py_UNICODE_WIDE
3699    Py_ssize_t i, pairs;
3700#else
3701    const int pairs = 0;
3702#endif
3703    /* Offsets from p for storing byte pairs in the right order. */
3704#ifdef BYTEORDER_IS_LITTLE_ENDIAN
3705    int ihi = 1, ilo = 0;
3706#else
3707    int ihi = 0, ilo = 1;
3708#endif
3709
3710#define STORECHAR(CH)                           \
3711    do {                                        \
3712        p[ihi] = ((CH) >> 8) & 0xff;            \
3713        p[ilo] = (CH) & 0xff;                   \
3714        p += 2;                                 \
3715    } while(0)
3716
3717#ifdef Py_UNICODE_WIDE
3718    for (i = pairs = 0; i < size; i++)
3719        if (s[i] >= 0x10000)
3720            pairs++;
3721#endif
3722    /* 2 * (size + pairs + (byteorder == 0)) */
3723    if (size > PY_SSIZE_T_MAX ||
3724        size > PY_SSIZE_T_MAX - pairs - (byteorder == 0))
3725        return PyErr_NoMemory();
3726    nsize = size + pairs + (byteorder == 0);
3727    bytesize = nsize * 2;
3728    if (bytesize / 2 != nsize)
3729        return PyErr_NoMemory();
3730    v = PyBytes_FromStringAndSize(NULL, bytesize);
3731    if (v == NULL)
3732        return NULL;
3733
3734    p = (unsigned char *)PyBytes_AS_STRING(v);
3735    if (byteorder == 0)
3736        STORECHAR(0xFEFF);
3737    if (size == 0)
3738        goto done;
3739
3740    if (byteorder == -1) {
3741        /* force LE */
3742        ihi = 1;
3743        ilo = 0;
3744    }
3745    else if (byteorder == 1) {
3746        /* force BE */
3747        ihi = 0;
3748        ilo = 1;
3749    }
3750
3751    while (size-- > 0) {
3752        Py_UNICODE ch = *s++;
3753        Py_UNICODE ch2 = 0;
3754#ifdef Py_UNICODE_WIDE
3755        if (ch >= 0x10000) {
3756            ch2 = 0xDC00 | ((ch-0x10000) & 0x3FF);
3757            ch  = 0xD800 | ((ch-0x10000) >> 10);
3758        }
3759#endif
3760        STORECHAR(ch);
3761        if (ch2)
3762            STORECHAR(ch2);
3763    }
3764
3765  done:
3766    return v;
3767#undef STORECHAR
3768}
3769
3770PyObject *
3771PyUnicode_AsUTF16String(PyObject *unicode)
3772{
3773    if (!PyUnicode_Check(unicode)) {
3774        PyErr_BadArgument();
3775        return NULL;
3776    }
3777    return PyUnicode_EncodeUTF16(PyUnicode_AS_UNICODE(unicode),
3778                                 PyUnicode_GET_SIZE(unicode),
3779                                 NULL,
3780                                 0);
3781}
3782
3783/* --- Unicode Escape Codec ----------------------------------------------- */
3784
/* Cached pointer to the character-name lookup API.  It stays NULL until
   PyUnicode_DecodeUnicodeEscape() first meets a \N{...} escape and imports
   the capsule named PyUnicodeData_CAPSULE_NAME. */
static _PyUnicode_Name_CAPI *ucnhash_CAPI = NULL;
3786
3787PyObject *
3788PyUnicode_DecodeUnicodeEscape(const char *s,
3789			      Py_ssize_t size,
3790			      const char *errors)
3791{
3792    const char *starts = s;
3793    Py_ssize_t startinpos;
3794    Py_ssize_t endinpos;
3795    Py_ssize_t outpos;
3796    int i;
3797    PyUnicodeObject *v;
3798    Py_UNICODE *p;
3799    const char *end;
3800    char* message;
3801    Py_UCS4 chr = 0xffffffff; /* in case 'getcode' messes up */
3802    PyObject *errorHandler = NULL;
3803    PyObject *exc = NULL;
3804
3805    /* Escaped strings will always be longer than the resulting
3806       Unicode string, so we start with size here and then reduce the
3807       length after conversion to the true value.
3808       (but if the error callback returns a long replacement string
3809       we'll have to allocate more space) */
3810    v = _PyUnicode_New(size);
3811    if (v == NULL)
3812        goto onError;
3813    if (size == 0)
3814        return (PyObject *)v;
3815
3816    p = PyUnicode_AS_UNICODE(v);
3817    end = s + size;
3818
3819    while (s < end) {
3820        unsigned char c;
3821        Py_UNICODE x;
3822        int digits;
3823
3824        /* Non-escape characters are interpreted as Unicode ordinals */
3825        if (*s != '\\') {
3826            *p++ = (unsigned char) *s++;
3827            continue;
3828        }
3829
3830        startinpos = s-starts;
3831        /* \ - Escapes */
3832        s++;
3833        c = *s++;
3834        if (s > end)
3835            c = '\0'; /* Invalid after \ */
3836        switch (c) {
3837
3838            /* \x escapes */
3839        case '\n': break;
3840        case '\\': *p++ = '\\'; break;
3841        case '\'': *p++ = '\''; break;
3842        case '\"': *p++ = '\"'; break;
3843        case 'b': *p++ = '\b'; break;
3844        case 'f': *p++ = '\014'; break; /* FF */
3845        case 't': *p++ = '\t'; break;
3846        case 'n': *p++ = '\n'; break;
3847        case 'r': *p++ = '\r'; break;
3848        case 'v': *p++ = '\013'; break; /* VT */
3849        case 'a': *p++ = '\007'; break; /* BEL, not classic C */
3850
3851            /* \OOO (octal) escapes */
3852        case '0': case '1': case '2': case '3':
3853        case '4': case '5': case '6': case '7':
3854            x = s[-1] - '0';
3855            if (s < end && '0' <= *s && *s <= '7') {
3856                x = (x<<3) + *s++ - '0';
3857                if (s < end && '0' <= *s && *s <= '7')
3858                    x = (x<<3) + *s++ - '0';
3859            }
3860            *p++ = x;
3861            break;
3862
3863            /* hex escapes */
3864            /* \xXX */
3865        case 'x':
3866            digits = 2;
3867            message = "truncated \\xXX escape";
3868            goto hexescape;
3869
3870            /* \uXXXX */
3871        case 'u':
3872            digits = 4;
3873            message = "truncated \\uXXXX escape";
3874            goto hexescape;
3875
3876            /* \UXXXXXXXX */
3877        case 'U':
3878            digits = 8;
3879            message = "truncated \\UXXXXXXXX escape";
3880        hexescape:
3881            chr = 0;
3882            outpos = p-PyUnicode_AS_UNICODE(v);
3883            if (s+digits>end) {
3884                endinpos = size;
3885                if (unicode_decode_call_errorhandler(
3886                        errors, &errorHandler,
3887                        "unicodeescape", "end of string in escape sequence",
3888                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3889                        &v, &outpos, &p))
3890                    goto onError;
3891                goto nextByte;
3892            }
3893            for (i = 0; i < digits; ++i) {
3894                c = (unsigned char) s[i];
3895                if (!Py_ISXDIGIT(c)) {
3896                    endinpos = (s+i+1)-starts;
3897                    if (unicode_decode_call_errorhandler(
3898                            errors, &errorHandler,
3899                            "unicodeescape", message,
3900                            &starts, &end, &startinpos, &endinpos, &exc, &s,
3901                            &v, &outpos, &p))
3902                        goto onError;
3903                    goto nextByte;
3904                }
3905                chr = (chr<<4) & ~0xF;
3906                if (c >= '0' && c <= '9')
3907                    chr += c - '0';
3908                else if (c >= 'a' && c <= 'f')
3909                    chr += 10 + c - 'a';
3910                else
3911                    chr += 10 + c - 'A';
3912            }
3913            s += i;
3914            if (chr == 0xffffffff && PyErr_Occurred())
3915                /* _decoding_error will have already written into the
3916                   target buffer. */
3917                break;
3918        store:
3919            /* when we get here, chr is a 32-bit unicode character */
3920            if (chr <= 0xffff)
3921                /* UCS-2 character */
3922                *p++ = (Py_UNICODE) chr;
3923            else if (chr <= 0x10ffff) {
3924                /* UCS-4 character. Either store directly, or as
3925                   surrogate pair. */
3926#ifdef Py_UNICODE_WIDE
3927                *p++ = chr;
3928#else
3929                chr -= 0x10000L;
3930                *p++ = 0xD800 + (Py_UNICODE) (chr >> 10);
3931                *p++ = 0xDC00 + (Py_UNICODE) (chr & 0x03FF);
3932#endif
3933            } else {
3934                endinpos = s-starts;
3935                outpos = p-PyUnicode_AS_UNICODE(v);
3936                if (unicode_decode_call_errorhandler(
3937                        errors, &errorHandler,
3938                        "unicodeescape", "illegal Unicode character",
3939                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3940                        &v, &outpos, &p))
3941                    goto onError;
3942            }
3943            break;
3944
3945            /* \N{name} */
3946        case 'N':
3947            message = "malformed \\N character escape";
3948            if (ucnhash_CAPI == NULL) {
3949                /* load the unicode data module */
3950                ucnhash_CAPI = (_PyUnicode_Name_CAPI *)PyCapsule_Import(PyUnicodeData_CAPSULE_NAME, 1);
3951                if (ucnhash_CAPI == NULL)
3952                    goto ucnhashError;
3953            }
3954            if (*s == '{') {
3955                const char *start = s+1;
3956                /* look for the closing brace */
3957                while (*s != '}' && s < end)
3958                    s++;
3959                if (s > start && s < end && *s == '}') {
3960                    /* found a name.  look it up in the unicode database */
3961                    message = "unknown Unicode character name";
3962                    s++;
3963                    if (ucnhash_CAPI->getcode(NULL, start, (int)(s-start-1), &chr))
3964                        goto store;
3965                }
3966            }
3967            endinpos = s-starts;
3968            outpos = p-PyUnicode_AS_UNICODE(v);
3969            if (unicode_decode_call_errorhandler(
3970                    errors, &errorHandler,
3971                    "unicodeescape", message,
3972                    &starts, &end, &startinpos, &endinpos, &exc, &s,
3973                    &v, &outpos, &p))
3974                goto onError;
3975            break;
3976
3977        default:
3978            if (s > end) {
3979                message = "\\ at end of string";
3980                s--;
3981                endinpos = s-starts;
3982                outpos = p-PyUnicode_AS_UNICODE(v);
3983                if (unicode_decode_call_errorhandler(
3984                        errors, &errorHandler,
3985                        "unicodeescape", message,
3986                        &starts, &end, &startinpos, &endinpos, &exc, &s,
3987                        &v, &outpos, &p))
3988                    goto onError;
3989            }
3990            else {
3991                *p++ = '\\';
3992                *p++ = (unsigned char)s[-1];
3993            }
3994            break;
3995        }
3996      nextByte:
3997        ;
3998    }
3999    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4000        goto onError;
4001    Py_XDECREF(errorHandler);
4002    Py_XDECREF(exc);
4003    return (PyObject *)v;
4004
4005  ucnhashError:
4006    PyErr_SetString(
4007        PyExc_UnicodeError,
4008        "\\N escapes not supported (can't load unicodedata module)"
4009        );
4010    Py_XDECREF(v);
4011    Py_XDECREF(errorHandler);
4012    Py_XDECREF(exc);
4013    return NULL;
4014
4015  onError:
4016    Py_XDECREF(v);
4017    Py_XDECREF(errorHandler);
4018    Py_XDECREF(exc);
4019    return NULL;
4020}
4021
4022/* Return a Unicode-Escape string version of the Unicode object.
4023
4024   If quotes is true, the string is enclosed in u"" or u'' quotes as
4025   appropriate.
4026
4027*/
4028
4029Py_LOCAL_INLINE(const Py_UNICODE *) findchar(const Py_UNICODE *s,
4030                                             Py_ssize_t size,
4031                                             Py_UNICODE ch)
4032{
4033    /* like wcschr, but doesn't stop at NULL characters */
4034
4035    while (size-- > 0) {
4036        if (*s == ch)
4037            return s;
4038        s++;
4039    }
4040
4041    return NULL;
4042}
4043
/* Lookup table mapping a 4-bit value (0-15) to its lowercase hexadecimal
   digit; used by the escape encoders when emitting \x, \u and \U escapes. */
static const char *hexdigits = "0123456789abcdef";
4045
4046PyObject *
4047PyUnicode_EncodeUnicodeEscape(const Py_UNICODE *s,
4048			      Py_ssize_t size)
4049{
4050    PyObject *repr;
4051    char *p;
4052
4053#ifdef Py_UNICODE_WIDE
4054    const Py_ssize_t expandsize = 10;
4055#else
4056    const Py_ssize_t expandsize = 6;
4057#endif
4058
4059    /* XXX(nnorwitz): rather than over-allocating, it would be
4060       better to choose a different scheme.  Perhaps scan the
4061       first N-chars of the string and allocate based on that size.
4062    */
4063    /* Initial allocation is based on the longest-possible unichr
4064       escape.
4065
4066       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
4067       unichr, so in this case it's the longest unichr escape. In
4068       narrow (UTF-16) builds this is five chars per source unichr
4069       since there are two unichrs in the surrogate pair, so in narrow
4070       (UTF-16) builds it's not the longest unichr escape.
4071
4072       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
4073       so in the narrow (UTF-16) build case it's the longest unichr
4074       escape.
4075    */
4076
4077    if (size == 0)
4078        return PyBytes_FromStringAndSize(NULL, 0);
4079
4080    if (size > (PY_SSIZE_T_MAX - 2 - 1) / expandsize)
4081        return PyErr_NoMemory();
4082
4083    repr = PyBytes_FromStringAndSize(NULL,
4084                                     2
4085                                     + expandsize*size
4086                                     + 1);
4087    if (repr == NULL)
4088        return NULL;
4089
4090    p = PyBytes_AS_STRING(repr);
4091
4092    while (size-- > 0) {
4093        Py_UNICODE ch = *s++;
4094
4095        /* Escape backslashes */
4096        if (ch == '\\') {
4097            *p++ = '\\';
4098            *p++ = (char) ch;
4099            continue;
4100        }
4101
4102#ifdef Py_UNICODE_WIDE
4103        /* Map 21-bit characters to '\U00xxxxxx' */
4104        else if (ch >= 0x10000) {
4105            *p++ = '\\';
4106            *p++ = 'U';
4107            *p++ = hexdigits[(ch >> 28) & 0x0000000F];
4108            *p++ = hexdigits[(ch >> 24) & 0x0000000F];
4109            *p++ = hexdigits[(ch >> 20) & 0x0000000F];
4110            *p++ = hexdigits[(ch >> 16) & 0x0000000F];
4111            *p++ = hexdigits[(ch >> 12) & 0x0000000F];
4112            *p++ = hexdigits[(ch >> 8) & 0x0000000F];
4113            *p++ = hexdigits[(ch >> 4) & 0x0000000F];
4114            *p++ = hexdigits[ch & 0x0000000F];
4115            continue;
4116        }
4117#else
4118        /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4119        else if (ch >= 0xD800 && ch < 0xDC00) {
4120            Py_UNICODE ch2;
4121            Py_UCS4 ucs;
4122
4123            ch2 = *s++;
4124            size--;
4125            if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4126                ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4127                *p++ = '\\';
4128                *p++ = 'U';
4129                *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
4130                *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
4131                *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
4132                *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
4133                *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
4134                *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
4135                *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
4136                *p++ = hexdigits[ucs & 0x0000000F];
4137                continue;
4138            }
4139            /* Fall through: isolated surrogates are copied as-is */
4140            s--;
4141            size++;
4142        }
4143#endif
4144
4145        /* Map 16-bit characters to '\uxxxx' */
4146        if (ch >= 256) {
4147            *p++ = '\\';
4148            *p++ = 'u';
4149            *p++ = hexdigits[(ch >> 12) & 0x000F];
4150            *p++ = hexdigits[(ch >> 8) & 0x000F];
4151            *p++ = hexdigits[(ch >> 4) & 0x000F];
4152            *p++ = hexdigits[ch & 0x000F];
4153        }
4154
4155        /* Map special whitespace to '\t', \n', '\r' */
4156        else if (ch == '\t') {
4157            *p++ = '\\';
4158            *p++ = 't';
4159        }
4160        else if (ch == '\n') {
4161            *p++ = '\\';
4162            *p++ = 'n';
4163        }
4164        else if (ch == '\r') {
4165            *p++ = '\\';
4166            *p++ = 'r';
4167        }
4168
4169        /* Map non-printable US ASCII to '\xhh' */
4170        else if (ch < ' ' || ch >= 0x7F) {
4171            *p++ = '\\';
4172            *p++ = 'x';
4173            *p++ = hexdigits[(ch >> 4) & 0x000F];
4174            *p++ = hexdigits[ch & 0x000F];
4175        }
4176
4177        /* Copy everything else as-is */
4178        else
4179            *p++ = (char) ch;
4180    }
4181
4182    assert(p - PyBytes_AS_STRING(repr) > 0);
4183    if (_PyBytes_Resize(&repr, p - PyBytes_AS_STRING(repr)) < 0)
4184        return NULL;
4185    return repr;
4186}
4187
4188PyObject *
4189PyUnicode_AsUnicodeEscapeString(PyObject *unicode)
4190{
4191    PyObject *s;
4192    if (!PyUnicode_Check(unicode)) {
4193        PyErr_BadArgument();
4194        return NULL;
4195    }
4196    s = PyUnicode_EncodeUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4197                                      PyUnicode_GET_SIZE(unicode));
4198    return s;
4199}
4200
4201/* --- Raw Unicode Escape Codec ------------------------------------------- */
4202
4203PyObject *
4204PyUnicode_DecodeRawUnicodeEscape(const char *s,
4205				 Py_ssize_t size,
4206				 const char *errors)
4207{
4208    const char *starts = s;
4209    Py_ssize_t startinpos;
4210    Py_ssize_t endinpos;
4211    Py_ssize_t outpos;
4212    PyUnicodeObject *v;
4213    Py_UNICODE *p;
4214    const char *end;
4215    const char *bs;
4216    PyObject *errorHandler = NULL;
4217    PyObject *exc = NULL;
4218
4219    /* Escaped strings will always be longer than the resulting
4220       Unicode string, so we start with size here and then reduce the
4221       length after conversion to the true value. (But decoding error
4222       handler might have to resize the string) */
4223    v = _PyUnicode_New(size);
4224    if (v == NULL)
4225        goto onError;
4226    if (size == 0)
4227        return (PyObject *)v;
4228    p = PyUnicode_AS_UNICODE(v);
4229    end = s + size;
4230    while (s < end) {
4231        unsigned char c;
4232        Py_UCS4 x;
4233        int i;
4234        int count;
4235
4236        /* Non-escape characters are interpreted as Unicode ordinals */
4237        if (*s != '\\') {
4238            *p++ = (unsigned char)*s++;
4239            continue;
4240        }
4241        startinpos = s-starts;
4242
4243        /* \u-escapes are only interpreted iff the number of leading
4244           backslashes if odd */
4245        bs = s;
4246        for (;s < end;) {
4247            if (*s != '\\')
4248                break;
4249            *p++ = (unsigned char)*s++;
4250        }
4251        if (((s - bs) & 1) == 0 ||
4252            s >= end ||
4253            (*s != 'u' && *s != 'U')) {
4254            continue;
4255        }
4256        p--;
4257        count = *s=='u' ? 4 : 8;
4258        s++;
4259
4260        /* \uXXXX with 4 hex digits, \Uxxxxxxxx with 8 */
4261        outpos = p-PyUnicode_AS_UNICODE(v);
4262        for (x = 0, i = 0; i < count; ++i, ++s) {
4263            c = (unsigned char)*s;
4264            if (!Py_ISXDIGIT(c)) {
4265                endinpos = s-starts;
4266                if (unicode_decode_call_errorhandler(
4267                        errors, &errorHandler,
4268                        "rawunicodeescape", "truncated \\uXXXX",
4269                        &starts, &end, &startinpos, &endinpos, &exc, &s,
4270                        &v, &outpos, &p))
4271                    goto onError;
4272                goto nextByte;
4273            }
4274            x = (x<<4) & ~0xF;
4275            if (c >= '0' && c <= '9')
4276                x += c - '0';
4277            else if (c >= 'a' && c <= 'f')
4278                x += 10 + c - 'a';
4279            else
4280                x += 10 + c - 'A';
4281        }
4282        if (x <= 0xffff)
4283            /* UCS-2 character */
4284            *p++ = (Py_UNICODE) x;
4285        else if (x <= 0x10ffff) {
4286            /* UCS-4 character. Either store directly, or as
4287               surrogate pair. */
4288#ifdef Py_UNICODE_WIDE
4289            *p++ = (Py_UNICODE) x;
4290#else
4291            x -= 0x10000L;
4292            *p++ = 0xD800 + (Py_UNICODE) (x >> 10);
4293            *p++ = 0xDC00 + (Py_UNICODE) (x & 0x03FF);
4294#endif
4295        } else {
4296            endinpos = s-starts;
4297            outpos = p-PyUnicode_AS_UNICODE(v);
4298            if (unicode_decode_call_errorhandler(
4299                    errors, &errorHandler,
4300                    "rawunicodeescape", "\\Uxxxxxxxx out of range",
4301                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4302                    &v, &outpos, &p))
4303                goto onError;
4304        }
4305      nextByte:
4306        ;
4307    }
4308    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4309        goto onError;
4310    Py_XDECREF(errorHandler);
4311    Py_XDECREF(exc);
4312    return (PyObject *)v;
4313
4314  onError:
4315    Py_XDECREF(v);
4316    Py_XDECREF(errorHandler);
4317    Py_XDECREF(exc);
4318    return NULL;
4319}
4320
4321PyObject *
4322PyUnicode_EncodeRawUnicodeEscape(const Py_UNICODE *s,
4323				 Py_ssize_t size)
4324{
4325    PyObject *repr;
4326    char *p;
4327    char *q;
4328
4329#ifdef Py_UNICODE_WIDE
4330    const Py_ssize_t expandsize = 10;
4331#else
4332    const Py_ssize_t expandsize = 6;
4333#endif
4334
4335    if (size > PY_SSIZE_T_MAX / expandsize)
4336        return PyErr_NoMemory();
4337
4338    repr = PyBytes_FromStringAndSize(NULL, expandsize * size);
4339    if (repr == NULL)
4340        return NULL;
4341    if (size == 0)
4342        return repr;
4343
4344    p = q = PyBytes_AS_STRING(repr);
4345    while (size-- > 0) {
4346        Py_UNICODE ch = *s++;
4347#ifdef Py_UNICODE_WIDE
4348        /* Map 32-bit characters to '\Uxxxxxxxx' */
4349        if (ch >= 0x10000) {
4350            *p++ = '\\';
4351            *p++ = 'U';
4352            *p++ = hexdigits[(ch >> 28) & 0xf];
4353            *p++ = hexdigits[(ch >> 24) & 0xf];
4354            *p++ = hexdigits[(ch >> 20) & 0xf];
4355            *p++ = hexdigits[(ch >> 16) & 0xf];
4356            *p++ = hexdigits[(ch >> 12) & 0xf];
4357            *p++ = hexdigits[(ch >> 8) & 0xf];
4358            *p++ = hexdigits[(ch >> 4) & 0xf];
4359            *p++ = hexdigits[ch & 15];
4360        }
4361        else
4362#else
4363            /* Map UTF-16 surrogate pairs to '\U00xxxxxx' */
4364            if (ch >= 0xD800 && ch < 0xDC00) {
4365                Py_UNICODE ch2;
4366                Py_UCS4 ucs;
4367
4368                ch2 = *s++;
4369                size--;
4370                if (ch2 >= 0xDC00 && ch2 <= 0xDFFF) {
4371                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF)) + 0x00010000;
4372                    *p++ = '\\';
4373                    *p++ = 'U';
4374                    *p++ = hexdigits[(ucs >> 28) & 0xf];
4375                    *p++ = hexdigits[(ucs >> 24) & 0xf];
4376                    *p++ = hexdigits[(ucs >> 20) & 0xf];
4377                    *p++ = hexdigits[(ucs >> 16) & 0xf];
4378                    *p++ = hexdigits[(ucs >> 12) & 0xf];
4379                    *p++ = hexdigits[(ucs >> 8) & 0xf];
4380                    *p++ = hexdigits[(ucs >> 4) & 0xf];
4381                    *p++ = hexdigits[ucs & 0xf];
4382                    continue;
4383                }
4384                /* Fall through: isolated surrogates are copied as-is */
4385                s--;
4386                size++;
4387            }
4388#endif
4389        /* Map 16-bit characters to '\uxxxx' */
4390        if (ch >= 256) {
4391            *p++ = '\\';
4392            *p++ = 'u';
4393            *p++ = hexdigits[(ch >> 12) & 0xf];
4394            *p++ = hexdigits[(ch >> 8) & 0xf];
4395            *p++ = hexdigits[(ch >> 4) & 0xf];
4396            *p++ = hexdigits[ch & 15];
4397        }
4398        /* Copy everything else as-is */
4399        else
4400            *p++ = (char) ch;
4401    }
4402    size = p - q;
4403
4404    assert(size > 0);
4405    if (_PyBytes_Resize(&repr, size) < 0)
4406        return NULL;
4407    return repr;
4408}
4409
4410PyObject *
4411PyUnicode_AsRawUnicodeEscapeString(PyObject *unicode)
4412{
4413    PyObject *s;
4414    if (!PyUnicode_Check(unicode)) {
4415        PyErr_BadArgument();
4416        return NULL;
4417    }
4418    s = PyUnicode_EncodeRawUnicodeEscape(PyUnicode_AS_UNICODE(unicode),
4419                                         PyUnicode_GET_SIZE(unicode));
4420
4421    return s;
4422}
4423
4424/* --- Unicode Internal Codec ------------------------------------------- */
4425
4426PyObject *
4427_PyUnicode_DecodeUnicodeInternal(const char *s,
4428				 Py_ssize_t size,
4429				 const char *errors)
4430{
4431    const char *starts = s;
4432    Py_ssize_t startinpos;
4433    Py_ssize_t endinpos;
4434    Py_ssize_t outpos;
4435    PyUnicodeObject *v;
4436    Py_UNICODE *p;
4437    const char *end;
4438    const char *reason;
4439    PyObject *errorHandler = NULL;
4440    PyObject *exc = NULL;
4441
4442#ifdef Py_UNICODE_WIDE
4443    Py_UNICODE unimax = PyUnicode_GetMax();
4444#endif
4445
4446    /* XXX overflow detection missing */
4447    v = _PyUnicode_New((size+Py_UNICODE_SIZE-1)/ Py_UNICODE_SIZE);
4448    if (v == NULL)
4449        goto onError;
4450    if (PyUnicode_GetSize((PyObject *)v) == 0)
4451        return (PyObject *)v;
4452    p = PyUnicode_AS_UNICODE(v);
4453    end = s + size;
4454
4455    while (s < end) {
4456        memcpy(p, s, sizeof(Py_UNICODE));
4457        /* We have to sanity check the raw data, otherwise doom looms for
4458           some malformed UCS-4 data. */
4459        if (
4460#ifdef Py_UNICODE_WIDE
4461            *p > unimax || *p < 0 ||
4462#endif
4463            end-s < Py_UNICODE_SIZE
4464            )
4465        {
4466            startinpos = s - starts;
4467            if (end-s < Py_UNICODE_SIZE) {
4468                endinpos = end-starts;
4469                reason = "truncated input";
4470            }
4471            else {
4472                endinpos = s - starts + Py_UNICODE_SIZE;
4473                reason = "illegal code point (> 0x10FFFF)";
4474            }
4475            outpos = p - PyUnicode_AS_UNICODE(v);
4476            if (unicode_decode_call_errorhandler(
4477                    errors, &errorHandler,
4478                    "unicode_internal", reason,
4479                    &starts, &end, &startinpos, &endinpos, &exc, &s,
4480                    &v, &outpos, &p)) {
4481                goto onError;
4482            }
4483        }
4484        else {
4485            p++;
4486            s += Py_UNICODE_SIZE;
4487        }
4488    }
4489
4490    if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4491        goto onError;
4492    Py_XDECREF(errorHandler);
4493    Py_XDECREF(exc);
4494    return (PyObject *)v;
4495
4496  onError:
4497    Py_XDECREF(v);
4498    Py_XDECREF(errorHandler);
4499    Py_XDECREF(exc);
4500    return NULL;
4501}
4502
4503/* --- Latin-1 Codec ------------------------------------------------------ */
4504
4505PyObject *
4506PyUnicode_DecodeLatin1(const char *s,
4507		       Py_ssize_t size,
4508		       const char *errors)
4509{
4510    PyUnicodeObject *v;
4511    Py_UNICODE *p;
4512    const char *e, *unrolled_end;
4513
4514    /* Latin-1 is equivalent to the first 256 ordinals in Unicode. */
4515    if (size == 1) {
4516        Py_UNICODE r = *(unsigned char*)s;
4517        return PyUnicode_FromUnicode(&r, 1);
4518    }
4519
4520    v = _PyUnicode_New(size);
4521    if (v == NULL)
4522        goto onError;
4523    if (size == 0)
4524        return (PyObject *)v;
4525    p = PyUnicode_AS_UNICODE(v);
4526    e = s + size;
4527    /* Unrolling the copy makes it much faster by reducing the looping
4528       overhead. This is similar to what many memcpy() implementations do. */
4529    unrolled_end = e - 4;
4530    while (s < unrolled_end) {
4531        p[0] = (unsigned char) s[0];
4532        p[1] = (unsigned char) s[1];
4533        p[2] = (unsigned char) s[2];
4534        p[3] = (unsigned char) s[3];
4535        s += 4;
4536        p += 4;
4537    }
4538    while (s < e)
4539        *p++ = (unsigned char) *s++;
4540    return (PyObject *)v;
4541
4542  onError:
4543    Py_XDECREF(v);
4544    return NULL;
4545}
4546
4547/* create or adjust a UnicodeEncodeError */
4548static void
4549make_encode_exception(PyObject **exceptionObject,
4550		      const char *encoding,
4551		      const Py_UNICODE *unicode, Py_ssize_t size,
4552		      Py_ssize_t startpos, Py_ssize_t endpos,
4553		      const char *reason)
4554{
4555    if (*exceptionObject == NULL) {
4556        *exceptionObject = PyUnicodeEncodeError_Create(
4557            encoding, unicode, size, startpos, endpos, reason);
4558    }
4559    else {
4560        if (PyUnicodeEncodeError_SetStart(*exceptionObject, startpos))
4561            goto onError;
4562        if (PyUnicodeEncodeError_SetEnd(*exceptionObject, endpos))
4563            goto onError;
4564        if (PyUnicodeEncodeError_SetReason(*exceptionObject, reason))
4565            goto onError;
4566        return;
4567      onError:
4568        Py_DECREF(*exceptionObject);
4569        *exceptionObject = NULL;
4570    }
4571}
4572
4573/* raises a UnicodeEncodeError */
4574static void
4575raise_encode_exception(PyObject **exceptionObject,
4576		       const char *encoding,
4577		       const Py_UNICODE *unicode, Py_ssize_t size,
4578		       Py_ssize_t startpos, Py_ssize_t endpos,
4579		       const char *reason)
4580{
4581    make_encode_exception(exceptionObject,
4582                          encoding, unicode, size, startpos, endpos, reason);
4583    if (*exceptionObject != NULL)
4584        PyCodec_StrictErrors(*exceptionObject);
4585}
4586
4587/* error handling callback helper:
4588   build arguments, call the callback and check the arguments,
4589   put the result into newpos and return the replacement string, which
4590   has to be freed by the caller */
4591static PyObject *
4592unicode_encode_call_errorhandler(const char *errors,
4593				 PyObject **errorHandler,
4594				 const char *encoding, const char *reason,
4595				 const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
4596				 Py_ssize_t startpos, Py_ssize_t endpos,
4597				 Py_ssize_t *newpos)
4598{
4599    static char *argparse = "On;encoding error handler must return (str/bytes, int) tuple";
4600
4601    PyObject *restuple;
4602    PyObject *resunicode;
4603
4604    if (*errorHandler == NULL) {
4605        *errorHandler = PyCodec_LookupError(errors);
4606        if (*errorHandler == NULL)
4607            return NULL;
4608    }
4609
4610    make_encode_exception(exceptionObject,
4611                          encoding, unicode, size, startpos, endpos, reason);
4612    if (*exceptionObject == NULL)
4613        return NULL;
4614
4615    restuple = PyObject_CallFunctionObjArgs(
4616        *errorHandler, *exceptionObject, NULL);
4617    if (restuple == NULL)
4618        return NULL;
4619    if (!PyTuple_Check(restuple)) {
4620        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4621        Py_DECREF(restuple);
4622        return NULL;
4623    }
4624    if (!PyArg_ParseTuple(restuple, argparse,
4625                          &resunicode, newpos)) {
4626        Py_DECREF(restuple);
4627        return NULL;
4628    }
4629    if (!PyUnicode_Check(resunicode) && !PyBytes_Check(resunicode)) {
4630        PyErr_SetString(PyExc_TypeError, &argparse[3]);
4631        Py_DECREF(restuple);
4632        return NULL;
4633    }
4634    if (*newpos<0)
4635        *newpos = size+*newpos;
4636    if (*newpos<0 || *newpos>size) {
4637        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
4638        Py_DECREF(restuple);
4639        return NULL;
4640    }
4641    Py_INCREF(resunicode);
4642    Py_DECREF(restuple);
4643    return resunicode;
4644}
4645
4646static PyObject *
4647unicode_encode_ucs1(const Py_UNICODE *p,
4648		    Py_ssize_t size,
4649		    const char *errors,
4650		    int limit)
4651{
4652    /* output object */
4653    PyObject *res;
4654    /* pointers to the beginning and end+1 of input */
4655    const Py_UNICODE *startp = p;
4656    const Py_UNICODE *endp = p + size;
4657    /* pointer to the beginning of the unencodable characters */
4658    /* const Py_UNICODE *badp = NULL; */
4659    /* pointer into the output */
4660    char *str;
4661    /* current output position */
4662    Py_ssize_t ressize;
4663    const char *encoding = (limit == 256) ? "latin-1" : "ascii";
4664    const char *reason = (limit == 256) ? "ordinal not in range(256)" : "ordinal not in range(128)";
4665    PyObject *errorHandler = NULL;
4666    PyObject *exc = NULL;
4667    /* the following variable is used for caching string comparisons
4668     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
4669    int known_errorHandler = -1;
4670
4671    /* allocate enough for a simple encoding without
4672       replacements, if we need more, we'll resize */
4673    if (size == 0)
4674        return PyBytes_FromStringAndSize(NULL, 0);
4675    res = PyBytes_FromStringAndSize(NULL, size);
4676    if (res == NULL)
4677        return NULL;
4678    str = PyBytes_AS_STRING(res);
4679    ressize = size;
4680
4681    while (p<endp) {
4682        Py_UNICODE c = *p;
4683
4684        /* can we encode this? */
4685        if (c<limit) {
4686            /* no overflow check, because we know that the space is enough */
4687            *str++ = (char)c;
4688            ++p;
4689        }
4690        else {
4691            Py_ssize_t unicodepos = p-startp;
4692            Py_ssize_t requiredsize;
4693            PyObject *repunicode;
4694            Py_ssize_t repsize;
4695            Py_ssize_t newpos;
4696            Py_ssize_t respos;
4697            Py_UNICODE *uni2;
4698            /* startpos for collecting unencodable chars */
4699            const Py_UNICODE *collstart = p;
4700            const Py_UNICODE *collend = p;
4701            /* find all unecodable characters */
4702            while ((collend < endp) && ((*collend)>=limit))
4703                ++collend;
4704            /* cache callback name lookup (if not done yet, i.e. it's the first error) */
4705            if (known_errorHandler==-1) {
4706                if ((errors==NULL) || (!strcmp(errors, "strict")))
4707                    known_errorHandler = 1;
4708                else if (!strcmp(errors, "replace"))
4709                    known_errorHandler = 2;
4710                else if (!strcmp(errors, "ignore"))
4711                    known_errorHandler = 3;
4712                else if (!strcmp(errors, "xmlcharrefreplace"))
4713                    known_errorHandler = 4;
4714                else
4715                    known_errorHandler = 0;
4716            }
4717            switch (known_errorHandler) {
4718            case 1: /* strict */
4719                raise_encode_exception(&exc, encoding, startp, size, collstart-startp, collend-startp, reason);
4720                goto onError;
4721            case 2: /* replace */
4722                while (collstart++<collend)
4723                    *str++ = '?'; /* fall through */
4724            case 3: /* ignore */
4725                p = collend;
4726                break;
4727            case 4: /* xmlcharrefreplace */
4728                respos = str - PyBytes_AS_STRING(res);
4729                /* determine replacement size (temporarily (mis)uses p) */
4730                for (p = collstart, repsize = 0; p < collend; ++p) {
4731                    if (*p<10)
4732                        repsize += 2+1+1;
4733                    else if (*p<100)
4734                        repsize += 2+2+1;
4735                    else if (*p<1000)
4736                        repsize += 2+3+1;
4737                    else if (*p<10000)
4738                        repsize += 2+4+1;
4739#ifndef Py_UNICODE_WIDE
4740                    else
4741                        repsize += 2+5+1;
4742#else
4743                    else if (*p<100000)
4744                        repsize += 2+5+1;
4745                    else if (*p<1000000)
4746                        repsize += 2+6+1;
4747                    else
4748                        repsize += 2+7+1;
4749#endif
4750                }
4751                requiredsize = respos+repsize+(endp-collend);
4752                if (requiredsize > ressize) {
4753                    if (requiredsize<2*ressize)
4754                        requiredsize = 2*ressize;
4755                    if (_PyBytes_Resize(&res, requiredsize))
4756                        goto onError;
4757                    str = PyBytes_AS_STRING(res) + respos;
4758                    ressize = requiredsize;
4759                }
4760                /* generate replacement (temporarily (mis)uses p) */
4761                for (p = collstart; p < collend; ++p) {
4762                    str += sprintf(str, "&#%d;", (int)*p);
4763                }
4764                p = collend;
4765                break;
4766            default:
4767                repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
4768                                                              encoding, reason, startp, size, &exc,
4769                                                              collstart-startp, collend-startp, &newpos);
4770                if (repunicode == NULL)
4771                    goto onError;
4772                if (PyBytes_Check(repunicode)) {
4773                    /* Directly copy bytes result to output. */
4774                    repsize = PyBytes_Size(repunicode);
4775                    if (repsize > 1) {
4776                        /* Make room for all additional bytes. */
4777                        respos = str - PyBytes_AS_STRING(res);
4778                        if (_PyBytes_Resize(&res, ressize+repsize-1)) {
4779                            Py_DECREF(repunicode);
4780                            goto onError;
4781                        }
4782                        str = PyBytes_AS_STRING(res) + respos;
4783                        ressize += repsize-1;
4784                    }
4785                    memcpy(str, PyBytes_AsString(repunicode), repsize);
4786                    str += repsize;
4787                    p = startp + newpos;
4788                    Py_DECREF(repunicode);
4789                    break;
4790                }
4791                /* need more space? (at least enough for what we
4792                   have+the replacement+the rest of the string, so
4793                   we won't have to check space for encodable characters) */
4794                respos = str - PyBytes_AS_STRING(res);
4795                repsize = PyUnicode_GET_SIZE(repunicode);
4796                requiredsize = respos+repsize+(endp-collend);
4797                if (requiredsize > ressize) {
4798                    if (requiredsize<2*ressize)
4799                        requiredsize = 2*ressize;
4800                    if (_PyBytes_Resize(&res, requiredsize)) {
4801                        Py_DECREF(repunicode);
4802                        goto onError;
4803                    }
4804                    str = PyBytes_AS_STRING(res) + respos;
4805                    ressize = requiredsize;
4806                }
4807                /* check if there is anything unencodable in the replacement
4808                   and copy it to the output */
4809                for (uni2 = PyUnicode_AS_UNICODE(repunicode);repsize-->0; ++uni2, ++str) {
4810                    c = *uni2;
4811                    if (c >= limit) {
4812                        raise_encode_exception(&exc, encoding, startp, size,
4813                                               unicodepos, unicodepos+1, reason);
4814                        Py_DECREF(repunicode);
4815                        goto onError;
4816                    }
4817                    *str = (char)c;
4818                }
4819                p = startp + newpos;
4820                Py_DECREF(repunicode);
4821            }
4822        }
4823    }
4824    /* Resize if we allocated to much */
4825    size = str - PyBytes_AS_STRING(res);
4826    if (size < ressize) { /* If this falls res will be NULL */
4827        assert(size >= 0);
4828        if (_PyBytes_Resize(&res, size) < 0)
4829            goto onError;
4830    }
4831
4832    Py_XDECREF(errorHandler);
4833    Py_XDECREF(exc);
4834    return res;
4835
4836  onError:
4837    Py_XDECREF(res);
4838    Py_XDECREF(errorHandler);
4839    Py_XDECREF(exc);
4840    return NULL;
4841}
4842
4843PyObject *
4844PyUnicode_EncodeLatin1(const Py_UNICODE *p,
4845		       Py_ssize_t size,
4846		       const char *errors)
4847{
4848    return unicode_encode_ucs1(p, size, errors, 256);
4849}
4850
4851PyObject *
4852PyUnicode_AsLatin1String(PyObject *unicode)
4853{
4854    if (!PyUnicode_Check(unicode)) {
4855        PyErr_BadArgument();
4856        return NULL;
4857    }
4858    return PyUnicode_EncodeLatin1(PyUnicode_AS_UNICODE(unicode),
4859                                  PyUnicode_GET_SIZE(unicode),
4860                                  NULL);
4861}
4862
4863/* --- 7-bit ASCII Codec -------------------------------------------------- */
4864
4865PyObject *
4866PyUnicode_DecodeASCII(const char *s,
4867                      Py_ssize_t size,
4868                      const char *errors)
4869{
4870    const char *starts = s;
4871    PyUnicodeObject *v;
4872    Py_UNICODE *p;
4873    Py_ssize_t startinpos;
4874    Py_ssize_t endinpos;
4875    Py_ssize_t outpos;
4876    const char *e;
4877    PyObject *errorHandler = NULL;
4878    PyObject *exc = NULL;
4879
4880    /* ASCII is equivalent to the first 128 ordinals in Unicode. */
4881    if (size == 1 && *(unsigned char*)s < 128) {
4882        Py_UNICODE r = *(unsigned char*)s;
4883        return PyUnicode_FromUnicode(&r, 1);
4884    }
4885
4886    v = _PyUnicode_New(size);
4887    if (v == NULL)
4888        goto onError;
4889    if (size == 0)
4890        return (PyObject *)v;
4891    p = PyUnicode_AS_UNICODE(v);
4892    e = s + size;
4893    while (s < e) {
4894        register unsigned char c = (unsigned char)*s;
4895        if (c < 128) {
4896            *p++ = c;
4897            ++s;
4898        }
4899        else {
4900            startinpos = s-starts;
4901            endinpos = startinpos + 1;
4902            outpos = p - (Py_UNICODE *)PyUnicode_AS_UNICODE(v);
4903            if (unicode_decode_call_errorhandler(
4904                    errors, &errorHandler,
4905                    "ascii", "ordinal not in range(128)",
4906                    &starts, &e, &startinpos, &endinpos, &exc, &s,
4907                    &v, &outpos, &p))
4908                goto onError;
4909        }
4910    }
4911    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
4912        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
4913            goto onError;
4914    Py_XDECREF(errorHandler);
4915    Py_XDECREF(exc);
4916    return (PyObject *)v;
4917
4918  onError:
4919    Py_XDECREF(v);
4920    Py_XDECREF(errorHandler);
4921    Py_XDECREF(exc);
4922    return NULL;
4923}
4924
4925PyObject *
4926PyUnicode_EncodeASCII(const Py_UNICODE *p,
4927                      Py_ssize_t size,
4928                      const char *errors)
4929{
4930    return unicode_encode_ucs1(p, size, errors, 128);
4931}
4932
4933PyObject *
4934PyUnicode_AsASCIIString(PyObject *unicode)
4935{
4936    if (!PyUnicode_Check(unicode)) {
4937        PyErr_BadArgument();
4938        return NULL;
4939    }
4940    return PyUnicode_EncodeASCII(PyUnicode_AS_UNICODE(unicode),
4941                                 PyUnicode_GET_SIZE(unicode),
4942                                 NULL);
4943}
4944
4945#ifdef HAVE_MBCS
4946
4947/* --- MBCS codecs for Windows -------------------------------------------- */
4948
4949#if SIZEOF_INT < SIZEOF_SIZE_T
4950#define NEED_RETRY
4951#endif
4952
4953/* XXX This code is limited to "true" double-byte encodings, as
4954   a) it assumes an incomplete character consists of a single byte, and
4955   b) IsDBCSLeadByte (probably) does not work for non-DBCS multi-byte
4956   encodings, see IsDBCSLeadByteEx documentation. */
4957
/* Return non-zero if the byte at s[offset] is a DBCS lead byte that really
   starts a character, 0 otherwise.  A byte that IsDBCSLeadByte() accepts
   is only counted when CharPrev() shows that it is not the trail byte of
   the preceding character. */
static int
is_dbcs_lead_byte(const char *s, int offset)
{
    const char *curr = s + offset;
    const char *prev;

    if (!IsDBCSLeadByte(*curr))
        return 0;

    prev = CharPrev(s, curr);
    /* CharPrev returned curr itself */
    if (prev == curr)
        return 1;
    /* the previous character does not start with a lead byte */
    if (!IsDBCSLeadByte(*prev))
        return 1;
    /* the previous character is a complete two-byte character */
    return curr - prev == 2;
}
4969
4970/*
4971 * Decode MBCS string into unicode object. If 'final' is set, converts
4972 * trailing lead-byte too. Returns consumed size if succeed, -1 otherwise.
4973 */
4974static int
4975decode_mbcs(PyUnicodeObject **v,
4976            const char *s, /* MBCS string */
4977            int size, /* sizeof MBCS string */
4978            int final,
4979            const char *errors)
4980{
4981    Py_UNICODE *p;
4982    Py_ssize_t n;
4983    DWORD usize;
4984    DWORD flags;
4985
4986    assert(size >= 0);
4987
4988    /* check and handle 'errors' arg */
4989    if (errors==NULL || strcmp(errors, "strict")==0)
4990        flags = MB_ERR_INVALID_CHARS;
4991    else if (strcmp(errors, "ignore")==0)
4992        flags = 0;
4993    else {
4994        PyErr_Format(PyExc_ValueError,
4995                     "mbcs encoding does not support errors='%s'",
4996                     errors);
4997        return -1;
4998    }
4999
5000    /* Skip trailing lead-byte unless 'final' is set */
5001    if (!final && size >= 1 && is_dbcs_lead_byte(s, size - 1))
5002        --size;
5003
5004    /* First get the size of the result */
5005    if (size > 0) {
5006        usize = MultiByteToWideChar(CP_ACP, flags, s, size, NULL, 0);
5007        if (usize==0)
5008            goto mbcs_decode_error;
5009    } else
5010        usize = 0;
5011
5012    if (*v == NULL) {
5013        /* Create unicode object */
5014        *v = _PyUnicode_New(usize);
5015        if (*v == NULL)
5016            return -1;
5017        n = 0;
5018    }
5019    else {
5020        /* Extend unicode object */
5021        n = PyUnicode_GET_SIZE(*v);
5022        if (_PyUnicode_Resize(v, n + usize) < 0)
5023            return -1;
5024    }
5025
5026    /* Do the conversion */
5027    if (usize > 0) {
5028        p = PyUnicode_AS_UNICODE(*v) + n;
5029        if (0 == MultiByteToWideChar(CP_ACP, flags, s, size, p, usize)) {
5030            goto mbcs_decode_error;
5031        }
5032    }
5033    return size;
5034
5035mbcs_decode_error:
5036    /* If the last error was ERROR_NO_UNICODE_TRANSLATION, then
5037       we raise a UnicodeDecodeError - else it is a 'generic'
5038       windows error
5039     */
5040    if (GetLastError()==ERROR_NO_UNICODE_TRANSLATION) {
5041        /* Ideally, we should get reason from FormatMessage - this
5042           is the Windows 2000 English version of the message
5043        */
5044        PyObject *exc = NULL;
5045        const char *reason = "No mapping for the Unicode character exists "
5046                             "in the target multi-byte code page.";
5047        make_decode_exception(&exc, "mbcs", s, size, 0, 0, reason);
5048        if (exc != NULL) {
5049            PyCodec_StrictErrors(exc);
5050            Py_DECREF(exc);
5051        }
5052    } else {
5053        PyErr_SetFromWindowsErrWithFilename(0, NULL);
5054    }
5055    return -1;
5056}
5057
5058PyObject *
5059PyUnicode_DecodeMBCSStateful(const char *s,
5060                             Py_ssize_t size,
5061                             const char *errors,
5062                             Py_ssize_t *consumed)
5063{
5064    PyUnicodeObject *v = NULL;
5065    int done;
5066
5067    if (consumed)
5068        *consumed = 0;
5069
5070#ifdef NEED_RETRY
5071  retry:
5072    if (size > INT_MAX)
5073        done = decode_mbcs(&v, s, INT_MAX, 0, errors);
5074    else
5075#endif
5076        done = decode_mbcs(&v, s, (int)size, !consumed, errors);
5077
5078    if (done < 0) {
5079        Py_XDECREF(v);
5080        return NULL;
5081    }
5082
5083    if (consumed)
5084        *consumed += done;
5085
5086#ifdef NEED_RETRY
5087    if (size > INT_MAX) {
5088        s += done;
5089        size -= done;
5090        goto retry;
5091    }
5092#endif
5093
5094    return (PyObject *)v;
5095}
5096
5097PyObject *
5098PyUnicode_DecodeMBCS(const char *s,
5099                     Py_ssize_t size,
5100                     const char *errors)
5101{
5102    return PyUnicode_DecodeMBCSStateful(s, size, errors, NULL);
5103}
5104
5105/*
5106 * Convert unicode into string object (MBCS).
5107 * Returns 0 if succeed, -1 otherwise.
5108 */
5109static int
5110encode_mbcs(PyObject **repr,
5111            const Py_UNICODE *p, /* unicode */
5112            int size, /* size of unicode */
5113            const char* errors)
5114{
5115    BOOL usedDefaultChar = FALSE;
5116    BOOL *pusedDefaultChar;
5117    int mbcssize;
5118    Py_ssize_t n;
5119    PyObject *exc = NULL;
5120    DWORD flags;
5121
5122    assert(size >= 0);
5123
5124    /* check and handle 'errors' arg */
5125    if (errors==NULL || strcmp(errors, "strict")==0) {
5126        flags = WC_NO_BEST_FIT_CHARS;
5127        pusedDefaultChar = &usedDefaultChar;
5128    } else if (strcmp(errors, "replace")==0) {
5129        flags = 0;
5130        pusedDefaultChar = NULL;
5131    } else {
5132         PyErr_Format(PyExc_ValueError,
5133                      "mbcs encoding does not support errors='%s'",
5134                      errors);
5135         return -1;
5136    }
5137
5138    /* First get the size of the result */
5139    if (size > 0) {
5140        mbcssize = WideCharToMultiByte(CP_ACP, flags, p, size, NULL, 0,
5141                                       NULL, pusedDefaultChar);
5142        if (mbcssize == 0) {
5143            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5144            return -1;
5145        }
5146        /* If we used a default char, then we failed! */
5147        if (pusedDefaultChar && *pusedDefaultChar)
5148            goto mbcs_encode_error;
5149    } else {
5150        mbcssize = 0;
5151    }
5152
5153    if (*repr == NULL) {
5154        /* Create string object */
5155        *repr = PyBytes_FromStringAndSize(NULL, mbcssize);
5156        if (*repr == NULL)
5157            return -1;
5158        n = 0;
5159    }
5160    else {
5161        /* Extend string object */
5162        n = PyBytes_Size(*repr);
5163        if (_PyBytes_Resize(repr, n + mbcssize) < 0)
5164            return -1;
5165    }
5166
5167    /* Do the conversion */
5168    if (size > 0) {
5169        char *s = PyBytes_AS_STRING(*repr) + n;
5170        if (0 == WideCharToMultiByte(CP_ACP, flags, p, size, s, mbcssize,
5171                                     NULL, pusedDefaultChar)) {
5172            PyErr_SetFromWindowsErrWithFilename(0, NULL);
5173            return -1;
5174        }
5175        if (pusedDefaultChar && *pusedDefaultChar)
5176            goto mbcs_encode_error;
5177    }
5178    return 0;
5179
5180mbcs_encode_error:
5181    raise_encode_exception(&exc, "mbcs", p, size, 0, 0, "invalid character");
5182    Py_XDECREF(exc);
5183    return -1;
5184}
5185
5186PyObject *
5187PyUnicode_EncodeMBCS(const Py_UNICODE *p,
5188                     Py_ssize_t size,
5189                     const char *errors)
5190{
5191    PyObject *repr = NULL;
5192    int ret;
5193
5194#ifdef NEED_RETRY
5195  retry:
5196    if (size > INT_MAX)
5197        ret = encode_mbcs(&repr, p, INT_MAX, errors);
5198    else
5199#endif
5200        ret = encode_mbcs(&repr, p, (int)size, errors);
5201
5202    if (ret < 0) {
5203        Py_XDECREF(repr);
5204        return NULL;
5205    }
5206
5207#ifdef NEED_RETRY
5208    if (size > INT_MAX) {
5209        p += INT_MAX;
5210        size -= INT_MAX;
5211        goto retry;
5212    }
5213#endif
5214
5215    return repr;
5216}
5217
5218PyObject *
5219PyUnicode_AsMBCSString(PyObject *unicode)
5220{
5221    if (!PyUnicode_Check(unicode)) {
5222        PyErr_BadArgument();
5223        return NULL;
5224    }
5225    return PyUnicode_EncodeMBCS(PyUnicode_AS_UNICODE(unicode),
5226                                PyUnicode_GET_SIZE(unicode),
5227                                NULL);
5228}
5229
5230#undef NEED_RETRY
5231
5232#endif /* HAVE_MBCS */
5233
5234/* --- Character Mapping Codec -------------------------------------------- */
5235
5236PyObject *
5237PyUnicode_DecodeCharmap(const char *s,
5238                        Py_ssize_t size,
5239                        PyObject *mapping,
5240                        const char *errors)
5241{
5242    const char *starts = s;
5243    Py_ssize_t startinpos;
5244    Py_ssize_t endinpos;
5245    Py_ssize_t outpos;
5246    const char *e;
5247    PyUnicodeObject *v;
5248    Py_UNICODE *p;
5249    Py_ssize_t extrachars = 0;
5250    PyObject *errorHandler = NULL;
5251    PyObject *exc = NULL;
5252    Py_UNICODE *mapstring = NULL;
5253    Py_ssize_t maplen = 0;
5254
5255    /* Default to Latin-1 */
5256    if (mapping == NULL)
5257        return PyUnicode_DecodeLatin1(s, size, errors);
5258
5259    v = _PyUnicode_New(size);
5260    if (v == NULL)
5261        goto onError;
5262    if (size == 0)
5263        return (PyObject *)v;
5264    p = PyUnicode_AS_UNICODE(v);
5265    e = s + size;
5266    if (PyUnicode_CheckExact(mapping)) {
5267        mapstring = PyUnicode_AS_UNICODE(mapping);
5268        maplen = PyUnicode_GET_SIZE(mapping);
5269        while (s < e) {
5270            unsigned char ch = *s;
5271            Py_UNICODE x = 0xfffe; /* illegal value */
5272
5273            if (ch < maplen)
5274                x = mapstring[ch];
5275
5276            if (x == 0xfffe) {
5277                /* undefined mapping */
5278                outpos = p-PyUnicode_AS_UNICODE(v);
5279                startinpos = s-starts;
5280                endinpos = startinpos+1;
5281                if (unicode_decode_call_errorhandler(
5282                        errors, &errorHandler,
5283                        "charmap", "character maps to <undefined>",
5284                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5285                        &v, &outpos, &p)) {
5286                    goto onError;
5287                }
5288                continue;
5289            }
5290            *p++ = x;
5291            ++s;
5292        }
5293    }
5294    else {
5295        while (s < e) {
5296            unsigned char ch = *s;
5297            PyObject *w, *x;
5298
5299            /* Get mapping (char ordinal -> integer, Unicode char or None) */
5300            w = PyLong_FromLong((long)ch);
5301            if (w == NULL)
5302                goto onError;
5303            x = PyObject_GetItem(mapping, w);
5304            Py_DECREF(w);
5305            if (x == NULL) {
5306                if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5307                    /* No mapping found means: mapping is undefined. */
5308                    PyErr_Clear();
5309                    x = Py_None;
5310                    Py_INCREF(x);
5311                } else
5312                    goto onError;
5313            }
5314
5315            /* Apply mapping */
5316            if (PyLong_Check(x)) {
5317                long value = PyLong_AS_LONG(x);
5318                if (value < 0 || value > 65535) {
5319                    PyErr_SetString(PyExc_TypeError,
5320                                    "character mapping must be in range(65536)");
5321                    Py_DECREF(x);
5322                    goto onError;
5323                }
5324                *p++ = (Py_UNICODE)value;
5325            }
5326            else if (x == Py_None) {
5327                /* undefined mapping */
5328                outpos = p-PyUnicode_AS_UNICODE(v);
5329                startinpos = s-starts;
5330                endinpos = startinpos+1;
5331                if (unicode_decode_call_errorhandler(
5332                        errors, &errorHandler,
5333                        "charmap", "character maps to <undefined>",
5334                        &starts, &e, &startinpos, &endinpos, &exc, &s,
5335                        &v, &outpos, &p)) {
5336                    Py_DECREF(x);
5337                    goto onError;
5338                }
5339                Py_DECREF(x);
5340                continue;
5341            }
5342            else if (PyUnicode_Check(x)) {
5343                Py_ssize_t targetsize = PyUnicode_GET_SIZE(x);
5344
5345                if (targetsize == 1)
5346                    /* 1-1 mapping */
5347                    *p++ = *PyUnicode_AS_UNICODE(x);
5348
5349                else if (targetsize > 1) {
5350                    /* 1-n mapping */
5351                    if (targetsize > extrachars) {
5352                        /* resize first */
5353                        Py_ssize_t oldpos = p - PyUnicode_AS_UNICODE(v);
5354                        Py_ssize_t needed = (targetsize - extrachars) + \
5355                            (targetsize << 2);
5356                        extrachars += needed;
5357                        /* XXX overflow detection missing */
5358                        if (_PyUnicode_Resize(&v,
5359                                              PyUnicode_GET_SIZE(v) + needed) < 0) {
5360                            Py_DECREF(x);
5361                            goto onError;
5362                        }
5363                        p = PyUnicode_AS_UNICODE(v) + oldpos;
5364                    }
5365                    Py_UNICODE_COPY(p,
5366                                    PyUnicode_AS_UNICODE(x),
5367                                    targetsize);
5368                    p += targetsize;
5369                    extrachars -= targetsize;
5370                }
5371                /* 1-0 mapping: skip the character */
5372            }
5373            else {
5374                /* wrong return value */
5375                PyErr_SetString(PyExc_TypeError,
5376                                "character mapping must return integer, None or str");
5377                Py_DECREF(x);
5378                goto onError;
5379            }
5380            Py_DECREF(x);
5381            ++s;
5382        }
5383    }
5384    if (p - PyUnicode_AS_UNICODE(v) < PyUnicode_GET_SIZE(v))
5385        if (_PyUnicode_Resize(&v, p - PyUnicode_AS_UNICODE(v)) < 0)
5386            goto onError;
5387    Py_XDECREF(errorHandler);
5388    Py_XDECREF(exc);
5389    return (PyObject *)v;
5390
5391  onError:
5392    Py_XDECREF(errorHandler);
5393    Py_XDECREF(exc);
5394    Py_XDECREF(v);
5395    return NULL;
5396}
5397
5398/* Charmap encoding: the lookup table */
5399
5400struct encoding_map {
5401    PyObject_HEAD
5402    unsigned char level1[32];
5403    int count2, count3;
5404    unsigned char level23[1];
5405};
5406
5407static PyObject*
5408encoding_map_size(PyObject *obj, PyObject* args)
5409{
5410    struct encoding_map *map = (struct encoding_map*)obj;
5411    return PyLong_FromLong(sizeof(*map) - 1 + 16*map->count2 +
5412                           128*map->count3);
5413}
5414
5415static PyMethodDef encoding_map_methods[] = {
5416    {"size", encoding_map_size, METH_NOARGS,
5417     PyDoc_STR("Return the size (in bytes) of this object") },
5418    { 0 }
5419};
5420
5421static void
5422encoding_map_dealloc(PyObject* o)
5423{
5424    PyObject_FREE(o);
5425}
5426
5427static PyTypeObject EncodingMapType = {
5428    PyVarObject_HEAD_INIT(NULL, 0)
5429    "EncodingMap",          /*tp_name*/
5430    sizeof(struct encoding_map),   /*tp_basicsize*/
5431    0,                      /*tp_itemsize*/
5432    /* methods */
5433    encoding_map_dealloc,   /*tp_dealloc*/
5434    0,                      /*tp_print*/
5435    0,                      /*tp_getattr*/
5436    0,                      /*tp_setattr*/
5437    0,                      /*tp_reserved*/
5438    0,                      /*tp_repr*/
5439    0,                      /*tp_as_number*/
5440    0,                      /*tp_as_sequence*/
5441    0,                      /*tp_as_mapping*/
5442    0,                      /*tp_hash*/
5443    0,                      /*tp_call*/
5444    0,                      /*tp_str*/
5445    0,                      /*tp_getattro*/
5446    0,                      /*tp_setattro*/
5447    0,                      /*tp_as_buffer*/
5448    Py_TPFLAGS_DEFAULT,     /*tp_flags*/
5449    0,                      /*tp_doc*/
5450    0,                      /*tp_traverse*/
5451    0,                      /*tp_clear*/
5452    0,                      /*tp_richcompare*/
5453    0,                      /*tp_weaklistoffset*/
5454    0,                      /*tp_iter*/
5455    0,                      /*tp_iternext*/
5456    encoding_map_methods,   /*tp_methods*/
5457    0,                      /*tp_members*/
5458    0,                      /*tp_getset*/
5459    0,                      /*tp_base*/
5460    0,                      /*tp_dict*/
5461    0,                      /*tp_descr_get*/
5462    0,                      /*tp_descr_set*/
5463    0,                      /*tp_dictoffset*/
5464    0,                      /*tp_init*/
5465    0,                      /*tp_alloc*/
5466    0,                      /*tp_new*/
5467    0,                      /*tp_free*/
5468    0,                      /*tp_is_gc*/
5469};
5470
5471PyObject*
5472PyUnicode_BuildEncodingMap(PyObject* string)
5473{
5474    Py_UNICODE *decode;
5475    PyObject *result;
5476    struct encoding_map *mresult;
5477    int i;
5478    int need_dict = 0;
5479    unsigned char level1[32];
5480    unsigned char level2[512];
5481    unsigned char *mlevel1, *mlevel2, *mlevel3;
5482    int count2 = 0, count3 = 0;
5483
5484    if (!PyUnicode_Check(string) || PyUnicode_GetSize(string) != 256) {
5485        PyErr_BadArgument();
5486        return NULL;
5487    }
5488    decode = PyUnicode_AS_UNICODE(string);
5489    memset(level1, 0xFF, sizeof level1);
5490    memset(level2, 0xFF, sizeof level2);
5491
5492    /* If there isn't a one-to-one mapping of NULL to \0,
5493       or if there are non-BMP characters, we need to use
5494       a mapping dictionary. */
5495    if (decode[0] != 0)
5496        need_dict = 1;
5497    for (i = 1; i < 256; i++) {
5498        int l1, l2;
5499        if (decode[i] == 0
5500#ifdef Py_UNICODE_WIDE
5501            || decode[i] > 0xFFFF
5502#endif
5503            ) {
5504            need_dict = 1;
5505            break;
5506        }
5507        if (decode[i] == 0xFFFE)
5508            /* unmapped character */
5509            continue;
5510        l1 = decode[i] >> 11;
5511        l2 = decode[i] >> 7;
5512        if (level1[l1] == 0xFF)
5513            level1[l1] = count2++;
5514        if (level2[l2] == 0xFF)
5515            level2[l2] = count3++;
5516    }
5517
5518    if (count2 >= 0xFF || count3 >= 0xFF)
5519        need_dict = 1;
5520
5521    if (need_dict) {
5522        PyObject *result = PyDict_New();
5523        PyObject *key, *value;
5524        if (!result)
5525            return NULL;
5526        for (i = 0; i < 256; i++) {
5527            key = PyLong_FromLong(decode[i]);
5528            value = PyLong_FromLong(i);
5529            if (!key || !value)
5530                goto failed1;
5531            if (PyDict_SetItem(result, key, value) == -1)
5532                goto failed1;
5533            Py_DECREF(key);
5534            Py_DECREF(value);
5535        }
5536        return result;
5537      failed1:
5538        Py_XDECREF(key);
5539        Py_XDECREF(value);
5540        Py_DECREF(result);
5541        return NULL;
5542    }
5543
5544    /* Create a three-level trie */
5545    result = PyObject_MALLOC(sizeof(struct encoding_map) +
5546                             16*count2 + 128*count3 - 1);
5547    if (!result)
5548        return PyErr_NoMemory();
5549    PyObject_Init(result, &EncodingMapType);
5550    mresult = (struct encoding_map*)result;
5551    mresult->count2 = count2;
5552    mresult->count3 = count3;
5553    mlevel1 = mresult->level1;
5554    mlevel2 = mresult->level23;
5555    mlevel3 = mresult->level23 + 16*count2;
5556    memcpy(mlevel1, level1, 32);
5557    memset(mlevel2, 0xFF, 16*count2);
5558    memset(mlevel3, 0, 128*count3);
5559    count3 = 0;
5560    for (i = 1; i < 256; i++) {
5561        int o1, o2, o3, i2, i3;
5562        if (decode[i] == 0xFFFE)
5563            /* unmapped character */
5564            continue;
5565        o1 = decode[i]>>11;
5566        o2 = (decode[i]>>7) & 0xF;
5567        i2 = 16*mlevel1[o1] + o2;
5568        if (mlevel2[i2] == 0xFF)
5569            mlevel2[i2] = count3++;
5570        o3 = decode[i] & 0x7F;
5571        i3 = 128*mlevel2[i2] + o3;
5572        mlevel3[i3] = i;
5573    }
5574    return result;
5575}
5576
5577static int
5578encoding_map_lookup(Py_UNICODE c, PyObject *mapping)
5579{
5580    struct encoding_map *map = (struct encoding_map*)mapping;
5581    int l1 = c>>11;
5582    int l2 = (c>>7) & 0xF;
5583    int l3 = c & 0x7F;
5584    int i;
5585
5586#ifdef Py_UNICODE_WIDE
5587    if (c > 0xFFFF) {
5588        return -1;
5589    }
5590#endif
5591    if (c == 0)
5592        return 0;
5593    /* level 1*/
5594    i = map->level1[l1];
5595    if (i == 0xFF) {
5596        return -1;
5597    }
5598    /* level 2*/
5599    i = map->level23[16*i+l2];
5600    if (i == 0xFF) {
5601        return -1;
5602    }
5603    /* level 3 */
5604    i = map->level23[16*map->count2 + 128*i + l3];
5605    if (i == 0) {
5606        return -1;
5607    }
5608    return i;
5609}
5610
5611/* Lookup the character ch in the mapping. If the character
5612   can't be found, Py_None is returned (or NULL, if another
5613   error occurred). */
5614static PyObject *
5615charmapencode_lookup(Py_UNICODE c, PyObject *mapping)
5616{
5617    PyObject *w = PyLong_FromLong((long)c);
5618    PyObject *x;
5619
5620    if (w == NULL)
5621        return NULL;
5622    x = PyObject_GetItem(mapping, w);
5623    Py_DECREF(w);
5624    if (x == NULL) {
5625        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
5626            /* No mapping found means: mapping is undefined. */
5627            PyErr_Clear();
5628            x = Py_None;
5629            Py_INCREF(x);
5630            return x;
5631        } else
5632            return NULL;
5633    }
5634    else if (x == Py_None)
5635        return x;
5636    else if (PyLong_Check(x)) {
5637        long value = PyLong_AS_LONG(x);
5638        if (value < 0 || value > 255) {
5639            PyErr_SetString(PyExc_TypeError,
5640                            "character mapping must be in range(256)");
5641            Py_DECREF(x);
5642            return NULL;
5643        }
5644        return x;
5645    }
5646    else if (PyBytes_Check(x))
5647        return x;
5648    else {
5649        /* wrong return value */
5650        PyErr_Format(PyExc_TypeError,
5651                     "character mapping must return integer, bytes or None, not %.400s",
5652                     x->ob_type->tp_name);
5653        Py_DECREF(x);
5654        return NULL;
5655    }
5656}
5657
5658static int
5659charmapencode_resize(PyObject **outobj, Py_ssize_t *outpos, Py_ssize_t requiredsize)
5660{
5661    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5662    /* exponentially overallocate to minimize reallocations */
5663    if (requiredsize < 2*outsize)
5664        requiredsize = 2*outsize;
5665    if (_PyBytes_Resize(outobj, requiredsize))
5666        return -1;
5667    return 0;
5668}
5669
/* Outcome of charmapencode_output():
   enc_SUCCESS   - the character was encoded and written to the output,
   enc_FAILED    - the mapping is undefined for the character,
   enc_EXCEPTION - an exception was raised. */
typedef enum charmapencode_result {
    enc_SUCCESS,
    enc_FAILED,
    enc_EXCEPTION
} charmapencode_result;
5673/* lookup the character, put the result in the output string and adjust
5674   various state variables. Resize the output bytes object if not enough
5675   space is available. Return a new reference to the object that
5676   was put in the output buffer, or Py_None, if the mapping was undefined
5677   (in which case no character was written) or NULL, if a
5678   reallocation error occurred. The caller must decref the result */
5679static charmapencode_result
5680charmapencode_output(Py_UNICODE c, PyObject *mapping,
5681                     PyObject **outobj, Py_ssize_t *outpos)
5682{
5683    PyObject *rep;
5684    char *outstart;
5685    Py_ssize_t outsize = PyBytes_GET_SIZE(*outobj);
5686
5687    if (Py_TYPE(mapping) == &EncodingMapType) {
5688        int res = encoding_map_lookup(c, mapping);
5689        Py_ssize_t requiredsize = *outpos+1;
5690        if (res == -1)
5691            return enc_FAILED;
5692        if (outsize<requiredsize)
5693            if (charmapencode_resize(outobj, outpos, requiredsize))
5694                return enc_EXCEPTION;
5695        outstart = PyBytes_AS_STRING(*outobj);
5696        outstart[(*outpos)++] = (char)res;
5697        return enc_SUCCESS;
5698    }
5699
5700    rep = charmapencode_lookup(c, mapping);
5701    if (rep==NULL)
5702        return enc_EXCEPTION;
5703    else if (rep==Py_None) {
5704        Py_DECREF(rep);
5705        return enc_FAILED;
5706    } else {
5707        if (PyLong_Check(rep)) {
5708            Py_ssize_t requiredsize = *outpos+1;
5709            if (outsize<requiredsize)
5710                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5711                    Py_DECREF(rep);
5712                    return enc_EXCEPTION;
5713                }
5714            outstart = PyBytes_AS_STRING(*outobj);
5715            outstart[(*outpos)++] = (char)PyLong_AS_LONG(rep);
5716        }
5717        else {
5718            const char *repchars = PyBytes_AS_STRING(rep);
5719            Py_ssize_t repsize = PyBytes_GET_SIZE(rep);
5720            Py_ssize_t requiredsize = *outpos+repsize;
5721            if (outsize<requiredsize)
5722                if (charmapencode_resize(outobj, outpos, requiredsize)) {
5723                    Py_DECREF(rep);
5724                    return enc_EXCEPTION;
5725                }
5726            outstart = PyBytes_AS_STRING(*outobj);
5727            memcpy(outstart + *outpos, repchars, repsize);
5728            *outpos += repsize;
5729        }
5730    }
5731    Py_DECREF(rep);
5732    return enc_SUCCESS;
5733}
5734
5735/* handle an error in PyUnicode_EncodeCharmap
5736   Return 0 on success, -1 on error */
5737static int
5738charmap_encoding_error(
5739    const Py_UNICODE *p, Py_ssize_t size, Py_ssize_t *inpos, PyObject *mapping,
5740    PyObject **exceptionObject,
5741    int *known_errorHandler, PyObject **errorHandler, const char *errors,
5742    PyObject **res, Py_ssize_t *respos)
5743{
5744    PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
5745    Py_ssize_t repsize;
5746    Py_ssize_t newpos;
5747    Py_UNICODE *uni2;
5748    /* startpos for collecting unencodable chars */
5749    Py_ssize_t collstartpos = *inpos;
5750    Py_ssize_t collendpos = *inpos+1;
5751    Py_ssize_t collpos;
5752    char *encoding = "charmap";
5753    char *reason = "character maps to <undefined>";
5754    charmapencode_result x;
5755
5756    /* find all unencodable characters */
5757    while (collendpos < size) {
5758        PyObject *rep;
5759        if (Py_TYPE(mapping) == &EncodingMapType) {
5760            int res = encoding_map_lookup(p[collendpos], mapping);
5761            if (res != -1)
5762                break;
5763            ++collendpos;
5764            continue;
5765        }
5766
5767        rep = charmapencode_lookup(p[collendpos], mapping);
5768        if (rep==NULL)
5769            return -1;
5770        else if (rep!=Py_None) {
5771            Py_DECREF(rep);
5772            break;
5773        }
5774        Py_DECREF(rep);
5775        ++collendpos;
5776    }
5777    /* cache callback name lookup
5778     * (if not done yet, i.e. it's the first error) */
5779    if (*known_errorHandler==-1) {
5780        if ((errors==NULL) || (!strcmp(errors, "strict")))
5781            *known_errorHandler = 1;
5782        else if (!strcmp(errors, "replace"))
5783            *known_errorHandler = 2;
5784        else if (!strcmp(errors, "ignore"))
5785            *known_errorHandler = 3;
5786        else if (!strcmp(errors, "xmlcharrefreplace"))
5787            *known_errorHandler = 4;
5788        else
5789            *known_errorHandler = 0;
5790    }
5791    switch (*known_errorHandler) {
5792    case 1: /* strict */
5793        raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5794        return -1;
5795    case 2: /* replace */
5796        for (collpos = collstartpos; collpos<collendpos; ++collpos) {
5797            x = charmapencode_output('?', mapping, res, respos);
5798            if (x==enc_EXCEPTION) {
5799                return -1;
5800            }
5801            else if (x==enc_FAILED) {
5802                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5803                return -1;
5804            }
5805        }
5806        /* fall through */
5807    case 3: /* ignore */
5808        *inpos = collendpos;
5809        break;
5810    case 4: /* xmlcharrefreplace */
5811        /* generate replacement (temporarily (mis)uses p) */
5812        for (collpos = collstartpos; collpos < collendpos; ++collpos) {
5813            char buffer[2+29+1+1];
5814            char *cp;
5815            sprintf(buffer, "&#%d;", (int)p[collpos]);
5816            for (cp = buffer; *cp; ++cp) {
5817                x = charmapencode_output(*cp, mapping, res, respos);
5818                if (x==enc_EXCEPTION)
5819                    return -1;
5820                else if (x==enc_FAILED) {
5821                    raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5822                    return -1;
5823                }
5824            }
5825        }
5826        *inpos = collendpos;
5827        break;
5828    default:
5829        repunicode = unicode_encode_call_errorhandler(errors, errorHandler,
5830                                                      encoding, reason, p, size, exceptionObject,
5831                                                      collstartpos, collendpos, &newpos);
5832        if (repunicode == NULL)
5833            return -1;
5834        if (PyBytes_Check(repunicode)) {
5835            /* Directly copy bytes result to output. */
5836            Py_ssize_t outsize = PyBytes_Size(*res);
5837            Py_ssize_t requiredsize;
5838            repsize = PyBytes_Size(repunicode);
5839            requiredsize = *respos + repsize;
5840            if (requiredsize > outsize)
5841                /* Make room for all additional bytes. */
5842                if (charmapencode_resize(res, respos, requiredsize)) {
5843                    Py_DECREF(repunicode);
5844                    return -1;
5845                }
5846            memcpy(PyBytes_AsString(*res) + *respos,
5847                   PyBytes_AsString(repunicode),  repsize);
5848            *respos += repsize;
5849            *inpos = newpos;
5850            Py_DECREF(repunicode);
5851            break;
5852        }
5853        /* generate replacement  */
5854        repsize = PyUnicode_GET_SIZE(repunicode);
5855        for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
5856            x = charmapencode_output(*uni2, mapping, res, respos);
5857            if (x==enc_EXCEPTION) {
5858                return -1;
5859            }
5860            else if (x==enc_FAILED) {
5861                Py_DECREF(repunicode);
5862                raise_encode_exception(exceptionObject, encoding, p, size, collstartpos, collendpos, reason);
5863                return -1;
5864            }
5865        }
5866        *inpos = newpos;
5867        Py_DECREF(repunicode);
5868    }
5869    return 0;
5870}
5871
5872PyObject *
5873PyUnicode_EncodeCharmap(const Py_UNICODE *p,
5874                        Py_ssize_t size,
5875                        PyObject *mapping,
5876                        const char *errors)
5877{
5878    /* output object */
5879    PyObject *res = NULL;
5880    /* current input position */
5881    Py_ssize_t inpos = 0;
5882    /* current output position */
5883    Py_ssize_t respos = 0;
5884    PyObject *errorHandler = NULL;
5885    PyObject *exc = NULL;
5886    /* the following variable is used for caching string comparisons
5887     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
5888     * 3=ignore, 4=xmlcharrefreplace */
5889    int known_errorHandler = -1;
5890
5891    /* Default to Latin-1 */
5892    if (mapping == NULL)
5893        return PyUnicode_EncodeLatin1(p, size, errors);
5894
5895    /* allocate enough for a simple encoding without
5896       replacements, if we need more, we'll resize */
5897    res = PyBytes_FromStringAndSize(NULL, size);
5898    if (res == NULL)
5899        goto onError;
5900    if (size == 0)
5901        return res;
5902
5903    while (inpos<size) {
5904        /* try to encode it */
5905        charmapencode_result x = charmapencode_output(p[inpos], mapping, &res, &respos);
5906        if (x==enc_EXCEPTION) /* error */
5907            goto onError;
5908        if (x==enc_FAILED) { /* unencodable character */
5909            if (charmap_encoding_error(p, size, &inpos, mapping,
5910                                       &exc,
5911                                       &known_errorHandler, &errorHandler, errors,
5912                                       &res, &respos)) {
5913                goto onError;
5914            }
5915        }
5916        else
5917            /* done with this character => adjust input position */
5918            ++inpos;
5919    }
5920
5921    /* Resize if we allocated to much */
5922    if (respos<PyBytes_GET_SIZE(res))
5923        if (_PyBytes_Resize(&res, respos) < 0)
5924            goto onError;
5925
5926    Py_XDECREF(exc);
5927    Py_XDECREF(errorHandler);
5928    return res;
5929
5930  onError:
5931    Py_XDECREF(res);
5932    Py_XDECREF(exc);
5933    Py_XDECREF(errorHandler);
5934    return NULL;
5935}
5936
5937PyObject *
5938PyUnicode_AsCharmapString(PyObject *unicode,
5939                          PyObject *mapping)
5940{
5941    if (!PyUnicode_Check(unicode) || mapping == NULL) {
5942        PyErr_BadArgument();
5943        return NULL;
5944    }
5945    return PyUnicode_EncodeCharmap(PyUnicode_AS_UNICODE(unicode),
5946                                   PyUnicode_GET_SIZE(unicode),
5947                                   mapping,
5948                                   NULL);
5949}
5950
5951/* create or adjust a UnicodeTranslateError */
5952static void
5953make_translate_exception(PyObject **exceptionObject,
5954                         const Py_UNICODE *unicode, Py_ssize_t size,
5955                         Py_ssize_t startpos, Py_ssize_t endpos,
5956                         const char *reason)
5957{
5958    if (*exceptionObject == NULL) {
5959        *exceptionObject = PyUnicodeTranslateError_Create(
5960            unicode, size, startpos, endpos, reason);
5961    }
5962    else {
5963        if (PyUnicodeTranslateError_SetStart(*exceptionObject, startpos))
5964            goto onError;
5965        if (PyUnicodeTranslateError_SetEnd(*exceptionObject, endpos))
5966            goto onError;
5967        if (PyUnicodeTranslateError_SetReason(*exceptionObject, reason))
5968            goto onError;
5969        return;
5970      onError:
5971        Py_DECREF(*exceptionObject);
5972        *exceptionObject = NULL;
5973    }
5974}
5975
5976/* raises a UnicodeTranslateError */
5977static void
5978raise_translate_exception(PyObject **exceptionObject,
5979                          const Py_UNICODE *unicode, Py_ssize_t size,
5980                          Py_ssize_t startpos, Py_ssize_t endpos,
5981                          const char *reason)
5982{
5983    make_translate_exception(exceptionObject,
5984                             unicode, size, startpos, endpos, reason);
5985    if (*exceptionObject != NULL)
5986        PyCodec_StrictErrors(*exceptionObject);
5987}
5988
5989/* error handling callback helper:
5990   build arguments, call the callback and check the arguments,
5991   put the result into newpos and return the replacement string, which
5992   has to be freed by the caller */
5993static PyObject *
5994unicode_translate_call_errorhandler(const char *errors,
5995                                    PyObject **errorHandler,
5996                                    const char *reason,
5997                                    const Py_UNICODE *unicode, Py_ssize_t size, PyObject **exceptionObject,
5998                                    Py_ssize_t startpos, Py_ssize_t endpos,
5999                                    Py_ssize_t *newpos)
6000{
6001    static char *argparse = "O!n;translating error handler must return (str, int) tuple";
6002
6003    Py_ssize_t i_newpos;
6004    PyObject *restuple;
6005    PyObject *resunicode;
6006
6007    if (*errorHandler == NULL) {
6008        *errorHandler = PyCodec_LookupError(errors);
6009        if (*errorHandler == NULL)
6010            return NULL;
6011    }
6012
6013    make_translate_exception(exceptionObject,
6014                             unicode, size, startpos, endpos, reason);
6015    if (*exceptionObject == NULL)
6016        return NULL;
6017
6018    restuple = PyObject_CallFunctionObjArgs(
6019        *errorHandler, *exceptionObject, NULL);
6020    if (restuple == NULL)
6021        return NULL;
6022    if (!PyTuple_Check(restuple)) {
6023        PyErr_SetString(PyExc_TypeError, &argparse[4]);
6024        Py_DECREF(restuple);
6025        return NULL;
6026    }
6027    if (!PyArg_ParseTuple(restuple, argparse, &PyUnicode_Type,
6028                          &resunicode, &i_newpos)) {
6029        Py_DECREF(restuple);
6030        return NULL;
6031    }
6032    if (i_newpos<0)
6033        *newpos = size+i_newpos;
6034    else
6035        *newpos = i_newpos;
6036    if (*newpos<0 || *newpos>size) {
6037        PyErr_Format(PyExc_IndexError, "position %zd from error handler out of bounds", *newpos);
6038        Py_DECREF(restuple);
6039        return NULL;
6040    }
6041    Py_INCREF(resunicode);
6042    Py_DECREF(restuple);
6043    return resunicode;
6044}
6045
6046/* Lookup the character ch in the mapping and put the result in result,
6047   which must be decrefed by the caller.
6048   Return 0 on success, -1 on error */
6049static int
6050charmaptranslate_lookup(Py_UNICODE c, PyObject *mapping, PyObject **result)
6051{
6052    PyObject *w = PyLong_FromLong((long)c);
6053    PyObject *x;
6054
6055    if (w == NULL)
6056        return -1;
6057    x = PyObject_GetItem(mapping, w);
6058    Py_DECREF(w);
6059    if (x == NULL) {
6060        if (PyErr_ExceptionMatches(PyExc_LookupError)) {
6061            /* No mapping found means: use 1:1 mapping. */
6062            PyErr_Clear();
6063            *result = NULL;
6064            return 0;
6065        } else
6066            return -1;
6067    }
6068    else if (x == Py_None) {
6069        *result = x;
6070        return 0;
6071    }
6072    else if (PyLong_Check(x)) {
6073        long value = PyLong_AS_LONG(x);
6074        long max = PyUnicode_GetMax();
6075        if (value < 0 || value > max) {
6076            PyErr_Format(PyExc_TypeError,
6077                         "character mapping must be in range(0x%x)", max+1);
6078            Py_DECREF(x);
6079            return -1;
6080        }
6081        *result = x;
6082        return 0;
6083    }
6084    else if (PyUnicode_Check(x)) {
6085        *result = x;
6086        return 0;
6087    }
6088    else {
6089        /* wrong return value */
6090        PyErr_SetString(PyExc_TypeError,
6091                        "character mapping must return integer, None or str");
6092        Py_DECREF(x);
6093        return -1;
6094    }
6095}
6096/* ensure that *outobj is at least requiredsize characters long,
6097   if not reallocate and adjust various state variables.
6098   Return 0 on success, -1 on error */
6099static int
6100charmaptranslate_makespace(PyObject **outobj, Py_UNICODE **outp,
6101                               Py_ssize_t requiredsize)
6102{
6103    Py_ssize_t oldsize = PyUnicode_GET_SIZE(*outobj);
6104    if (requiredsize > oldsize) {
6105        /* remember old output position */
6106        Py_ssize_t outpos = *outp-PyUnicode_AS_UNICODE(*outobj);
6107        /* exponentially overallocate to minimize reallocations */
6108        if (requiredsize < 2 * oldsize)
6109            requiredsize = 2 * oldsize;
6110        if (PyUnicode_Resize(outobj, requiredsize) < 0)
6111            return -1;
6112        *outp = PyUnicode_AS_UNICODE(*outobj) + outpos;
6113    }
6114    return 0;
6115}
6116/* lookup the character, put the result in the output string and adjust
6117   various state variables. Return a new reference to the object that
6118   was put in the output buffer in *result, or Py_None, if the mapping was
6119   undefined (in which case no character was written).
6120   The called must decref result.
6121   Return 0 on success, -1 on error. */
6122static int
6123charmaptranslate_output(const Py_UNICODE *startinp, const Py_UNICODE *curinp,
6124                        Py_ssize_t insize, PyObject *mapping, PyObject **outobj, Py_UNICODE **outp,
6125                        PyObject **res)
6126{
6127    if (charmaptranslate_lookup(*curinp, mapping, res))
6128        return -1;
6129    if (*res==NULL) {
6130        /* not found => default to 1:1 mapping */
6131        *(*outp)++ = *curinp;
6132    }
6133    else if (*res==Py_None)
6134        ;
6135    else if (PyLong_Check(*res)) {
6136        /* no overflow check, because we know that the space is enough */
6137        *(*outp)++ = (Py_UNICODE)PyLong_AS_LONG(*res);
6138    }
6139    else if (PyUnicode_Check(*res)) {
6140        Py_ssize_t repsize = PyUnicode_GET_SIZE(*res);
6141        if (repsize==1) {
6142            /* no overflow check, because we know that the space is enough */
6143            *(*outp)++ = *PyUnicode_AS_UNICODE(*res);
6144        }
6145        else if (repsize!=0) {
6146            /* more than one character */
6147            Py_ssize_t requiredsize = (*outp-PyUnicode_AS_UNICODE(*outobj)) +
6148                (insize - (curinp-startinp)) +
6149                repsize - 1;
6150            if (charmaptranslate_makespace(outobj, outp, requiredsize))
6151                return -1;
6152            memcpy(*outp, PyUnicode_AS_UNICODE(*res), sizeof(Py_UNICODE)*repsize);
6153            *outp += repsize;
6154        }
6155    }
6156    else
6157        return -1;
6158    return 0;
6159}
6160
6161PyObject *
6162PyUnicode_TranslateCharmap(const Py_UNICODE *p,
6163                           Py_ssize_t size,
6164                           PyObject *mapping,
6165                           const char *errors)
6166{
6167    /* output object */
6168    PyObject *res = NULL;
6169    /* pointers to the beginning and end+1 of input */
6170    const Py_UNICODE *startp = p;
6171    const Py_UNICODE *endp = p + size;
6172    /* pointer into the output */
6173    Py_UNICODE *str;
6174    /* current output position */
6175    Py_ssize_t respos = 0;
6176    char *reason = "character maps to <undefined>";
6177    PyObject *errorHandler = NULL;
6178    PyObject *exc = NULL;
6179    /* the following variable is used for caching string comparisons
6180     * -1=not initialized, 0=unknown, 1=strict, 2=replace,
6181     * 3=ignore, 4=xmlcharrefreplace */
6182    int known_errorHandler = -1;
6183
6184    if (mapping == NULL) {
6185        PyErr_BadArgument();
6186        return NULL;
6187    }
6188
6189    /* allocate enough for a simple 1:1 translation without
6190       replacements, if we need more, we'll resize */
6191    res = PyUnicode_FromUnicode(NULL, size);
6192    if (res == NULL)
6193        goto onError;
6194    if (size == 0)
6195        return res;
6196    str = PyUnicode_AS_UNICODE(res);
6197
6198    while (p<endp) {
6199        /* try to encode it */
6200        PyObject *x = NULL;
6201        if (charmaptranslate_output(startp, p, size, mapping, &res, &str, &x)) {
6202            Py_XDECREF(x);
6203            goto onError;
6204        }
6205        Py_XDECREF(x);
6206        if (x!=Py_None) /* it worked => adjust input pointer */
6207            ++p;
6208        else { /* untranslatable character */
6209            PyObject *repunicode = NULL; /* initialize to prevent gcc warning */
6210            Py_ssize_t repsize;
6211            Py_ssize_t newpos;
6212            Py_UNICODE *uni2;
6213            /* startpos for collecting untranslatable chars */
6214            const Py_UNICODE *collstart = p;
6215            const Py_UNICODE *collend = p+1;
6216            const Py_UNICODE *coll;
6217
6218            /* find all untranslatable characters */
6219            while (collend < endp) {
6220                if (charmaptranslate_lookup(*collend, mapping, &x))
6221                    goto onError;
6222                Py_XDECREF(x);
6223                if (x!=Py_None)
6224                    break;
6225                ++collend;
6226            }
6227            /* cache callback name lookup
6228             * (if not done yet, i.e. it's the first error) */
6229            if (known_errorHandler==-1) {
6230                if ((errors==NULL) || (!strcmp(errors, "strict")))
6231                    known_errorHandler = 1;
6232                else if (!strcmp(errors, "replace"))
6233                    known_errorHandler = 2;
6234                else if (!strcmp(errors, "ignore"))
6235                    known_errorHandler = 3;
6236                else if (!strcmp(errors, "xmlcharrefreplace"))
6237                    known_errorHandler = 4;
6238                else
6239                    known_errorHandler = 0;
6240            }
6241            switch (known_errorHandler) {
6242            case 1: /* strict */
6243                raise_translate_exception(&exc, startp, size, collstart-startp, collend-startp, reason);
6244                goto onError;
6245            case 2: /* replace */
6246                /* No need to check for space, this is a 1:1 replacement */
6247                for (coll = collstart; coll<collend; ++coll)
6248                    *str++ = '?';
6249                /* fall through */
6250            case 3: /* ignore */
6251                p = collend;
6252                break;
6253            case 4: /* xmlcharrefreplace */
6254                /* generate replacement (temporarily (mis)uses p) */
6255                for (p = collstart; p < collend; ++p) {
6256                    char buffer[2+29+1+1];
6257                    char *cp;
6258                    sprintf(buffer, "&#%d;", (int)*p);
6259                    if (charmaptranslate_makespace(&res, &str,
6260                                                   (str-PyUnicode_AS_UNICODE(res))+strlen(buffer)+(endp-collend)))
6261                        goto onError;
6262                    for (cp = buffer; *cp; ++cp)
6263                        *str++ = *cp;
6264                }
6265                p = collend;
6266                break;
6267            default:
6268                repunicode = unicode_translate_call_errorhandler(errors, &errorHandler,
6269                                                                 reason, startp, size, &exc,
6270                                                                 collstart-startp, collend-startp, &newpos);
6271                if (repunicode == NULL)
6272                    goto onError;
6273                /* generate replacement  */
6274                repsize = PyUnicode_GET_SIZE(repunicode);
6275                if (charmaptranslate_makespace(&res, &str,
6276                                               (str-PyUnicode_AS_UNICODE(res))+repsize+(endp-collend))) {
6277                    Py_DECREF(repunicode);
6278                    goto onError;
6279                }
6280                for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2)
6281                    *str++ = *uni2;
6282                p = startp + newpos;
6283                Py_DECREF(repunicode);
6284            }
6285        }
6286    }
6287    /* Resize if we allocated to much */
6288    respos = str-PyUnicode_AS_UNICODE(res);
6289    if (respos<PyUnicode_GET_SIZE(res)) {
6290        if (PyUnicode_Resize(&res, respos) < 0)
6291            goto onError;
6292    }
6293    Py_XDECREF(exc);
6294    Py_XDECREF(errorHandler);
6295    return res;
6296
6297  onError:
6298    Py_XDECREF(res);
6299    Py_XDECREF(exc);
6300    Py_XDECREF(errorHandler);
6301    return NULL;
6302}
6303
6304PyObject *
6305PyUnicode_Translate(PyObject *str,
6306                    PyObject *mapping,
6307                    const char *errors)
6308{
6309    PyObject *result;
6310
6311    str = PyUnicode_FromObject(str);
6312    if (str == NULL)
6313        goto onError;
6314    result = PyUnicode_TranslateCharmap(PyUnicode_AS_UNICODE(str),
6315                                        PyUnicode_GET_SIZE(str),
6316                                        mapping,
6317                                        errors);
6318    Py_DECREF(str);
6319    return result;
6320
6321  onError:
6322    Py_XDECREF(str);
6323    return NULL;
6324}
6325
6326PyObject *
6327PyUnicode_TransformDecimalToASCII(Py_UNICODE *s,
6328                                  Py_ssize_t length)
6329{
6330    PyObject *result;
6331    Py_UNICODE *p; /* write pointer into result */
6332    Py_ssize_t i;
6333    /* Copy to a new string */
6334    result = (PyObject *)_PyUnicode_New(length);
6335    Py_UNICODE_COPY(PyUnicode_AS_UNICODE(result), s, length);
6336    if (result == NULL)
6337        return result;
6338    p = PyUnicode_AS_UNICODE(result);
6339    /* Iterate over code points */
6340    for (i = 0; i < length; i++) {
6341        Py_UNICODE ch =s[i];
6342        if (ch > 127) {
6343            int decimal = Py_UNICODE_TODECIMAL(ch);
6344            if (decimal >= 0)
6345                p[i] = '0' + decimal;
6346        }
6347    }
6348    return result;
6349}
6350/* --- Decimal Encoder ---------------------------------------------------- */
6351
6352int
6353PyUnicode_EncodeDecimal(Py_UNICODE *s,
6354                        Py_ssize_t length,
6355                        char *output,
6356                        const char *errors)
6357{
6358    Py_UNICODE *p, *end;
6359    PyObject *errorHandler = NULL;
6360    PyObject *exc = NULL;
6361    const char *encoding = "decimal";
6362    const char *reason = "invalid decimal Unicode string";
6363    /* the following variable is used for caching string comparisons
6364     * -1=not initialized, 0=unknown, 1=strict, 2=replace, 3=ignore, 4=xmlcharrefreplace */
6365    int known_errorHandler = -1;
6366
6367    if (output == NULL) {
6368        PyErr_BadArgument();
6369        return -1;
6370    }
6371
6372    p = s;
6373    end = s + length;
6374    while (p < end) {
6375        register Py_UNICODE ch = *p;
6376        int decimal;
6377        PyObject *repunicode;
6378        Py_ssize_t repsize;
6379        Py_ssize_t newpos;
6380        Py_UNICODE *uni2;
6381        Py_UNICODE *collstart;
6382        Py_UNICODE *collend;
6383
6384        if (Py_UNICODE_ISSPACE(ch)) {
6385            *output++ = ' ';
6386            ++p;
6387            continue;
6388        }
6389        decimal = Py_UNICODE_TODECIMAL(ch);
6390        if (decimal >= 0) {
6391            *output++ = '0' + decimal;
6392            ++p;
6393            continue;
6394        }
6395        if (0 < ch && ch < 256) {
6396            *output++ = (char)ch;
6397            ++p;
6398            continue;
6399        }
6400        /* All other characters are considered unencodable */
6401        collstart = p;
6402        collend = p+1;
6403        while (collend < end) {
6404            if ((0 < *collend && *collend < 256) ||
6405                !Py_UNICODE_ISSPACE(*collend) ||
6406                Py_UNICODE_TODECIMAL(*collend))
6407                break;
6408        }
6409        /* cache callback name lookup
6410         * (if not done yet, i.e. it's the first error) */
6411        if (known_errorHandler==-1) {
6412            if ((errors==NULL) || (!strcmp(errors, "strict")))
6413                known_errorHandler = 1;
6414            else if (!strcmp(errors, "replace"))
6415                known_errorHandler = 2;
6416            else if (!strcmp(errors, "ignore"))
6417                known_errorHandler = 3;
6418            else if (!strcmp(errors, "xmlcharrefreplace"))
6419                known_errorHandler = 4;
6420            else
6421                known_errorHandler = 0;
6422        }
6423        switch (known_errorHandler) {
6424        case 1: /* strict */
6425            raise_encode_exception(&exc, encoding, s, length, collstart-s, collend-s, reason);
6426            goto onError;
6427        case 2: /* replace */
6428            for (p = collstart; p < collend; ++p)
6429                *output++ = '?';
6430            /* fall through */
6431        case 3: /* ignore */
6432            p = collend;
6433            break;
6434        case 4: /* xmlcharrefreplace */
6435            /* generate replacement (temporarily (mis)uses p) */
6436            for (p = collstart; p < collend; ++p)
6437                output += sprintf(output, "&#%d;", (int)*p);
6438            p = collend;
6439            break;
6440        default:
6441            repunicode = unicode_encode_call_errorhandler(errors, &errorHandler,
6442                                                          encoding, reason, s, length, &exc,
6443                                                          collstart-s, collend-s, &newpos);
6444            if (repunicode == NULL)
6445                goto onError;
6446            if (!PyUnicode_Check(repunicode)) {
6447                /* Byte results not supported, since they have no decimal property. */
6448                PyErr_SetString(PyExc_TypeError, "error handler should return unicode");
6449                Py_DECREF(repunicode);
6450                goto onError;
6451            }
6452            /* generate replacement  */
6453            repsize = PyUnicode_GET_SIZE(repunicode);
6454            for (uni2 = PyUnicode_AS_UNICODE(repunicode); repsize-->0; ++uni2) {
6455                Py_UNICODE ch = *uni2;
6456                if (Py_UNICODE_ISSPACE(ch))
6457                    *output++ = ' ';
6458                else {
6459                    decimal = Py_UNICODE_TODECIMAL(ch);
6460                    if (decimal >= 0)
6461                        *output++ = '0' + decimal;
6462                    else if (0 < ch && ch < 256)
6463                        *output++ = (char)ch;
6464                    else {
6465                        Py_DECREF(repunicode);
6466                        raise_encode_exception(&exc, encoding,
6467                                               s, length, collstart-s, collend-s, reason);
6468                        goto onError;
6469                    }
6470                }
6471            }
6472            p = s + newpos;
6473            Py_DECREF(repunicode);
6474        }
6475    }
6476    /* 0-terminate the output string */
6477    *output++ = '\0';
6478    Py_XDECREF(exc);
6479    Py_XDECREF(errorHandler);
6480    return 0;
6481
6482  onError:
6483    Py_XDECREF(exc);
6484    Py_XDECREF(errorHandler);
6485    return -1;
6486}
6487
6488/* --- Helpers ------------------------------------------------------------ */
6489
6490#include "stringlib/unicodedefs.h"
6491#include "stringlib/fastsearch.h"
6492
6493#include "stringlib/count.h"
6494#include "stringlib/find.h"
6495#include "stringlib/partition.h"
6496#include "stringlib/split.h"
6497
6498#define _Py_InsertThousandsGrouping _PyUnicode_InsertThousandsGrouping
6499#define _Py_InsertThousandsGroupingLocale _PyUnicode_InsertThousandsGroupingLocale
6500#include "stringlib/localeutil.h"
6501
/* Helper macro to fix up start/end slice values against a length len:
   an end beyond len is cut back to len, a negative start or end has len
   added to it and is then floored at 0.  start is not capped at len.

   start and end are assigned to, so they must be lvalues.  All three
   arguments are evaluated more than once and must be free of side
   effects.  The do/while (0) wrapper turns the expansion into a single
   statement so the macro can be used safely as the body of an unbraced
   if/else. */
#define ADJUST_INDICES(start, end, len)         \
    do {                                        \
        if ((end) > (len))                      \
            (end) = (len);                      \
        else if ((end) < 0) {                   \
            (end) += (len);                     \
            if ((end) < 0)                      \
                (end) = 0;                      \
        }                                       \
        if ((start) < 0) {                      \
            (start) += (len);                   \
            if ((start) < 0)                    \
                (start) = 0;                    \
        }                                       \
    } while (0)
6516
6517Py_ssize_t
6518PyUnicode_Count(PyObject *str,
6519                PyObject *substr,
6520                Py_ssize_t start,
6521                Py_ssize_t end)
6522{
6523    Py_ssize_t result;
6524    PyUnicodeObject* str_obj;
6525    PyUnicodeObject* sub_obj;
6526
6527    str_obj = (PyUnicodeObject*) PyUnicode_FromObject(str);
6528    if (!str_obj)
6529        return -1;
6530    sub_obj = (PyUnicodeObject*) PyUnicode_FromObject(substr);
6531    if (!sub_obj) {
6532        Py_DECREF(str_obj);
6533        return -1;
6534    }
6535
6536    ADJUST_INDICES(start, end, str_obj->length);
6537    result = stringlib_count(
6538        str_obj->str + start, end - start, sub_obj->str, sub_obj->length,
6539        PY_SSIZE_T_MAX
6540        );
6541
6542    Py_DECREF(sub_obj);
6543    Py_DECREF(str_obj);
6544
6545    return result;
6546}
6547
6548Py_ssize_t
6549PyUnicode_Find(PyObject *str,
6550               PyObject *sub,
6551               Py_ssize_t start,
6552               Py_ssize_t end,
6553               int direction)
6554{
6555    Py_ssize_t result;
6556
6557    str = PyUnicode_FromObject(str);
6558    if (!str)
6559        return -2;
6560    sub = PyUnicode_FromObject(sub);
6561    if (!sub) {
6562        Py_DECREF(str);
6563        return -2;
6564    }
6565
6566    if (direction > 0)
6567        result = stringlib_find_slice(
6568            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6569            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6570            start, end
6571            );
6572    else
6573        result = stringlib_rfind_slice(
6574            PyUnicode_AS_UNICODE(str), PyUnicode_GET_SIZE(str),
6575            PyUnicode_AS_UNICODE(sub), PyUnicode_GET_SIZE(sub),
6576            start, end
6577            );
6578
6579    Py_DECREF(str);
6580    Py_DECREF(sub);
6581
6582    return result;
6583}
6584
6585static int
6586tailmatch(PyUnicodeObject *self,
6587          PyUnicodeObject *substring,
6588          Py_ssize_t start,
6589          Py_ssize_t end,
6590          int direction)
6591{
6592    if (substring->length == 0)
6593        return 1;
6594
6595    ADJUST_INDICES(start, end, self->length);
6596    end -= substring->length;
6597    if (end < start)
6598        return 0;
6599
6600    if (direction > 0) {
6601        if (Py_UNICODE_MATCH(self, end, substring))
6602            return 1;
6603    } else {
6604        if (Py_UNICODE_MATCH(self, start, substring))
6605            return 1;
6606    }
6607
6608    return 0;
6609}
6610
6611Py_ssize_t
6612PyUnicode_Tailmatch(PyObject *str,
6613                    PyObject *substr,
6614                    Py_ssize_t start,
6615                    Py_ssize_t end,
6616                    int direction)
6617{
6618    Py_ssize_t result;
6619
6620    str = PyUnicode_FromObject(str);
6621    if (str == NULL)
6622        return -1;
6623    substr = PyUnicode_FromObject(substr);
6624    if (substr == NULL) {
6625        Py_DECREF(str);
6626        return -1;
6627    }
6628
6629    result = tailmatch((PyUnicodeObject *)str,
6630                       (PyUnicodeObject *)substr,
6631                       start, end, direction);
6632    Py_DECREF(str);
6633    Py_DECREF(substr);
6634    return result;
6635}
6636
6637/* Apply fixfct filter to the Unicode object self and return a
6638   reference to the modified object */
6639
6640static PyObject *
6641fixup(PyUnicodeObject *self,
6642      int (*fixfct)(PyUnicodeObject *s))
6643{
6644
6645    PyUnicodeObject *u;
6646
6647    u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
6648    if (u == NULL)
6649        return NULL;
6650
6651    Py_UNICODE_COPY(u->str, self->str, self->length);
6652
6653    if (!fixfct(u) && PyUnicode_CheckExact(self)) {
6654        /* fixfct should return TRUE if it modified the buffer. If
6655           FALSE, return a reference to the original buffer instead
6656           (to save space, not time) */
6657        Py_INCREF(self);
6658        Py_DECREF(u);
6659        return (PyObject*) self;
6660    }
6661    return (PyObject*) u;
6662}
6663
6664static int
6665fixupper(PyUnicodeObject *self)
6666{
6667    Py_ssize_t len = self->length;
6668    Py_UNICODE *s = self->str;
6669    int status = 0;
6670
6671    while (len-- > 0) {
6672        register Py_UNICODE ch;
6673
6674        ch = Py_UNICODE_TOUPPER(*s);
6675        if (ch != *s) {
6676            status = 1;
6677            *s = ch;
6678        }
6679        s++;
6680    }
6681
6682    return status;
6683}
6684
6685static int
6686fixlower(PyUnicodeObject *self)
6687{
6688    Py_ssize_t len = self->length;
6689    Py_UNICODE *s = self->str;
6690    int status = 0;
6691
6692    while (len-- > 0) {
6693        register Py_UNICODE ch;
6694
6695        ch = Py_UNICODE_TOLOWER(*s);
6696        if (ch != *s) {
6697            status = 1;
6698            *s = ch;
6699        }
6700        s++;
6701    }
6702
6703    return status;
6704}
6705
6706static int
6707fixswapcase(PyUnicodeObject *self)
6708{
6709    Py_ssize_t len = self->length;
6710    Py_UNICODE *s = self->str;
6711    int status = 0;
6712
6713    while (len-- > 0) {
6714        if (Py_UNICODE_ISUPPER(*s)) {
6715            *s = Py_UNICODE_TOLOWER(*s);
6716            status = 1;
6717        } else if (Py_UNICODE_ISLOWER(*s)) {
6718            *s = Py_UNICODE_TOUPPER(*s);
6719            status = 1;
6720        }
6721        s++;
6722    }
6723
6724    return status;
6725}
6726
6727static int
6728fixcapitalize(PyUnicodeObject *self)
6729{
6730    Py_ssize_t len = self->length;
6731    Py_UNICODE *s = self->str;
6732    int status = 0;
6733
6734    if (len == 0)
6735        return 0;
6736    if (!Py_UNICODE_ISUPPER(*s)) {
6737        *s = Py_UNICODE_TOUPPER(*s);
6738        status = 1;
6739    }
6740    s++;
6741    while (--len > 0) {
6742        if (!Py_UNICODE_ISLOWER(*s)) {
6743            *s = Py_UNICODE_TOLOWER(*s);
6744            status = 1;
6745        }
6746        s++;
6747    }
6748    return status;
6749}
6750
6751static int
6752fixtitle(PyUnicodeObject *self)
6753{
6754    register Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
6755    register Py_UNICODE *e;
6756    int previous_is_cased;
6757
6758    /* Shortcut for single character strings */
6759    if (PyUnicode_GET_SIZE(self) == 1) {
6760        Py_UNICODE ch = Py_UNICODE_TOTITLE(*p);
6761        if (*p != ch) {
6762            *p = ch;
6763            return 1;
6764        }
6765        else
6766            return 0;
6767    }
6768
6769    e = p + PyUnicode_GET_SIZE(self);
6770    previous_is_cased = 0;
6771    for (; p < e; p++) {
6772        register const Py_UNICODE ch = *p;
6773
6774        if (previous_is_cased)
6775            *p = Py_UNICODE_TOLOWER(ch);
6776        else
6777            *p = Py_UNICODE_TOTITLE(ch);
6778
6779        if (Py_UNICODE_ISLOWER(ch) ||
6780            Py_UNICODE_ISUPPER(ch) ||
6781            Py_UNICODE_ISTITLE(ch))
6782            previous_is_cased = 1;
6783        else
6784            previous_is_cased = 0;
6785    }
6786    return 1;
6787}
6788
6789PyObject *
6790PyUnicode_Join(PyObject *separator, PyObject *seq)
6791{
6792    const Py_UNICODE blank = ' ';
6793    const Py_UNICODE *sep = &blank;
6794    Py_ssize_t seplen = 1;
6795    PyUnicodeObject *res = NULL; /* the result */
6796    Py_UNICODE *res_p;       /* pointer to free byte in res's string area */
6797    PyObject *fseq;          /* PySequence_Fast(seq) */
6798    Py_ssize_t seqlen;       /* len(fseq) -- number of items in sequence */
6799    PyObject **items;
6800    PyObject *item;
6801    Py_ssize_t sz, i;
6802
6803    fseq = PySequence_Fast(seq, "");
6804    if (fseq == NULL) {
6805        return NULL;
6806    }
6807
6808    /* NOTE: the following code can't call back into Python code,
6809     * so we are sure that fseq won't be mutated.
6810     */
6811
6812    seqlen = PySequence_Fast_GET_SIZE(fseq);
6813    /* If empty sequence, return u"". */
6814    if (seqlen == 0) {
6815        res = _PyUnicode_New(0);  /* empty sequence; return u"" */
6816        goto Done;
6817    }
6818    items = PySequence_Fast_ITEMS(fseq);
6819    /* If singleton sequence with an exact Unicode, return that. */
6820    if (seqlen == 1) {
6821        item = items[0];
6822        if (PyUnicode_CheckExact(item)) {
6823            Py_INCREF(item);
6824            res = (PyUnicodeObject *)item;
6825            goto Done;
6826        }
6827    }
6828    else {
6829        /* Set up sep and seplen */
6830        if (separator == NULL) {
6831            sep = &blank;
6832            seplen = 1;
6833        }
6834        else {
6835            if (!PyUnicode_Check(separator)) {
6836                PyErr_Format(PyExc_TypeError,
6837                             "separator: expected str instance,"
6838                             " %.80s found",
6839                             Py_TYPE(separator)->tp_name);
6840                goto onError;
6841            }
6842            sep = PyUnicode_AS_UNICODE(separator);
6843            seplen = PyUnicode_GET_SIZE(separator);
6844        }
6845    }
6846
6847    /* There are at least two things to join, or else we have a subclass
6848     * of str in the sequence.
6849     * Do a pre-pass to figure out the total amount of space we'll
6850     * need (sz), and see whether all argument are strings.
6851     */
6852    sz = 0;
6853    for (i = 0; i < seqlen; i++) {
6854        const Py_ssize_t old_sz = sz;
6855        item = items[i];
6856        if (!PyUnicode_Check(item)) {
6857            PyErr_Format(PyExc_TypeError,
6858                         "sequence item %zd: expected str instance,"
6859                         " %.80s found",
6860                         i, Py_TYPE(item)->tp_name);
6861            goto onError;
6862        }
6863        sz += PyUnicode_GET_SIZE(item);
6864        if (i != 0)
6865            sz += seplen;
6866        if (sz < old_sz || sz > PY_SSIZE_T_MAX) {
6867            PyErr_SetString(PyExc_OverflowError,
6868                            "join() result is too long for a Python string");
6869            goto onError;
6870        }
6871    }
6872
6873    res = _PyUnicode_New(sz);
6874    if (res == NULL)
6875        goto onError;
6876
6877    /* Catenate everything. */
6878    res_p = PyUnicode_AS_UNICODE(res);
6879    for (i = 0; i < seqlen; ++i) {
6880        Py_ssize_t itemlen;
6881        item = items[i];
6882        itemlen = PyUnicode_GET_SIZE(item);
6883        /* Copy item, and maybe the separator. */
6884        if (i) {
6885            Py_UNICODE_COPY(res_p, sep, seplen);
6886            res_p += seplen;
6887        }
6888        Py_UNICODE_COPY(res_p, PyUnicode_AS_UNICODE(item), itemlen);
6889        res_p += itemlen;
6890    }
6891
6892  Done:
6893    Py_DECREF(fseq);
6894    return (PyObject *)res;
6895
6896  onError:
6897    Py_DECREF(fseq);
6898    Py_XDECREF(res);
6899    return NULL;
6900}
6901
6902static PyUnicodeObject *
6903pad(PyUnicodeObject *self,
6904    Py_ssize_t left,
6905    Py_ssize_t right,
6906    Py_UNICODE fill)
6907{
6908    PyUnicodeObject *u;
6909
6910    if (left < 0)
6911        left = 0;
6912    if (right < 0)
6913        right = 0;
6914
6915    if (left == 0 && right == 0 && PyUnicode_CheckExact(self)) {
6916        Py_INCREF(self);
6917        return self;
6918    }
6919
6920    if (left > PY_SSIZE_T_MAX - self->length ||
6921        right > PY_SSIZE_T_MAX - (left + self->length)) {
6922        PyErr_SetString(PyExc_OverflowError, "padded string is too long");
6923        return NULL;
6924    }
6925    u = _PyUnicode_New(left + self->length + right);
6926    if (u) {
6927        if (left)
6928            Py_UNICODE_FILL(u->str, fill, left);
6929        Py_UNICODE_COPY(u->str + left, self->str, self->length);
6930        if (right)
6931            Py_UNICODE_FILL(u->str + left + self->length, fill, right);
6932    }
6933
6934    return u;
6935}
6936
6937PyObject *
6938PyUnicode_Splitlines(PyObject *string, int keepends)
6939{
6940    PyObject *list;
6941
6942    string = PyUnicode_FromObject(string);
6943    if (string == NULL)
6944        return NULL;
6945
6946    list = stringlib_splitlines(
6947        (PyObject*) string, PyUnicode_AS_UNICODE(string),
6948        PyUnicode_GET_SIZE(string), keepends);
6949
6950    Py_DECREF(string);
6951    return list;
6952}
6953
6954static PyObject *
6955split(PyUnicodeObject *self,
6956      PyUnicodeObject *substring,
6957      Py_ssize_t maxcount)
6958{
6959    if (maxcount < 0)
6960        maxcount = PY_SSIZE_T_MAX;
6961
6962    if (substring == NULL)
6963        return stringlib_split_whitespace(
6964            (PyObject*) self,  self->str, self->length, maxcount
6965            );
6966
6967    return stringlib_split(
6968        (PyObject*) self,  self->str, self->length,
6969        substring->str, substring->length,
6970        maxcount
6971        );
6972}
6973
6974static PyObject *
6975rsplit(PyUnicodeObject *self,
6976       PyUnicodeObject *substring,
6977       Py_ssize_t maxcount)
6978{
6979    if (maxcount < 0)
6980        maxcount = PY_SSIZE_T_MAX;
6981
6982    if (substring == NULL)
6983        return stringlib_rsplit_whitespace(
6984            (PyObject*) self,  self->str, self->length, maxcount
6985            );
6986
6987    return stringlib_rsplit(
6988        (PyObject*) self,  self->str, self->length,
6989        substring->str, substring->length,
6990        maxcount
6991        );
6992}
6993
6994static PyObject *
6995replace(PyUnicodeObject *self,
6996        PyUnicodeObject *str1,
6997        PyUnicodeObject *str2,
6998        Py_ssize_t maxcount)
6999{
7000    PyUnicodeObject *u;
7001
7002    if (maxcount < 0)
7003        maxcount = PY_SSIZE_T_MAX;
7004    else if (maxcount == 0 || self->length == 0)
7005        goto nothing;
7006
7007    if (str1->length == str2->length) {
7008        Py_ssize_t i;
7009        /* same length */
7010        if (str1->length == 0)
7011            goto nothing;
7012        if (str1->length == 1) {
7013            /* replace characters */
7014            Py_UNICODE u1, u2;
7015            if (!findchar(self->str, self->length, str1->str[0]))
7016                goto nothing;
7017            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7018            if (!u)
7019                return NULL;
7020            Py_UNICODE_COPY(u->str, self->str, self->length);
7021            u1 = str1->str[0];
7022            u2 = str2->str[0];
7023            for (i = 0; i < u->length; i++)
7024                if (u->str[i] == u1) {
7025                    if (--maxcount < 0)
7026                        break;
7027                    u->str[i] = u2;
7028                }
7029        } else {
7030            i = stringlib_find(
7031                self->str, self->length, str1->str, str1->length, 0
7032                );
7033            if (i < 0)
7034                goto nothing;
7035            u = (PyUnicodeObject*) PyUnicode_FromUnicode(NULL, self->length);
7036            if (!u)
7037                return NULL;
7038            Py_UNICODE_COPY(u->str, self->str, self->length);
7039
7040            /* change everything in-place, starting with this one */
7041            Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7042            i += str1->length;
7043
7044            while ( --maxcount > 0) {
7045                i = stringlib_find(self->str+i, self->length-i,
7046                                   str1->str, str1->length,
7047                                   i);
7048                if (i == -1)
7049                    break;
7050                Py_UNICODE_COPY(u->str+i, str2->str, str2->length);
7051                i += str1->length;
7052            }
7053        }
7054    } else {
7055
7056        Py_ssize_t n, i, j;
7057        Py_ssize_t product, new_size, delta;
7058        Py_UNICODE *p;
7059
7060        /* replace strings */
7061        n = stringlib_count(self->str, self->length, str1->str, str1->length,
7062                            maxcount);
7063        if (n == 0)
7064            goto nothing;
7065        /* new_size = self->length + n * (str2->length - str1->length)); */
7066        delta = (str2->length - str1->length);
7067        if (delta == 0) {
7068            new_size = self->length;
7069        } else {
7070            product = n * (str2->length - str1->length);
7071            if ((product / (str2->length - str1->length)) != n) {
7072                PyErr_SetString(PyExc_OverflowError,
7073                                "replace string is too long");
7074                return NULL;
7075            }
7076            new_size = self->length + product;
7077            if (new_size < 0) {
7078                PyErr_SetString(PyExc_OverflowError,
7079                                "replace string is too long");
7080                return NULL;
7081            }
7082        }
7083        u = _PyUnicode_New(new_size);
7084        if (!u)
7085            return NULL;
7086        i = 0;
7087        p = u->str;
7088        if (str1->length > 0) {
7089            while (n-- > 0) {
7090                /* look for next match */
7091                j = stringlib_find(self->str+i, self->length-i,
7092                                   str1->str, str1->length,
7093                                   i);
7094                if (j == -1)
7095                    break;
7096                else if (j > i) {
7097                    /* copy unchanged part [i:j] */
7098                    Py_UNICODE_COPY(p, self->str+i, j-i);
7099                    p += j - i;
7100                }
7101                /* copy substitution string */
7102                if (str2->length > 0) {
7103                    Py_UNICODE_COPY(p, str2->str, str2->length);
7104                    p += str2->length;
7105                }
7106                i = j + str1->length;
7107            }
7108            if (i < self->length)
7109                /* copy tail [i:] */
7110                Py_UNICODE_COPY(p, self->str+i, self->length-i);
7111        } else {
7112            /* interleave */
7113            while (n > 0) {
7114                Py_UNICODE_COPY(p, str2->str, str2->length);
7115                p += str2->length;
7116                if (--n <= 0)
7117                    break;
7118                *p++ = self->str[i++];
7119            }
7120            Py_UNICODE_COPY(p, self->str+i, self->length-i);
7121        }
7122    }
7123    return (PyObject *) u;
7124
7125  nothing:
7126    /* nothing to replace; return original string (when possible) */
7127    if (PyUnicode_CheckExact(self)) {
7128        Py_INCREF(self);
7129        return (PyObject *) self;
7130    }
7131    return PyUnicode_FromUnicode(self->str, self->length);
7132}
7133
7134/* --- Unicode Object Methods --------------------------------------------- */
7135
7136PyDoc_STRVAR(title__doc__,
7137             "S.title() -> str\n\
7138\n\
7139Return a titlecased version of S, i.e. words start with title case\n\
7140characters, all remaining cased characters have lower case.");
7141
7142static PyObject*
7143unicode_title(PyUnicodeObject *self)
7144{
7145    return fixup(self, fixtitle);
7146}
7147
7148PyDoc_STRVAR(capitalize__doc__,
7149             "S.capitalize() -> str\n\
7150\n\
7151Return a capitalized version of S, i.e. make the first character\n\
7152have upper case and the rest lower case.");
7153
7154static PyObject*
7155unicode_capitalize(PyUnicodeObject *self)
7156{
7157    return fixup(self, fixcapitalize);
7158}
7159
#if 0
/* Compiled out: kept for reference only. */
PyDoc_STRVAR(capwords__doc__,
             "S.capwords() -> str\n\
\n\
Apply .capitalize() to all words in S and return the result with\n\
normalized whitespace (all whitespace strings are replaced by ' ').");

static PyObject*
unicode_capwords(PyUnicodeObject *self)
{
    PyObject *words;
    PyObject *joined = NULL;
    Py_ssize_t idx;

    /* Split into words */
    words = split(self, NULL, -1);
    if (words == NULL)
        return NULL;

    /* Capitalize each word, swapping it into the list in place */
    for (idx = 0; idx < PyList_GET_SIZE(words); idx++) {
        PyObject *capitalized;

        capitalized = fixup((PyUnicodeObject *)PyList_GET_ITEM(words, idx),
                            fixcapitalize);
        if (capitalized == NULL)
            goto done;
        Py_DECREF(PyList_GET_ITEM(words, idx));
        PyList_SET_ITEM(words, idx, capitalized);
    }

    /* Join the words to form a new string */
    joined = PyUnicode_Join(NULL, words);

  done:
    /* joined is still NULL here if capitalizing a word failed */
    Py_DECREF(words);
    return (PyObject *)joined;
}
#endif
7197
7198/* Argument converter.  Coerces to a single unicode character */
7199
7200static int
7201convert_uc(PyObject *obj, void *addr)
7202{
7203    Py_UNICODE *fillcharloc = (Py_UNICODE *)addr;
7204    PyObject *uniobj;
7205    Py_UNICODE *unistr;
7206
7207    uniobj = PyUnicode_FromObject(obj);
7208    if (uniobj == NULL) {
7209        PyErr_SetString(PyExc_TypeError,
7210                        "The fill character cannot be converted to Unicode");
7211        return 0;
7212    }
7213    if (PyUnicode_GET_SIZE(uniobj) != 1) {
7214        PyErr_SetString(PyExc_TypeError,
7215                        "The fill character must be exactly one character long");
7216        Py_DECREF(uniobj);
7217        return 0;
7218    }
7219    unistr = PyUnicode_AS_UNICODE(uniobj);
7220    *fillcharloc = unistr[0];
7221    Py_DECREF(uniobj);
7222    return 1;
7223}
7224
7225PyDoc_STRVAR(center__doc__,
7226             "S.center(width[, fillchar]) -> str\n\
7227\n\
7228Return S centered in a string of length width. Padding is\n\
7229done using the specified fill character (default is a space)");
7230
7231static PyObject *
7232unicode_center(PyUnicodeObject *self, PyObject *args)
7233{
7234    Py_ssize_t marg, left;
7235    Py_ssize_t width;
7236    Py_UNICODE fillchar = ' ';
7237
7238    if (!PyArg_ParseTuple(args, "n|O&:center", &width, convert_uc, &fillchar))
7239        return NULL;
7240
7241    if (self->length >= width && PyUnicode_CheckExact(self)) {
7242        Py_INCREF(self);
7243        return (PyObject*) self;
7244    }
7245
7246    marg = width - self->length;
7247    left = marg / 2 + (marg & width & 1);
7248
7249    return (PyObject*) pad(self, left, marg - left, fillchar);
7250}
7251
7252#if 0
7253
7254/* This code should go into some future Unicode collation support
7255   module. The basic comparison should compare ordinals on a naive
7256   basis (this is what Java does and thus Jython too). */
7257
7258/* speedy UTF-16 code point order comparison */
7259/* gleaned from: */
7260/* http://www-4.ibm.com/software/developer/library/utf16.html?dwzone=unicode */
7261
7262static short utf16Fixup[32] =
7263{
7264    0, 0, 0, 0, 0, 0, 0, 0,
7265    0, 0, 0, 0, 0, 0, 0, 0,
7266    0, 0, 0, 0, 0, 0, 0, 0,
7267    0, 0, 0, 0x2000, -0x800, -0x800, -0x800, -0x800
7268};
7269
7270static int
7271unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7272{
7273    Py_ssize_t len1, len2;
7274
7275    Py_UNICODE *s1 = str1->str;
7276    Py_UNICODE *s2 = str2->str;
7277
7278    len1 = str1->length;
7279    len2 = str2->length;
7280
7281    while (len1 > 0 && len2 > 0) {
7282        Py_UNICODE c1, c2;
7283
7284        c1 = *s1++;
7285        c2 = *s2++;
7286
7287        if (c1 > (1<<11) * 26)
7288            c1 += utf16Fixup[c1>>11];
7289        if (c2 > (1<<11) * 26)
7290            c2 += utf16Fixup[c2>>11];
7291        /* now c1 and c2 are in UTF-32-compatible order */
7292
7293        if (c1 != c2)
7294            return (c1 < c2) ? -1 : 1;
7295
7296        len1--; len2--;
7297    }
7298
7299    return (len1 < len2) ? -1 : (len1 != len2);
7300}
7301
7302#else
7303
7304static int
7305unicode_compare(PyUnicodeObject *str1, PyUnicodeObject *str2)
7306{
7307    register Py_ssize_t len1, len2;
7308
7309    Py_UNICODE *s1 = str1->str;
7310    Py_UNICODE *s2 = str2->str;
7311
7312    len1 = str1->length;
7313    len2 = str2->length;
7314
7315    while (len1 > 0 && len2 > 0) {
7316        Py_UNICODE c1, c2;
7317
7318        c1 = *s1++;
7319        c2 = *s2++;
7320
7321        if (c1 != c2)
7322            return (c1 < c2) ? -1 : 1;
7323
7324        len1--; len2--;
7325    }
7326
7327    return (len1 < len2) ? -1 : (len1 != len2);
7328}
7329
7330#endif
7331
7332int
7333PyUnicode_Compare(PyObject *left, PyObject *right)
7334{
7335    if (PyUnicode_Check(left) && PyUnicode_Check(right))
7336        return unicode_compare((PyUnicodeObject *)left,
7337                               (PyUnicodeObject *)right);
7338    PyErr_Format(PyExc_TypeError,
7339                 "Can't compare %.100s and %.100s",
7340                 left->ob_type->tp_name,
7341                 right->ob_type->tp_name);
7342    return -1;
7343}
7344
7345int
7346PyUnicode_CompareWithASCIIString(PyObject* uni, const char* str)
7347{
7348    int i;
7349    Py_UNICODE *id;
7350    assert(PyUnicode_Check(uni));
7351    id = PyUnicode_AS_UNICODE(uni);
7352    /* Compare Unicode string and source character set string */
7353    for (i = 0; id[i] && str[i]; i++)
7354        if (id[i] != str[i])
7355            return ((int)id[i] < (int)str[i]) ? -1 : 1;
7356    /* This check keeps Python strings that end in '\0' from comparing equal
7357     to C strings identical up to that point. */
7358    if (PyUnicode_GET_SIZE(uni) != i || id[i])
7359        return 1; /* uni is longer */
7360    if (str[i])
7361        return -1; /* str is longer */
7362    return 0;
7363}
7364
7365
7366#define TEST_COND(cond)                         \
7367    ((cond) ? Py_True : Py_False)
7368
7369PyObject *
7370PyUnicode_RichCompare(PyObject *left, PyObject *right, int op)
7371{
7372    int result;
7373
7374    if (PyUnicode_Check(left) && PyUnicode_Check(right)) {
7375        PyObject *v;
7376        if (PyUnicode_GET_SIZE(left) != PyUnicode_GET_SIZE(right)) {
7377            if (op == Py_EQ) {
7378                Py_INCREF(Py_False);
7379                return Py_False;
7380            }
7381            if (op == Py_NE) {
7382                Py_INCREF(Py_True);
7383                return Py_True;
7384            }
7385        }
7386        if (left == right)
7387            result = 0;
7388        else
7389            result = unicode_compare((PyUnicodeObject *)left,
7390                                     (PyUnicodeObject *)right);
7391
7392        /* Convert the return value to a Boolean */
7393        switch (op) {
7394        case Py_EQ:
7395            v = TEST_COND(result == 0);
7396            break;
7397        case Py_NE:
7398            v = TEST_COND(result != 0);
7399            break;
7400        case Py_LE:
7401            v = TEST_COND(result <= 0);
7402            break;
7403        case Py_GE:
7404            v = TEST_COND(result >= 0);
7405            break;
7406        case Py_LT:
7407            v = TEST_COND(result == -1);
7408            break;
7409        case Py_GT:
7410            v = TEST_COND(result == 1);
7411            break;
7412        default:
7413            PyErr_BadArgument();
7414            return NULL;
7415        }
7416        Py_INCREF(v);
7417        return v;
7418    }
7419
7420    Py_RETURN_NOTIMPLEMENTED;
7421}
7422
7423int
7424PyUnicode_Contains(PyObject *container, PyObject *element)
7425{
7426    PyObject *str, *sub;
7427    int result;
7428
7429    /* Coerce the two arguments */
7430    sub = PyUnicode_FromObject(element);
7431    if (!sub) {
7432        PyErr_Format(PyExc_TypeError,
7433                     "'in <string>' requires string as left operand, not %s",
7434                     element->ob_type->tp_name);
7435        return -1;
7436    }
7437
7438    str = PyUnicode_FromObject(container);
7439    if (!str) {
7440        Py_DECREF(sub);
7441        return -1;
7442    }
7443
7444    result = stringlib_contains_obj(str, sub);
7445
7446    Py_DECREF(str);
7447    Py_DECREF(sub);
7448
7449    return result;
7450}
7451
7452/* Concat to string or Unicode object giving a new Unicode object. */
7453
7454PyObject *
7455PyUnicode_Concat(PyObject *left, PyObject *right)
7456{
7457    PyUnicodeObject *u = NULL, *v = NULL, *w;
7458
7459    /* Coerce the two arguments */
7460    u = (PyUnicodeObject *)PyUnicode_FromObject(left);
7461    if (u == NULL)
7462        goto onError;
7463    v = (PyUnicodeObject *)PyUnicode_FromObject(right);
7464    if (v == NULL)
7465        goto onError;
7466
7467    /* Shortcuts */
7468    if (v == unicode_empty) {
7469        Py_DECREF(v);
7470        return (PyObject *)u;
7471    }
7472    if (u == unicode_empty) {
7473        Py_DECREF(u);
7474        return (PyObject *)v;
7475    }
7476
7477    /* Concat the two Unicode strings */
7478    w = _PyUnicode_New(u->length + v->length);
7479    if (w == NULL)
7480        goto onError;
7481    Py_UNICODE_COPY(w->str, u->str, u->length);
7482    Py_UNICODE_COPY(w->str + u->length, v->str, v->length);
7483
7484    Py_DECREF(u);
7485    Py_DECREF(v);
7486    return (PyObject *)w;
7487
7488  onError:
7489    Py_XDECREF(u);
7490    Py_XDECREF(v);
7491    return NULL;
7492}
7493
7494void
7495PyUnicode_Append(PyObject **pleft, PyObject *right)
7496{
7497    PyObject *new;
7498    if (*pleft == NULL)
7499        return;
7500    if (right == NULL || !PyUnicode_Check(*pleft)) {
7501        Py_DECREF(*pleft);
7502        *pleft = NULL;
7503        return;
7504    }
7505    new = PyUnicode_Concat(*pleft, right);
7506    Py_DECREF(*pleft);
7507    *pleft = new;
7508}
7509
7510void
7511PyUnicode_AppendAndDel(PyObject **pleft, PyObject *right)
7512{
7513    PyUnicode_Append(pleft, right);
7514    Py_XDECREF(right);
7515}
7516
7517PyDoc_STRVAR(count__doc__,
7518             "S.count(sub[, start[, end]]) -> int\n\
7519\n\
7520Return the number of non-overlapping occurrences of substring sub in\n\
7521string S[start:end].  Optional arguments start and end are\n\
7522interpreted as in slice notation.");
7523
7524static PyObject *
7525unicode_count(PyUnicodeObject *self, PyObject *args)
7526{
7527    PyUnicodeObject *substring;
7528    Py_ssize_t start = 0;
7529    Py_ssize_t end = PY_SSIZE_T_MAX;
7530    PyObject *result;
7531
7532    if (!stringlib_parse_args_finds_unicode("count", args, &substring,
7533                                            &start, &end))
7534        return NULL;
7535
7536    ADJUST_INDICES(start, end, self->length);
7537    result = PyLong_FromSsize_t(
7538        stringlib_count(self->str + start, end - start,
7539                        substring->str, substring->length,
7540                        PY_SSIZE_T_MAX)
7541        );
7542
7543    Py_DECREF(substring);
7544
7545    return result;
7546}
7547
7548PyDoc_STRVAR(encode__doc__,
7549             "S.encode(encoding='utf-8', errors='strict') -> bytes\n\
7550\n\
7551Encode S using the codec registered for encoding. Default encoding\n\
7552is 'utf-8'. errors may be given to set a different error\n\
7553handling scheme. Default is 'strict' meaning that encoding errors raise\n\
7554a UnicodeEncodeError. Other possible values are 'ignore', 'replace' and\n\
7555'xmlcharrefreplace' as well as any other name registered with\n\
7556codecs.register_error that can handle UnicodeEncodeErrors.");
7557
7558static PyObject *
7559unicode_encode(PyUnicodeObject *self, PyObject *args, PyObject *kwargs)
7560{
7561    static char *kwlist[] = {"encoding", "errors", 0};
7562    char *encoding = NULL;
7563    char *errors = NULL;
7564
7565    if (!PyArg_ParseTupleAndKeywords(args, kwargs, "|ss:encode",
7566                                     kwlist, &encoding, &errors))
7567        return NULL;
7568    return PyUnicode_AsEncodedString((PyObject *)self, encoding, errors);
7569}
7570
7571PyDoc_STRVAR(expandtabs__doc__,
7572             "S.expandtabs([tabsize]) -> str\n\
7573\n\
7574Return a copy of S where all tab characters are expanded using spaces.\n\
7575If tabsize is not given, a tab size of 8 characters is assumed.");
7576
7577static PyObject*
7578unicode_expandtabs(PyUnicodeObject *self, PyObject *args)
7579{
7580    Py_UNICODE *e;
7581    Py_UNICODE *p;
7582    Py_UNICODE *q;
7583    Py_UNICODE *qe;
7584    Py_ssize_t i, j, incr;
7585    PyUnicodeObject *u;
7586    int tabsize = 8;
7587
7588    if (!PyArg_ParseTuple(args, "|i:expandtabs", &tabsize))
7589        return NULL;
7590
7591    /* First pass: determine size of output string */
7592    i = 0; /* chars up to and including most recent \n or \r */
7593    j = 0; /* chars since most recent \n or \r (use in tab calculations) */
7594    e = self->str + self->length; /* end of input */
7595    for (p = self->str; p < e; p++)
7596        if (*p == '\t') {
7597            if (tabsize > 0) {
7598                incr = tabsize - (j % tabsize); /* cannot overflow */
7599                if (j > PY_SSIZE_T_MAX - incr)
7600                    goto overflow1;
7601                j += incr;
7602            }
7603        }
7604        else {
7605            if (j > PY_SSIZE_T_MAX - 1)
7606                goto overflow1;
7607            j++;
7608            if (*p == '\n' || *p == '\r') {
7609                if (i > PY_SSIZE_T_MAX - j)
7610                    goto overflow1;
7611                i += j;
7612                j = 0;
7613            }
7614        }
7615
7616    if (i > PY_SSIZE_T_MAX - j)
7617        goto overflow1;
7618
7619    /* Second pass: create output string and fill it */
7620    u = _PyUnicode_New(i + j);
7621    if (!u)
7622        return NULL;
7623
7624    j = 0; /* same as in first pass */
7625    q = u->str; /* next output char */
7626    qe = u->str + u->length; /* end of output */
7627
7628    for (p = self->str; p < e; p++)
7629        if (*p == '\t') {
7630            if (tabsize > 0) {
7631                i = tabsize - (j % tabsize);
7632                j += i;
7633                while (i--) {
7634                    if (q >= qe)
7635                        goto overflow2;
7636                    *q++ = ' ';
7637                }
7638            }
7639        }
7640        else {
7641            if (q >= qe)
7642                goto overflow2;
7643            *q++ = *p;
7644            j++;
7645            if (*p == '\n' || *p == '\r')
7646                j = 0;
7647        }
7648
7649    return (PyObject*) u;
7650
7651  overflow2:
7652    Py_DECREF(u);
7653  overflow1:
7654    PyErr_SetString(PyExc_OverflowError, "new string is too long");
7655    return NULL;
7656}
7657
7658PyDoc_STRVAR(find__doc__,
7659             "S.find(sub[, start[, end]]) -> int\n\
7660\n\
7661Return the lowest index in S where substring sub is found,\n\
7662such that sub is contained within S[start:end].  Optional\n\
7663arguments start and end are interpreted as in slice notation.\n\
7664\n\
7665Return -1 on failure.");
7666
7667static PyObject *
7668unicode_find(PyUnicodeObject *self, PyObject *args)
7669{
7670    PyUnicodeObject *substring;
7671    Py_ssize_t start;
7672    Py_ssize_t end;
7673    Py_ssize_t result;
7674
7675    if (!stringlib_parse_args_finds_unicode("find", args, &substring,
7676                                            &start, &end))
7677        return NULL;
7678
7679    result = stringlib_find_slice(
7680        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7681        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7682        start, end
7683        );
7684
7685    Py_DECREF(substring);
7686
7687    return PyLong_FromSsize_t(result);
7688}
7689
7690static PyObject *
7691unicode_getitem(PyUnicodeObject *self, Py_ssize_t index)
7692{
7693    if (index < 0 || index >= self->length) {
7694        PyErr_SetString(PyExc_IndexError, "string index out of range");
7695        return NULL;
7696    }
7697
7698    return (PyObject*) PyUnicode_FromUnicode(&self->str[index], 1);
7699}
7700
7701/* Believe it or not, this produces the same value for ASCII strings
7702   as string_hash(). */
7703static Py_hash_t
7704unicode_hash(PyUnicodeObject *self)
7705{
7706    Py_ssize_t len;
7707    Py_UNICODE *p;
7708    Py_hash_t x;
7709
7710    if (self->hash != -1)
7711        return self->hash;
7712    len = Py_SIZE(self);
7713    p = self->str;
7714    x = *p << 7;
7715    while (--len >= 0)
7716        x = (1000003*x) ^ *p++;
7717    x ^= Py_SIZE(self);
7718    if (x == -1)
7719        x = -2;
7720    self->hash = x;
7721    return x;
7722}
7723
7724PyDoc_STRVAR(index__doc__,
7725             "S.index(sub[, start[, end]]) -> int\n\
7726\n\
7727Like S.find() but raise ValueError when the substring is not found.");
7728
7729static PyObject *
7730unicode_index(PyUnicodeObject *self, PyObject *args)
7731{
7732    Py_ssize_t result;
7733    PyUnicodeObject *substring;
7734    Py_ssize_t start;
7735    Py_ssize_t end;
7736
7737    if (!stringlib_parse_args_finds_unicode("index", args, &substring,
7738                                            &start, &end))
7739        return NULL;
7740
7741    result = stringlib_find_slice(
7742        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
7743        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
7744        start, end
7745        );
7746
7747    Py_DECREF(substring);
7748
7749    if (result < 0) {
7750        PyErr_SetString(PyExc_ValueError, "substring not found");
7751        return NULL;
7752    }
7753
7754    return PyLong_FromSsize_t(result);
7755}
7756
7757PyDoc_STRVAR(islower__doc__,
7758             "S.islower() -> bool\n\
7759\n\
7760Return True if all cased characters in S are lowercase and there is\n\
7761at least one cased character in S, False otherwise.");
7762
7763static PyObject*
7764unicode_islower(PyUnicodeObject *self)
7765{
7766    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7767    register const Py_UNICODE *e;
7768    int cased;
7769
7770    /* Shortcut for single character strings */
7771    if (PyUnicode_GET_SIZE(self) == 1)
7772        return PyBool_FromLong(Py_UNICODE_ISLOWER(*p));
7773
7774    /* Special case for empty strings */
7775    if (PyUnicode_GET_SIZE(self) == 0)
7776        return PyBool_FromLong(0);
7777
7778    e = p + PyUnicode_GET_SIZE(self);
7779    cased = 0;
7780    for (; p < e; p++) {
7781        register const Py_UNICODE ch = *p;
7782
7783        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch))
7784            return PyBool_FromLong(0);
7785        else if (!cased && Py_UNICODE_ISLOWER(ch))
7786            cased = 1;
7787    }
7788    return PyBool_FromLong(cased);
7789}
7790
7791PyDoc_STRVAR(isupper__doc__,
7792             "S.isupper() -> bool\n\
7793\n\
7794Return True if all cased characters in S are uppercase and there is\n\
7795at least one cased character in S, False otherwise.");
7796
7797static PyObject*
7798unicode_isupper(PyUnicodeObject *self)
7799{
7800    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7801    register const Py_UNICODE *e;
7802    int cased;
7803
7804    /* Shortcut for single character strings */
7805    if (PyUnicode_GET_SIZE(self) == 1)
7806        return PyBool_FromLong(Py_UNICODE_ISUPPER(*p) != 0);
7807
7808    /* Special case for empty strings */
7809    if (PyUnicode_GET_SIZE(self) == 0)
7810        return PyBool_FromLong(0);
7811
7812    e = p + PyUnicode_GET_SIZE(self);
7813    cased = 0;
7814    for (; p < e; p++) {
7815        register const Py_UNICODE ch = *p;
7816
7817        if (Py_UNICODE_ISLOWER(ch) || Py_UNICODE_ISTITLE(ch))
7818            return PyBool_FromLong(0);
7819        else if (!cased && Py_UNICODE_ISUPPER(ch))
7820            cased = 1;
7821    }
7822    return PyBool_FromLong(cased);
7823}
7824
7825PyDoc_STRVAR(istitle__doc__,
7826             "S.istitle() -> bool\n\
7827\n\
7828Return True if S is a titlecased string and there is at least one\n\
7829character in S, i.e. upper- and titlecase characters may only\n\
7830follow uncased characters and lowercase characters only cased ones.\n\
7831Return False otherwise.");
7832
7833static PyObject*
7834unicode_istitle(PyUnicodeObject *self)
7835{
7836    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7837    register const Py_UNICODE *e;
7838    int cased, previous_is_cased;
7839
7840    /* Shortcut for single character strings */
7841    if (PyUnicode_GET_SIZE(self) == 1)
7842        return PyBool_FromLong((Py_UNICODE_ISTITLE(*p) != 0) ||
7843                               (Py_UNICODE_ISUPPER(*p) != 0));
7844
7845    /* Special case for empty strings */
7846    if (PyUnicode_GET_SIZE(self) == 0)
7847        return PyBool_FromLong(0);
7848
7849    e = p + PyUnicode_GET_SIZE(self);
7850    cased = 0;
7851    previous_is_cased = 0;
7852    for (; p < e; p++) {
7853        register const Py_UNICODE ch = *p;
7854
7855        if (Py_UNICODE_ISUPPER(ch) || Py_UNICODE_ISTITLE(ch)) {
7856            if (previous_is_cased)
7857                return PyBool_FromLong(0);
7858            previous_is_cased = 1;
7859            cased = 1;
7860        }
7861        else if (Py_UNICODE_ISLOWER(ch)) {
7862            if (!previous_is_cased)
7863                return PyBool_FromLong(0);
7864            previous_is_cased = 1;
7865            cased = 1;
7866        }
7867        else
7868            previous_is_cased = 0;
7869    }
7870    return PyBool_FromLong(cased);
7871}
7872
7873PyDoc_STRVAR(isspace__doc__,
7874             "S.isspace() -> bool\n\
7875\n\
7876Return True if all characters in S are whitespace\n\
7877and there is at least one character in S, False otherwise.");
7878
7879static PyObject*
7880unicode_isspace(PyUnicodeObject *self)
7881{
7882    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7883    register const Py_UNICODE *e;
7884
7885    /* Shortcut for single character strings */
7886    if (PyUnicode_GET_SIZE(self) == 1 &&
7887        Py_UNICODE_ISSPACE(*p))
7888        return PyBool_FromLong(1);
7889
7890    /* Special case for empty strings */
7891    if (PyUnicode_GET_SIZE(self) == 0)
7892        return PyBool_FromLong(0);
7893
7894    e = p + PyUnicode_GET_SIZE(self);
7895    for (; p < e; p++) {
7896        if (!Py_UNICODE_ISSPACE(*p))
7897            return PyBool_FromLong(0);
7898    }
7899    return PyBool_FromLong(1);
7900}
7901
7902PyDoc_STRVAR(isalpha__doc__,
7903             "S.isalpha() -> bool\n\
7904\n\
7905Return True if all characters in S are alphabetic\n\
7906and there is at least one character in S, False otherwise.");
7907
7908static PyObject*
7909unicode_isalpha(PyUnicodeObject *self)
7910{
7911    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7912    register const Py_UNICODE *e;
7913
7914    /* Shortcut for single character strings */
7915    if (PyUnicode_GET_SIZE(self) == 1 &&
7916        Py_UNICODE_ISALPHA(*p))
7917        return PyBool_FromLong(1);
7918
7919    /* Special case for empty strings */
7920    if (PyUnicode_GET_SIZE(self) == 0)
7921        return PyBool_FromLong(0);
7922
7923    e = p + PyUnicode_GET_SIZE(self);
7924    for (; p < e; p++) {
7925        if (!Py_UNICODE_ISALPHA(*p))
7926            return PyBool_FromLong(0);
7927    }
7928    return PyBool_FromLong(1);
7929}
7930
7931PyDoc_STRVAR(isalnum__doc__,
7932             "S.isalnum() -> bool\n\
7933\n\
7934Return True if all characters in S are alphanumeric\n\
7935and there is at least one character in S, False otherwise.");
7936
7937static PyObject*
7938unicode_isalnum(PyUnicodeObject *self)
7939{
7940    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7941    register const Py_UNICODE *e;
7942
7943    /* Shortcut for single character strings */
7944    if (PyUnicode_GET_SIZE(self) == 1 &&
7945        Py_UNICODE_ISALNUM(*p))
7946        return PyBool_FromLong(1);
7947
7948    /* Special case for empty strings */
7949    if (PyUnicode_GET_SIZE(self) == 0)
7950        return PyBool_FromLong(0);
7951
7952    e = p + PyUnicode_GET_SIZE(self);
7953    for (; p < e; p++) {
7954        if (!Py_UNICODE_ISALNUM(*p))
7955            return PyBool_FromLong(0);
7956    }
7957    return PyBool_FromLong(1);
7958}
7959
7960PyDoc_STRVAR(isdecimal__doc__,
7961             "S.isdecimal() -> bool\n\
7962\n\
7963Return True if there are only decimal characters in S,\n\
7964False otherwise.");
7965
7966static PyObject*
7967unicode_isdecimal(PyUnicodeObject *self)
7968{
7969    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7970    register const Py_UNICODE *e;
7971
7972    /* Shortcut for single character strings */
7973    if (PyUnicode_GET_SIZE(self) == 1 &&
7974        Py_UNICODE_ISDECIMAL(*p))
7975        return PyBool_FromLong(1);
7976
7977    /* Special case for empty strings */
7978    if (PyUnicode_GET_SIZE(self) == 0)
7979        return PyBool_FromLong(0);
7980
7981    e = p + PyUnicode_GET_SIZE(self);
7982    for (; p < e; p++) {
7983        if (!Py_UNICODE_ISDECIMAL(*p))
7984            return PyBool_FromLong(0);
7985    }
7986    return PyBool_FromLong(1);
7987}
7988
7989PyDoc_STRVAR(isdigit__doc__,
7990             "S.isdigit() -> bool\n\
7991\n\
7992Return True if all characters in S are digits\n\
7993and there is at least one character in S, False otherwise.");
7994
7995static PyObject*
7996unicode_isdigit(PyUnicodeObject *self)
7997{
7998    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
7999    register const Py_UNICODE *e;
8000
8001    /* Shortcut for single character strings */
8002    if (PyUnicode_GET_SIZE(self) == 1 &&
8003        Py_UNICODE_ISDIGIT(*p))
8004        return PyBool_FromLong(1);
8005
8006    /* Special case for empty strings */
8007    if (PyUnicode_GET_SIZE(self) == 0)
8008        return PyBool_FromLong(0);
8009
8010    e = p + PyUnicode_GET_SIZE(self);
8011    for (; p < e; p++) {
8012        if (!Py_UNICODE_ISDIGIT(*p))
8013            return PyBool_FromLong(0);
8014    }
8015    return PyBool_FromLong(1);
8016}
8017
8018PyDoc_STRVAR(isnumeric__doc__,
8019             "S.isnumeric() -> bool\n\
8020\n\
8021Return True if there are only numeric characters in S,\n\
8022False otherwise.");
8023
8024static PyObject*
8025unicode_isnumeric(PyUnicodeObject *self)
8026{
8027    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8028    register const Py_UNICODE *e;
8029
8030    /* Shortcut for single character strings */
8031    if (PyUnicode_GET_SIZE(self) == 1 &&
8032        Py_UNICODE_ISNUMERIC(*p))
8033        return PyBool_FromLong(1);
8034
8035    /* Special case for empty strings */
8036    if (PyUnicode_GET_SIZE(self) == 0)
8037        return PyBool_FromLong(0);
8038
8039    e = p + PyUnicode_GET_SIZE(self);
8040    for (; p < e; p++) {
8041        if (!Py_UNICODE_ISNUMERIC(*p))
8042            return PyBool_FromLong(0);
8043    }
8044    return PyBool_FromLong(1);
8045}
8046
8047static Py_UCS4
8048decode_ucs4(const Py_UNICODE *s, Py_ssize_t *i, Py_ssize_t size)
8049{
8050    Py_UCS4 ch;
8051    assert(*i < size);
8052    ch = s[(*i)++];
8053#ifndef Py_UNICODE_WIDE
8054    if ((ch & 0xfffffc00) == 0xd800 &&
8055        *i < size
8056        && (s[*i] & 0xFFFFFC00) == 0xDC00)
8057        ch = ((Py_UCS4)ch << 10UL) + (Py_UCS4)(s[(*i)++]) - 0x35fdc00;
8058#endif
8059    return ch;
8060}
8061
8062int
8063PyUnicode_IsIdentifier(PyObject *self)
8064{
8065    Py_ssize_t i = 0, size = PyUnicode_GET_SIZE(self);
8066    Py_UCS4 first;
8067    const Py_UNICODE *p = PyUnicode_AS_UNICODE((PyUnicodeObject*)self);
8068
8069    /* Special case for empty strings */
8070    if (!size)
8071        return 0;
8072
8073    /* PEP 3131 says that the first character must be in
8074       XID_Start and subsequent characters in XID_Continue,
8075       and for the ASCII range, the 2.x rules apply (i.e
8076       start with letters and underscore, continue with
8077       letters, digits, underscore). However, given the current
8078       definition of XID_Start and XID_Continue, it is sufficient
8079       to check just for these, except that _ must be allowed
8080       as starting an identifier.  */
8081    first = decode_ucs4(p, &i, size);
8082    if (!_PyUnicode_IsXidStart(first) && first != 0x5F /* LOW LINE */)
8083        return 0;
8084
8085    while (i < size)
8086        if (!_PyUnicode_IsXidContinue(decode_ucs4(p, &i, size)))
8087            return 0;
8088    return 1;
8089}
8090
8091PyDoc_STRVAR(isidentifier__doc__,
8092             "S.isidentifier() -> bool\n\
8093\n\
8094Return True if S is a valid identifier according\n\
8095to the language definition.");
8096
8097static PyObject*
8098unicode_isidentifier(PyObject *self)
8099{
8100    return PyBool_FromLong(PyUnicode_IsIdentifier(self));
8101}
8102
8103PyDoc_STRVAR(isprintable__doc__,
8104             "S.isprintable() -> bool\n\
8105\n\
8106Return True if all characters in S are considered\n\
8107printable in repr() or S is empty, False otherwise.");
8108
8109static PyObject*
8110unicode_isprintable(PyObject *self)
8111{
8112    register const Py_UNICODE *p = PyUnicode_AS_UNICODE(self);
8113    register const Py_UNICODE *e;
8114
8115    /* Shortcut for single character strings */
8116    if (PyUnicode_GET_SIZE(self) == 1 && Py_UNICODE_ISPRINTABLE(*p)) {
8117        Py_RETURN_TRUE;
8118    }
8119
8120    e = p + PyUnicode_GET_SIZE(self);
8121    for (; p < e; p++) {
8122        if (!Py_UNICODE_ISPRINTABLE(*p)) {
8123            Py_RETURN_FALSE;
8124        }
8125    }
8126    Py_RETURN_TRUE;
8127}
8128
8129PyDoc_STRVAR(join__doc__,
8130             "S.join(iterable) -> str\n\
8131\n\
8132Return a string which is the concatenation of the strings in the\n\
8133iterable.  The separator between elements is S.");
8134
8135static PyObject*
8136unicode_join(PyObject *self, PyObject *data)
8137{
8138    return PyUnicode_Join(self, data);
8139}
8140
8141static Py_ssize_t
8142unicode_length(PyUnicodeObject *self)
8143{
8144    return self->length;
8145}
8146
8147PyDoc_STRVAR(ljust__doc__,
8148             "S.ljust(width[, fillchar]) -> str\n\
8149\n\
8150Return S left-justified in a Unicode string of length width. Padding is\n\
8151done using the specified fill character (default is a space).");
8152
8153static PyObject *
8154unicode_ljust(PyUnicodeObject *self, PyObject *args)
8155{
8156    Py_ssize_t width;
8157    Py_UNICODE fillchar = ' ';
8158
8159    if (!PyArg_ParseTuple(args, "n|O&:ljust", &width, convert_uc, &fillchar))
8160        return NULL;
8161
8162    if (self->length >= width && PyUnicode_CheckExact(self)) {
8163        Py_INCREF(self);
8164        return (PyObject*) self;
8165    }
8166
8167    return (PyObject*) pad(self, 0, width - self->length, fillchar);
8168}
8169
8170PyDoc_STRVAR(lower__doc__,
8171             "S.lower() -> str\n\
8172\n\
8173Return a copy of the string S converted to lowercase.");
8174
8175static PyObject*
8176unicode_lower(PyUnicodeObject *self)
8177{
8178    return fixup(self, fixlower);
8179}
8180
/* Selector values passed as "striptype" to the strip helpers below:
   which end(s) of the string get trimmed. */
#define LEFTSTRIP 0
#define RIGHTSTRIP 1
#define BOTHSTRIP 2

/* Arrays indexed by above */
/* PyArg_ParseTuple format strings, one per strip variant. */
static const char *stripformat[] = {"|O:lstrip", "|O:rstrip", "|O:strip"};

/* Method name for a strip variant: the format string with its leading
   3-character "|O:" prefix skipped. */
#define STRIPNAME(i) (stripformat[i]+3)
8189
8190/* externally visible for str.strip(unicode) */
8191PyObject *
8192_PyUnicode_XStrip(PyUnicodeObject *self, int striptype, PyObject *sepobj)
8193{
8194    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8195    Py_ssize_t len = PyUnicode_GET_SIZE(self);
8196    Py_UNICODE *sep = PyUnicode_AS_UNICODE(sepobj);
8197    Py_ssize_t seplen = PyUnicode_GET_SIZE(sepobj);
8198    Py_ssize_t i, j;
8199
8200    BLOOM_MASK sepmask = make_bloom_mask(sep, seplen);
8201
8202    i = 0;
8203    if (striptype != RIGHTSTRIP) {
8204        while (i < len && BLOOM_MEMBER(sepmask, s[i], sep, seplen)) {
8205            i++;
8206        }
8207    }
8208
8209    j = len;
8210    if (striptype != LEFTSTRIP) {
8211        do {
8212            j--;
8213        } while (j >= i && BLOOM_MEMBER(sepmask, s[j], sep, seplen));
8214        j++;
8215    }
8216
8217    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8218        Py_INCREF(self);
8219        return (PyObject*)self;
8220    }
8221    else
8222        return PyUnicode_FromUnicode(s+i, j-i);
8223}
8224
8225
8226static PyObject *
8227do_strip(PyUnicodeObject *self, int striptype)
8228{
8229    Py_UNICODE *s = PyUnicode_AS_UNICODE(self);
8230    Py_ssize_t len = PyUnicode_GET_SIZE(self), i, j;
8231
8232    i = 0;
8233    if (striptype != RIGHTSTRIP) {
8234        while (i < len && Py_UNICODE_ISSPACE(s[i])) {
8235            i++;
8236        }
8237    }
8238
8239    j = len;
8240    if (striptype != LEFTSTRIP) {
8241        do {
8242            j--;
8243        } while (j >= i && Py_UNICODE_ISSPACE(s[j]));
8244        j++;
8245    }
8246
8247    if (i == 0 && j == len && PyUnicode_CheckExact(self)) {
8248        Py_INCREF(self);
8249        return (PyObject*)self;
8250    }
8251    else
8252        return PyUnicode_FromUnicode(s+i, j-i);
8253}
8254
8255
8256static PyObject *
8257do_argstrip(PyUnicodeObject *self, int striptype, PyObject *args)
8258{
8259    PyObject *sep = NULL;
8260
8261    if (!PyArg_ParseTuple(args, (char *)stripformat[striptype], &sep))
8262        return NULL;
8263
8264    if (sep != NULL && sep != Py_None) {
8265        if (PyUnicode_Check(sep))
8266            return _PyUnicode_XStrip(self, striptype, sep);
8267        else {
8268            PyErr_Format(PyExc_TypeError,
8269                         "%s arg must be None or str",
8270                         STRIPNAME(striptype));
8271            return NULL;
8272        }
8273    }
8274
8275    return do_strip(self, striptype);
8276}
8277
8278
8279PyDoc_STRVAR(strip__doc__,
8280             "S.strip([chars]) -> str\n\
8281\n\
8282Return a copy of the string S with leading and trailing\n\
8283whitespace removed.\n\
8284If chars is given and not None, remove characters in chars instead.");
8285
8286static PyObject *
8287unicode_strip(PyUnicodeObject *self, PyObject *args)
8288{
8289    if (PyTuple_GET_SIZE(args) == 0)
8290        return do_strip(self, BOTHSTRIP); /* Common case */
8291    else
8292        return do_argstrip(self, BOTHSTRIP, args);
8293}
8294
8295
8296PyDoc_STRVAR(lstrip__doc__,
8297             "S.lstrip([chars]) -> str\n\
8298\n\
8299Return a copy of the string S with leading whitespace removed.\n\
8300If chars is given and not None, remove characters in chars instead.");
8301
8302static PyObject *
8303unicode_lstrip(PyUnicodeObject *self, PyObject *args)
8304{
8305    if (PyTuple_GET_SIZE(args) == 0)
8306        return do_strip(self, LEFTSTRIP); /* Common case */
8307    else
8308        return do_argstrip(self, LEFTSTRIP, args);
8309}
8310
8311
8312PyDoc_STRVAR(rstrip__doc__,
8313             "S.rstrip([chars]) -> str\n\
8314\n\
8315Return a copy of the string S with trailing whitespace removed.\n\
8316If chars is given and not None, remove characters in chars instead.");
8317
8318static PyObject *
8319unicode_rstrip(PyUnicodeObject *self, PyObject *args)
8320{
8321    if (PyTuple_GET_SIZE(args) == 0)
8322        return do_strip(self, RIGHTSTRIP); /* Common case */
8323    else
8324        return do_argstrip(self, RIGHTSTRIP, args);
8325}
8326
8327
8328static PyObject*
8329unicode_repeat(PyUnicodeObject *str, Py_ssize_t len)
8330{
8331    PyUnicodeObject *u;
8332    Py_UNICODE *p;
8333    Py_ssize_t nchars;
8334    size_t nbytes;
8335
8336    if (len < 1) {
8337        Py_INCREF(unicode_empty);
8338        return (PyObject *)unicode_empty;
8339    }
8340
8341    if (len == 1 && PyUnicode_CheckExact(str)) {
8342        /* no repeat, return original string */
8343        Py_INCREF(str);
8344        return (PyObject*) str;
8345    }
8346
8347    /* ensure # of chars needed doesn't overflow int and # of bytes
8348     * needed doesn't overflow size_t
8349     */
8350    nchars = len * str->length;
8351    if (nchars / len != str->length) {
8352        PyErr_SetString(PyExc_OverflowError,
8353                        "repeated string is too long");
8354        return NULL;
8355    }
8356    nbytes = (nchars + 1) * sizeof(Py_UNICODE);
8357    if (nbytes / sizeof(Py_UNICODE) != (size_t)(nchars + 1)) {
8358        PyErr_SetString(PyExc_OverflowError,
8359                        "repeated string is too long");
8360        return NULL;
8361    }
8362    u = _PyUnicode_New(nchars);
8363    if (!u)
8364        return NULL;
8365
8366    p = u->str;
8367
8368    if (str->length == 1) {
8369        Py_UNICODE_FILL(p, str->str[0], len);
8370    } else {
8371        Py_ssize_t done = str->length; /* number of characters copied this far */
8372        Py_UNICODE_COPY(p, str->str, str->length);
8373        while (done < nchars) {
8374            Py_ssize_t n = (done <= nchars-done) ? done : nchars-done;
8375            Py_UNICODE_COPY(p+done, p, n);
8376            done += n;
8377        }
8378    }
8379
8380    return (PyObject*) u;
8381}
8382
8383PyObject *
8384PyUnicode_Replace(PyObject *obj,
8385                  PyObject *subobj,
8386                  PyObject *replobj,
8387                  Py_ssize_t maxcount)
8388{
8389    PyObject *self;
8390    PyObject *str1;
8391    PyObject *str2;
8392    PyObject *result;
8393
8394    self = PyUnicode_FromObject(obj);
8395    if (self == NULL)
8396        return NULL;
8397    str1 = PyUnicode_FromObject(subobj);
8398    if (str1 == NULL) {
8399        Py_DECREF(self);
8400        return NULL;
8401    }
8402    str2 = PyUnicode_FromObject(replobj);
8403    if (str2 == NULL) {
8404        Py_DECREF(self);
8405        Py_DECREF(str1);
8406        return NULL;
8407    }
8408    result = replace((PyUnicodeObject *)self,
8409                     (PyUnicodeObject *)str1,
8410                     (PyUnicodeObject *)str2,
8411                     maxcount);
8412    Py_DECREF(self);
8413    Py_DECREF(str1);
8414    Py_DECREF(str2);
8415    return result;
8416}
8417
8418PyDoc_STRVAR(replace__doc__,
8419             "S.replace(old, new[, count]) -> str\n\
8420\n\
8421Return a copy of S with all occurrences of substring\n\
8422old replaced by new.  If the optional argument count is\n\
8423given, only the first count occurrences are replaced.");
8424
8425static PyObject*
8426unicode_replace(PyUnicodeObject *self, PyObject *args)
8427{
8428    PyUnicodeObject *str1;
8429    PyUnicodeObject *str2;
8430    Py_ssize_t maxcount = -1;
8431    PyObject *result;
8432
8433    if (!PyArg_ParseTuple(args, "OO|n:replace", &str1, &str2, &maxcount))
8434        return NULL;
8435    str1 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str1);
8436    if (str1 == NULL)
8437        return NULL;
8438    str2 = (PyUnicodeObject *)PyUnicode_FromObject((PyObject *)str2);
8439    if (str2 == NULL) {
8440        Py_DECREF(str1);
8441        return NULL;
8442    }
8443
8444    result = replace(self, str1, str2, maxcount);
8445
8446    Py_DECREF(str1);
8447    Py_DECREF(str2);
8448    return result;
8449}
8450
8451static PyObject *
8452unicode_repr(PyObject *unicode)
8453{
8454    PyObject *repr;
8455    Py_UNICODE *p;
8456    Py_UNICODE *s = PyUnicode_AS_UNICODE(unicode);
8457    Py_ssize_t size = PyUnicode_GET_SIZE(unicode);
8458
8459    /* XXX(nnorwitz): rather than over-allocating, it would be
8460       better to choose a different scheme.  Perhaps scan the
8461       first N-chars of the string and allocate based on that size.
8462    */
8463    /* Initial allocation is based on the longest-possible unichr
8464       escape.
8465
8466       In wide (UTF-32) builds '\U00xxxxxx' is 10 chars per source
8467       unichr, so in this case it's the longest unichr escape. In
8468       narrow (UTF-16) builds this is five chars per source unichr
8469       since there are two unichrs in the surrogate pair, so in narrow
8470       (UTF-16) builds it's not the longest unichr escape.
8471
8472       In wide or narrow builds '\uxxxx' is 6 chars per source unichr,
8473       so in the narrow (UTF-16) build case it's the longest unichr
8474       escape.
8475    */
8476
8477    repr = PyUnicode_FromUnicode(NULL,
8478                                 2 /* quotes */
8479#ifdef Py_UNICODE_WIDE
8480                                 + 10*size
8481#else
8482                                 + 6*size
8483#endif
8484                                 + 1);
8485    if (repr == NULL)
8486        return NULL;
8487
8488    p = PyUnicode_AS_UNICODE(repr);
8489
8490    /* Add quote */
8491    *p++ = (findchar(s, size, '\'') &&
8492            !findchar(s, size, '"')) ? '"' : '\'';
8493    while (size-- > 0) {
8494        Py_UNICODE ch = *s++;
8495
8496        /* Escape quotes and backslashes */
8497        if ((ch == PyUnicode_AS_UNICODE(repr)[0]) || (ch == '\\')) {
8498            *p++ = '\\';
8499            *p++ = ch;
8500            continue;
8501        }
8502
8503        /* Map special whitespace to '\t', \n', '\r' */
8504        if (ch == '\t') {
8505            *p++ = '\\';
8506            *p++ = 't';
8507        }
8508        else if (ch == '\n') {
8509            *p++ = '\\';
8510            *p++ = 'n';
8511        }
8512        else if (ch == '\r') {
8513            *p++ = '\\';
8514            *p++ = 'r';
8515        }
8516
8517        /* Map non-printable US ASCII to '\xhh' */
8518        else if (ch < ' ' || ch == 0x7F) {
8519            *p++ = '\\';
8520            *p++ = 'x';
8521            *p++ = hexdigits[(ch >> 4) & 0x000F];
8522            *p++ = hexdigits[ch & 0x000F];
8523        }
8524
8525        /* Copy ASCII characters as-is */
8526        else if (ch < 0x7F) {
8527            *p++ = ch;
8528        }
8529
8530        /* Non-ASCII characters */
8531        else {
8532            Py_UCS4 ucs = ch;
8533
8534#ifndef Py_UNICODE_WIDE
8535            Py_UNICODE ch2 = 0;
8536            /* Get code point from surrogate pair */
8537            if (size > 0) {
8538                ch2 = *s;
8539                if (ch >= 0xD800 && ch < 0xDC00 && ch2 >= 0xDC00
8540                    && ch2 <= 0xDFFF) {
8541                    ucs = (((ch & 0x03FF) << 10) | (ch2 & 0x03FF))
8542                        + 0x00010000;
8543                    s++;
8544                    size--;
8545                }
8546            }
8547#endif
8548            /* Map Unicode whitespace and control characters
8549               (categories Z* and C* except ASCII space)
8550            */
8551            if (!Py_UNICODE_ISPRINTABLE(ucs)) {
8552                /* Map 8-bit characters to '\xhh' */
8553                if (ucs <= 0xff) {
8554                    *p++ = '\\';
8555                    *p++ = 'x';
8556                    *p++ = hexdigits[(ch >> 4) & 0x000F];
8557                    *p++ = hexdigits[ch & 0x000F];
8558                }
8559                /* Map 21-bit characters to '\U00xxxxxx' */
8560                else if (ucs >= 0x10000) {
8561                    *p++ = '\\';
8562                    *p++ = 'U';
8563                    *p++ = hexdigits[(ucs >> 28) & 0x0000000F];
8564                    *p++ = hexdigits[(ucs >> 24) & 0x0000000F];
8565                    *p++ = hexdigits[(ucs >> 20) & 0x0000000F];
8566                    *p++ = hexdigits[(ucs >> 16) & 0x0000000F];
8567                    *p++ = hexdigits[(ucs >> 12) & 0x0000000F];
8568                    *p++ = hexdigits[(ucs >> 8) & 0x0000000F];
8569                    *p++ = hexdigits[(ucs >> 4) & 0x0000000F];
8570                    *p++ = hexdigits[ucs & 0x0000000F];
8571                }
8572                /* Map 16-bit characters to '\uxxxx' */
8573                else {
8574                    *p++ = '\\';
8575                    *p++ = 'u';
8576                    *p++ = hexdigits[(ucs >> 12) & 0x000F];
8577                    *p++ = hexdigits[(ucs >> 8) & 0x000F];
8578                    *p++ = hexdigits[(ucs >> 4) & 0x000F];
8579                    *p++ = hexdigits[ucs & 0x000F];
8580                }
8581            }
8582            /* Copy characters as-is */
8583            else {
8584                *p++ = ch;
8585#ifndef Py_UNICODE_WIDE
8586                if (ucs >= 0x10000)
8587                    *p++ = ch2;
8588#endif
8589            }
8590        }
8591    }
8592    /* Add quote */
8593    *p++ = PyUnicode_AS_UNICODE(repr)[0];
8594
8595    *p = '\0';
8596    PyUnicode_Resize(&repr, p - PyUnicode_AS_UNICODE(repr));
8597    return repr;
8598}
8599
8600PyDoc_STRVAR(rfind__doc__,
8601             "S.rfind(sub[, start[, end]]) -> int\n\
8602\n\
8603Return the highest index in S where substring sub is found,\n\
8604such that sub is contained within S[start:end].  Optional\n\
8605arguments start and end are interpreted as in slice notation.\n\
8606\n\
8607Return -1 on failure.");
8608
8609static PyObject *
8610unicode_rfind(PyUnicodeObject *self, PyObject *args)
8611{
8612    PyUnicodeObject *substring;
8613    Py_ssize_t start;
8614    Py_ssize_t end;
8615    Py_ssize_t result;
8616
8617    if (!stringlib_parse_args_finds_unicode("rfind", args, &substring,
8618                                            &start, &end))
8619        return NULL;
8620
8621    result = stringlib_rfind_slice(
8622        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8623        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8624        start, end
8625        );
8626
8627    Py_DECREF(substring);
8628
8629    return PyLong_FromSsize_t(result);
8630}
8631
8632PyDoc_STRVAR(rindex__doc__,
8633             "S.rindex(sub[, start[, end]]) -> int\n\
8634\n\
8635Like S.rfind() but raise ValueError when the substring is not found.");
8636
8637static PyObject *
8638unicode_rindex(PyUnicodeObject *self, PyObject *args)
8639{
8640    PyUnicodeObject *substring;
8641    Py_ssize_t start;
8642    Py_ssize_t end;
8643    Py_ssize_t result;
8644
8645    if (!stringlib_parse_args_finds_unicode("rindex", args, &substring,
8646                                            &start, &end))
8647        return NULL;
8648
8649    result = stringlib_rfind_slice(
8650        PyUnicode_AS_UNICODE(self), PyUnicode_GET_SIZE(self),
8651        PyUnicode_AS_UNICODE(substring), PyUnicode_GET_SIZE(substring),
8652        start, end
8653        );
8654
8655    Py_DECREF(substring);
8656
8657    if (result < 0) {
8658        PyErr_SetString(PyExc_ValueError, "substring not found");
8659        return NULL;
8660    }
8661    return PyLong_FromSsize_t(result);
8662}
8663
8664PyDoc_STRVAR(rjust__doc__,
8665             "S.rjust(width[, fillchar]) -> str\n\
8666\n\
8667Return S right-justified in a string of length width. Padding is\n\
8668done using the specified fill character (default is a space).");
8669
8670static PyObject *
8671unicode_rjust(PyUnicodeObject *self, PyObject *args)
8672{
8673    Py_ssize_t width;
8674    Py_UNICODE fillchar = ' ';
8675
8676    if (!PyArg_ParseTuple(args, "n|O&:rjust", &width, convert_uc, &fillchar))
8677        return NULL;
8678
8679    if (self->length >= width && PyUnicode_CheckExact(self)) {
8680        Py_INCREF(self);
8681        return (PyObject*) self;
8682    }
8683
8684    return (PyObject*) pad(self, width - self->length, 0, fillchar);
8685}
8686
8687PyObject *
8688PyUnicode_Split(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
8689{
8690    PyObject *result;
8691
8692    s = PyUnicode_FromObject(s);
8693    if (s == NULL)
8694        return NULL;
8695    if (sep != NULL) {
8696        sep = PyUnicode_FromObject(sep);
8697        if (sep == NULL) {
8698            Py_DECREF(s);
8699            return NULL;
8700        }
8701    }
8702
8703    result = split((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8704
8705    Py_DECREF(s);
8706    Py_XDECREF(sep);
8707    return result;
8708}
8709
8710PyDoc_STRVAR(split__doc__,
8711             "S.split([sep[, maxsplit]]) -> list of strings\n\
8712\n\
8713Return a list of the words in S, using sep as the\n\
8714delimiter string.  If maxsplit is given, at most maxsplit\n\
8715splits are done. If sep is not specified or is None, any\n\
8716whitespace string is a separator and empty strings are\n\
8717removed from the result.");
8718
8719static PyObject*
8720unicode_split(PyUnicodeObject *self, PyObject *args)
8721{
8722    PyObject *substring = Py_None;
8723    Py_ssize_t maxcount = -1;
8724
8725    if (!PyArg_ParseTuple(args, "|On:split", &substring, &maxcount))
8726        return NULL;
8727
8728    if (substring == Py_None)
8729        return split(self, NULL, maxcount);
8730    else if (PyUnicode_Check(substring))
8731        return split(self, (PyUnicodeObject *)substring, maxcount);
8732    else
8733        return PyUnicode_Split((PyObject *)self, substring, maxcount);
8734}
8735
8736PyObject *
8737PyUnicode_Partition(PyObject *str_in, PyObject *sep_in)
8738{
8739    PyObject* str_obj;
8740    PyObject* sep_obj;
8741    PyObject* out;
8742
8743    str_obj = PyUnicode_FromObject(str_in);
8744    if (!str_obj)
8745        return NULL;
8746    sep_obj = PyUnicode_FromObject(sep_in);
8747    if (!sep_obj) {
8748        Py_DECREF(str_obj);
8749        return NULL;
8750    }
8751
8752    out = stringlib_partition(
8753        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8754        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8755        );
8756
8757    Py_DECREF(sep_obj);
8758    Py_DECREF(str_obj);
8759
8760    return out;
8761}
8762
8763
8764PyObject *
8765PyUnicode_RPartition(PyObject *str_in, PyObject *sep_in)
8766{
8767    PyObject* str_obj;
8768    PyObject* sep_obj;
8769    PyObject* out;
8770
8771    str_obj = PyUnicode_FromObject(str_in);
8772    if (!str_obj)
8773        return NULL;
8774    sep_obj = PyUnicode_FromObject(sep_in);
8775    if (!sep_obj) {
8776        Py_DECREF(str_obj);
8777        return NULL;
8778    }
8779
8780    out = stringlib_rpartition(
8781        str_obj, PyUnicode_AS_UNICODE(str_obj), PyUnicode_GET_SIZE(str_obj),
8782        sep_obj, PyUnicode_AS_UNICODE(sep_obj), PyUnicode_GET_SIZE(sep_obj)
8783        );
8784
8785    Py_DECREF(sep_obj);
8786    Py_DECREF(str_obj);
8787
8788    return out;
8789}
8790
8791PyDoc_STRVAR(partition__doc__,
8792             "S.partition(sep) -> (head, sep, tail)\n\
8793\n\
8794Search for the separator sep in S, and return the part before it,\n\
8795the separator itself, and the part after it.  If the separator is not\n\
8796found, return S and two empty strings.");
8797
8798static PyObject*
8799unicode_partition(PyUnicodeObject *self, PyObject *separator)
8800{
8801    return PyUnicode_Partition((PyObject *)self, separator);
8802}
8803
8804PyDoc_STRVAR(rpartition__doc__,
8805             "S.rpartition(sep) -> (head, sep, tail)\n\
8806\n\
8807Search for the separator sep in S, starting at the end of S, and return\n\
8808the part before it, the separator itself, and the part after it.  If the\n\
8809separator is not found, return two empty strings and S.");
8810
8811static PyObject*
8812unicode_rpartition(PyUnicodeObject *self, PyObject *separator)
8813{
8814    return PyUnicode_RPartition((PyObject *)self, separator);
8815}
8816
8817PyObject *
8818PyUnicode_RSplit(PyObject *s, PyObject *sep, Py_ssize_t maxsplit)
8819{
8820    PyObject *result;
8821
8822    s = PyUnicode_FromObject(s);
8823    if (s == NULL)
8824        return NULL;
8825    if (sep != NULL) {
8826        sep = PyUnicode_FromObject(sep);
8827        if (sep == NULL) {
8828            Py_DECREF(s);
8829            return NULL;
8830        }
8831    }
8832
8833    result = rsplit((PyUnicodeObject *)s, (PyUnicodeObject *)sep, maxsplit);
8834
8835    Py_DECREF(s);
8836    Py_XDECREF(sep);
8837    return result;
8838}
8839
8840PyDoc_STRVAR(rsplit__doc__,
8841             "S.rsplit([sep[, maxsplit]]) -> list of strings\n\
8842\n\
8843Return a list of the words in S, using sep as the\n\
8844delimiter string, starting at the end of the string and\n\
8845working to the front.  If maxsplit is given, at most maxsplit\n\
8846splits are done. If sep is not specified, any whitespace string\n\
8847is a separator.");
8848
8849static PyObject*
8850unicode_rsplit(PyUnicodeObject *self, PyObject *args)
8851{
8852    PyObject *substring = Py_None;
8853    Py_ssize_t maxcount = -1;
8854
8855    if (!PyArg_ParseTuple(args, "|On:rsplit", &substring, &maxcount))
8856        return NULL;
8857
8858    if (substring == Py_None)
8859        return rsplit(self, NULL, maxcount);
8860    else if (PyUnicode_Check(substring))
8861        return rsplit(self, (PyUnicodeObject *)substring, maxcount);
8862    else
8863        return PyUnicode_RSplit((PyObject *)self, substring, maxcount);
8864}
8865
8866PyDoc_STRVAR(splitlines__doc__,
8867             "S.splitlines([keepends]) -> list of strings\n\
8868\n\
8869Return a list of the lines in S, breaking at line boundaries.\n\
8870Line breaks are not included in the resulting list unless keepends\n\
8871is given and true.");
8872
8873static PyObject*
8874unicode_splitlines(PyUnicodeObject *self, PyObject *args)
8875{
8876    int keepends = 0;
8877
8878    if (!PyArg_ParseTuple(args, "|i:splitlines", &keepends))
8879        return NULL;
8880
8881    return PyUnicode_Splitlines((PyObject *)self, keepends);
8882}
8883
8884static
8885PyObject *unicode_str(PyObject *self)
8886{
8887    if (PyUnicode_CheckExact(self)) {
8888        Py_INCREF(self);
8889        return self;
8890    } else
8891        /* Subtype -- return genuine unicode string with the same value. */
8892        return PyUnicode_FromUnicode(PyUnicode_AS_UNICODE(self),
8893                                     PyUnicode_GET_SIZE(self));
8894}
8895
PyDoc_STRVAR(swapcase__doc__,
             "S.swapcase() -> str\n\
\n\
Return a copy of S with uppercase characters converted to lowercase\n\
and vice versa.");

/* Implementation of S.swapcase(): returns the result of running the
   fixswapcase transform over self via fixup().
   NOTE(review): fixup() and fixswapcase are defined outside this section;
   their exact contract (copy vs. same object, error reporting) should be
   confirmed there. */
static PyObject*
unicode_swapcase(PyUnicodeObject *self)
{
    return fixup(self, fixswapcase);
}
8907
8908PyDoc_STRVAR(maketrans__doc__,
8909             "str.maketrans(x[, y[, z]]) -> dict (static method)\n\
8910\n\
8911Return a translation table usable for str.translate().\n\
8912If there is only one argument, it must be a dictionary mapping Unicode\n\
8913ordinals (integers) or characters to Unicode ordinals, strings or None.\n\
8914Character keys will be then converted to ordinals.\n\
8915If there are two arguments, they must be strings of equal length, and\n\
8916in the resulting dictionary, each character in x will be mapped to the\n\
8917character at the same position in y. If there is a third argument, it\n\
8918must be a string, whose characters will be mapped to None in the result.");
8919
8920static PyObject*
8921unicode_maketrans(PyUnicodeObject *null, PyObject *args)
8922{
8923    PyObject *x, *y = NULL, *z = NULL;
8924    PyObject *new = NULL, *key, *value;
8925    Py_ssize_t i = 0;
8926    int res;
8927
8928    if (!PyArg_ParseTuple(args, "O|UU:maketrans", &x, &y, &z))
8929        return NULL;
8930    new = PyDict_New();
8931    if (!new)
8932        return NULL;
8933    if (y != NULL) {
8934        /* x must be a string too, of equal length */
8935        Py_ssize_t ylen = PyUnicode_GET_SIZE(y);
8936        if (!PyUnicode_Check(x)) {
8937            PyErr_SetString(PyExc_TypeError, "first maketrans argument must "
8938                            "be a string if there is a second argument");
8939            goto err;
8940        }
8941        if (PyUnicode_GET_SIZE(x) != ylen) {
8942            PyErr_SetString(PyExc_ValueError, "the first two maketrans "
8943                            "arguments must have equal length");
8944            goto err;
8945        }
8946        /* create entries for translating chars in x to those in y */
8947        for (i = 0; i < PyUnicode_GET_SIZE(x); i++) {
8948            key = PyLong_FromLong(PyUnicode_AS_UNICODE(x)[i]);
8949            value = PyLong_FromLong(PyUnicode_AS_UNICODE(y)[i]);
8950            if (!key || !value)
8951                goto err;
8952            res = PyDict_SetItem(new, key, value);
8953            Py_DECREF(key);
8954            Py_DECREF(value);
8955            if (res < 0)
8956                goto err;
8957        }
8958        /* create entries for deleting chars in z */
8959        if (z != NULL) {
8960            for (i = 0; i < PyUnicode_GET_SIZE(z); i++) {
8961                key = PyLong_FromLong(PyUnicode_AS_UNICODE(z)[i]);
8962                if (!key)
8963                    goto err;
8964                res = PyDict_SetItem(new, key, Py_None);
8965                Py_DECREF(key);
8966                if (res < 0)
8967                    goto err;
8968            }
8969        }
8970    } else {
8971        /* x must be a dict */
8972        if (!PyDict_CheckExact(x)) {
8973            PyErr_SetString(PyExc_TypeError, "if you give only one argument "
8974                            "to maketrans it must be a dict");
8975            goto err;
8976        }
8977        /* copy entries into the new dict, converting string keys to int keys */
8978        while (PyDict_Next(x, &i, &key, &value)) {
8979            if (PyUnicode_Check(key)) {
8980                /* convert string keys to integer keys */
8981                PyObject *newkey;
8982                if (PyUnicode_GET_SIZE(key) != 1) {
8983                    PyErr_SetString(PyExc_ValueError, "string keys in translate "
8984                                    "table must be of length 1");
8985                    goto err;
8986                }
8987                newkey = PyLong_FromLong(PyUnicode_AS_UNICODE(key)[0]);
8988                if (!newkey)
8989                    goto err;
8990                res = PyDict_SetItem(new, newkey, value);
8991                Py_DECREF(newkey);
8992                if (res < 0)
8993                    goto err;
8994            } else if (PyLong_Check(key)) {
8995                /* just keep integer keys */
8996                if (PyDict_SetItem(new, key, value) < 0)
8997                    goto err;
8998            } else {
8999                PyErr_SetString(PyExc_TypeError, "keys in translate table must "
9000                                "be strings or integers");
9001                goto err;
9002            }
9003        }
9004    }
9005    return new;
9006  err:
9007    Py_DECREF(new);
9008    return NULL;
9009}
9010
9011PyDoc_STRVAR(translate__doc__,
9012             "S.translate(table) -> str\n\
9013\n\
9014Return a copy of the string S, where all characters have been mapped\n\
9015through the given translation table, which must be a mapping of\n\
9016Unicode ordinals to Unicode ordinals, strings, or None.\n\
9017Unmapped characters are left untouched. Characters mapped to None\n\
9018are deleted.");
9019
9020static PyObject*
9021unicode_translate(PyUnicodeObject *self, PyObject *table)
9022{
9023    return PyUnicode_TranslateCharmap(self->str, self->length, table, "ignore");
9024}
9025
PyDoc_STRVAR(upper__doc__,
             "S.upper() -> str\n\
\n\
Return a copy of S converted to uppercase.");

/* Implementation of S.upper(): returns the result of running the fixupper
   transform over self via fixup().
   NOTE(review): fixup() and fixupper are defined outside this section;
   their exact contract should be confirmed there. */
static PyObject*
unicode_upper(PyUnicodeObject *self)
{
    return fixup(self, fixupper);
}
9036
9037PyDoc_STRVAR(zfill__doc__,
9038             "S.zfill(width) -> str\n\
9039\n\
9040Pad a numeric string S with zeros on the left, to fill a field\n\
9041of the specified width. The string S is never truncated.");
9042
9043static PyObject *
9044unicode_zfill(PyUnicodeObject *self, PyObject *args)
9045{
9046    Py_ssize_t fill;
9047    PyUnicodeObject *u;
9048
9049    Py_ssize_t width;
9050    if (!PyArg_ParseTuple(args, "n:zfill", &width))
9051        return NULL;
9052
9053    if (self->length >= width) {
9054        if (PyUnicode_CheckExact(self)) {
9055            Py_INCREF(self);
9056            return (PyObject*) self;
9057        }
9058        else
9059            return PyUnicode_FromUnicode(
9060                PyUnicode_AS_UNICODE(self),
9061                PyUnicode_GET_SIZE(self)
9062                );
9063    }
9064
9065    fill = width - self->length;
9066
9067    u = pad(self, fill, 0, '0');
9068
9069    if (u == NULL)
9070        return NULL;
9071
9072    if (u->str[fill] == '+' || u->str[fill] == '-') {
9073        /* move sign to beginning of string */
9074        u->str[0] = u->str[fill];
9075        u->str[fill] = '0';
9076    }
9077
9078    return (PyObject*) u;
9079}
9080
/* Debugging helpers, compiled out by the #if 0 guard below. */
#if 0
/* Returns the value of `numfree` as a Python int. */
static PyObject*
unicode_freelistsize(PyUnicodeObject *self)
{
    return PyLong_FromLong(numfree);
}

/* Passes self's buffer and size to PyUnicode_TransformDecimalToASCII()
   and returns its result. */
static PyObject *
unicode__decimal2ascii(PyObject *self)
{
    return PyUnicode_TransformDecimalToASCII(PyUnicode_AS_UNICODE(self),
                                             PyUnicode_GET_SIZE(self));
}
#endif
9095
9096PyDoc_STRVAR(startswith__doc__,
9097             "S.startswith(prefix[, start[, end]]) -> bool\n\
9098\n\
9099Return True if S starts with the specified prefix, False otherwise.\n\
9100With optional start, test S beginning at that position.\n\
9101With optional end, stop comparing S at that position.\n\
9102prefix can also be a tuple of strings to try.");
9103
9104static PyObject *
9105unicode_startswith(PyUnicodeObject *self,
9106                   PyObject *args)
9107{
9108    PyObject *subobj;
9109    PyUnicodeObject *substring;
9110    Py_ssize_t start = 0;
9111    Py_ssize_t end = PY_SSIZE_T_MAX;
9112    int result;
9113
9114    if (!stringlib_parse_args_finds("startswith", args, &subobj, &start, &end))
9115        return NULL;
9116    if (PyTuple_Check(subobj)) {
9117        Py_ssize_t i;
9118        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9119            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9120                PyTuple_GET_ITEM(subobj, i));
9121            if (substring == NULL)
9122                return NULL;
9123            result = tailmatch(self, substring, start, end, -1);
9124            Py_DECREF(substring);
9125            if (result) {
9126                Py_RETURN_TRUE;
9127            }
9128        }
9129        /* nothing matched */
9130        Py_RETURN_FALSE;
9131    }
9132    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9133    if (substring == NULL) {
9134        if (PyErr_ExceptionMatches(PyExc_TypeError))
9135            PyErr_Format(PyExc_TypeError, "startswith first arg must be str or "
9136                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9137        return NULL;
9138    }
9139    result = tailmatch(self, substring, start, end, -1);
9140    Py_DECREF(substring);
9141    return PyBool_FromLong(result);
9142}
9143
9144
9145PyDoc_STRVAR(endswith__doc__,
9146             "S.endswith(suffix[, start[, end]]) -> bool\n\
9147\n\
9148Return True if S ends with the specified suffix, False otherwise.\n\
9149With optional start, test S beginning at that position.\n\
9150With optional end, stop comparing S at that position.\n\
9151suffix can also be a tuple of strings to try.");
9152
9153static PyObject *
9154unicode_endswith(PyUnicodeObject *self,
9155                 PyObject *args)
9156{
9157    PyObject *subobj;
9158    PyUnicodeObject *substring;
9159    Py_ssize_t start = 0;
9160    Py_ssize_t end = PY_SSIZE_T_MAX;
9161    int result;
9162
9163    if (!stringlib_parse_args_finds("endswith", args, &subobj, &start, &end))
9164        return NULL;
9165    if (PyTuple_Check(subobj)) {
9166        Py_ssize_t i;
9167        for (i = 0; i < PyTuple_GET_SIZE(subobj); i++) {
9168            substring = (PyUnicodeObject *)PyUnicode_FromObject(
9169                PyTuple_GET_ITEM(subobj, i));
9170            if (substring == NULL)
9171                return NULL;
9172            result = tailmatch(self, substring, start, end, +1);
9173            Py_DECREF(substring);
9174            if (result) {
9175                Py_RETURN_TRUE;
9176            }
9177        }
9178        Py_RETURN_FALSE;
9179    }
9180    substring = (PyUnicodeObject *)PyUnicode_FromObject(subobj);
9181    if (substring == NULL) {
9182        if (PyErr_ExceptionMatches(PyExc_TypeError))
9183            PyErr_Format(PyExc_TypeError, "endswith first arg must be str or "
9184                         "a tuple of str, not %s", Py_TYPE(subobj)->tp_name);
9185        return NULL;
9186    }
9187    result = tailmatch(self, substring, start, end, +1);
9188    Py_DECREF(substring);
9189    return PyBool_FromLong(result);
9190}
9191
9192#include "stringlib/string_format.h"
9193
/* Docstrings for the "format" and "format_map" entries of the method
   table.  The functions those entries point at (do_string_format and
   do_string_format_map) are not defined in this section; presumably they
   come from the stringlib/string_format.h include -- TODO confirm. */
PyDoc_STRVAR(format__doc__,
             "S.format(*args, **kwargs) -> str\n\
\n\
Return a formatted version of S, using substitutions from args and kwargs.\n\
The substitutions are identified by braces ('{' and '}').");

PyDoc_STRVAR(format_map__doc__,
             "S.format_map(mapping) -> str\n\
\n\
Return a formatted version of S, using substitutions from mapping.\n\
The substitutions are identified by braces ('{' and '}').");
9205
9206static PyObject *
9207unicode__format__(PyObject* self, PyObject* args)
9208{
9209    PyObject *format_spec;
9210
9211    if (!PyArg_ParseTuple(args, "U:__format__", &format_spec))
9212        return NULL;
9213
9214    return _PyUnicode_FormatAdvanced(self,
9215                                     PyUnicode_AS_UNICODE(format_spec),
9216                                     PyUnicode_GET_SIZE(format_spec));
9217}
9218
9219PyDoc_STRVAR(p_format__doc__,
9220             "S.__format__(format_spec) -> str\n\
9221\n\
9222Return a formatted version of S as described by format_spec.");
9223
9224static PyObject *
9225unicode__sizeof__(PyUnicodeObject *v)
9226{
9227    return PyLong_FromSsize_t(sizeof(PyUnicodeObject) +
9228                              sizeof(Py_UNICODE) * (v->length + 1));
9229}
9230
9231PyDoc_STRVAR(sizeof__doc__,
9232             "S.__sizeof__() -> size of S in memory, in bytes");
9233
9234static PyObject *
9235unicode_getnewargs(PyUnicodeObject *v)
9236{
9237    return Py_BuildValue("(u#)", v->str, v->length);
9238}
9239
/* Method table mapping Python-level method names to their C
   implementations, calling-convention flags and docstrings.  The table is
   terminated by a {NULL, NULL} sentinel.  Entries inside #if 0 are
   compiled out.
   NOTE(review): presumably installed as tp_methods of the unicode type;
   the type object is not visible in this section. */
static PyMethodDef unicode_methods[] = {

    /* Order is according to common usage: often used methods should
       appear first, since lookup is done sequentially. */

    {"encode", (PyCFunction) unicode_encode, METH_VARARGS | METH_KEYWORDS, encode__doc__},
    {"replace", (PyCFunction) unicode_replace, METH_VARARGS, replace__doc__},
    {"split", (PyCFunction) unicode_split, METH_VARARGS, split__doc__},
    {"rsplit", (PyCFunction) unicode_rsplit, METH_VARARGS, rsplit__doc__},
    {"join", (PyCFunction) unicode_join, METH_O, join__doc__},
    {"capitalize", (PyCFunction) unicode_capitalize, METH_NOARGS, capitalize__doc__},
    {"title", (PyCFunction) unicode_title, METH_NOARGS, title__doc__},
    {"center", (PyCFunction) unicode_center, METH_VARARGS, center__doc__},
    {"count", (PyCFunction) unicode_count, METH_VARARGS, count__doc__},
    {"expandtabs", (PyCFunction) unicode_expandtabs, METH_VARARGS, expandtabs__doc__},
    {"find", (PyCFunction) unicode_find, METH_VARARGS, find__doc__},
    {"partition", (PyCFunction) unicode_partition, METH_O, partition__doc__},
    {"index", (PyCFunction) unicode_index, METH_VARARGS, index__doc__},
    {"ljust", (PyCFunction) unicode_ljust, METH_VARARGS, ljust__doc__},
    {"lower", (PyCFunction) unicode_lower, METH_NOARGS, lower__doc__},
    {"lstrip", (PyCFunction) unicode_lstrip, METH_VARARGS, lstrip__doc__},
    {"rfind", (PyCFunction) unicode_rfind, METH_VARARGS, rfind__doc__},
    {"rindex", (PyCFunction) unicode_rindex, METH_VARARGS, rindex__doc__},
    {"rjust", (PyCFunction) unicode_rjust, METH_VARARGS, rjust__doc__},
    {"rstrip", (PyCFunction) unicode_rstrip, METH_VARARGS, rstrip__doc__},
    {"rpartition", (PyCFunction) unicode_rpartition, METH_O, rpartition__doc__},
    {"splitlines", (PyCFunction) unicode_splitlines, METH_VARARGS, splitlines__doc__},
    {"strip", (PyCFunction) unicode_strip, METH_VARARGS, strip__doc__},
    {"swapcase", (PyCFunction) unicode_swapcase, METH_NOARGS, swapcase__doc__},
    {"translate", (PyCFunction) unicode_translate, METH_O, translate__doc__},
    {"upper", (PyCFunction) unicode_upper, METH_NOARGS, upper__doc__},
    {"startswith", (PyCFunction) unicode_startswith, METH_VARARGS, startswith__doc__},
    {"endswith", (PyCFunction) unicode_endswith, METH_VARARGS, endswith__doc__},
    {"islower", (PyCFunction) unicode_islower, METH_NOARGS, islower__doc__},
    {"isupper", (PyCFunction) unicode_isupper, METH_NOARGS, isupper__doc__},
    {"istitle", (PyCFunction) unicode_istitle, METH_NOARGS, istitle__doc__},
    {"isspace", (PyCFunction) unicode_isspace, METH_NOARGS, isspace__doc__},
    {"isdecimal", (PyCFunction) unicode_isdecimal, METH_NOARGS, isdecimal__doc__},
    {"isdigit", (PyCFunction) unicode_isdigit, METH_NOARGS, isdigit__doc__},
    {"isnumeric", (PyCFunction) unicode_isnumeric, METH_NOARGS, isnumeric__doc__},
    {"isalpha", (PyCFunction) unicode_isalpha, METH_NOARGS, isalpha__doc__},
    {"isalnum", (PyCFunction) unicode_isalnum, METH_NOARGS, isalnum__doc__},
    {"isidentifier", (PyCFunction) unicode_isidentifier, METH_NOARGS, isidentifier__doc__},
    {"isprintable", (PyCFunction) unicode_isprintable, METH_NOARGS, isprintable__doc__},
    {"zfill", (PyCFunction) unicode_zfill, METH_VARARGS, zfill__doc__},
    {"format", (PyCFunction) do_string_format, METH_VARARGS | METH_KEYWORDS, format__doc__},
    {"format_map", (PyCFunction) do_string_format_map, METH_O, format_map__doc__},
    {"__format__", (PyCFunction) unicode__format__, METH_VARARGS, p_format__doc__},
    /* maketrans takes no instance: it is registered with METH_STATIC. */
    {"maketrans", (PyCFunction) unicode_maketrans,
     METH_VARARGS | METH_STATIC, maketrans__doc__},
    {"__sizeof__", (PyCFunction) unicode__sizeof__, METH_NOARGS, sizeof__doc__},
#if 0
    {"capwords", (PyCFunction) unicode_capwords, METH_NOARGS, capwords__doc__},
#endif

#if 0
    /* These methods are just used for debugging the implementation. */
    {"freelistsize", (PyCFunction) unicode_freelistsize, METH_NOARGS},
    {"_decimal2ascii", (PyCFunction) unicode__decimal2ascii, METH_NOARGS},
#endif

    /* No docstring is supplied for __getnewargs__. */
    {"__getnewargs__",  (PyCFunction)unicode_getnewargs, METH_NOARGS},
    {NULL, NULL}
};
9304
9305static PyObject *
9306unicode_mod(PyObject *v, PyObject *w)
9307{
9308    if (!PyUnicode_Check(v))
9309        Py_RETURN_NOTIMPLEMENTED;
9310    return PyUnicode_Format(v, w);
9311}
9312
/* Number-protocol slot table.  Only nb_remainder is filled in (with
   unicode_mod); the three slots before it are explicitly 0 and any slots
   after it are left to C's default zero-initialisation. */
static PyNumberMethods unicode_as_number = {
    0,              /*nb_add*/
    0,              /*nb_subtract*/
    0,              /*nb_multiply*/
    unicode_mod,            /*nb_remainder*/
};
9319
/* Sequence-protocol slot table: length, concatenation, repetition, item
   access and containment are provided; the slice and assignment slots are
   0.  Slots are positional, so the order of entries must not change. */
static PySequenceMethods unicode_as_sequence = {
    (lenfunc) unicode_length,       /* sq_length */
    PyUnicode_Concat,           /* sq_concat */
    (ssizeargfunc) unicode_repeat,  /* sq_repeat */
    (ssizeargfunc) unicode_getitem,     /* sq_item */
    0,                  /* sq_slice */
    0,                  /* sq_ass_item */
    0,                  /* sq_ass_slice */
    PyUnicode_Contains,         /* sq_contains */
};
9330
9331static PyObject*
9332unicode_subscript(PyUnicodeObject* self, PyObject* item)
9333{
9334    if (PyIndex_Check(item)) {
9335        Py_ssize_t i = PyNumber_AsSsize_t(item, PyExc_IndexError);
9336        if (i == -1 && PyErr_Occurred())
9337            return NULL;
9338        if (i < 0)
9339            i += PyUnicode_GET_SIZE(self);
9340        return unicode_getitem(self, i);
9341    } else if (PySlice_Check(item)) {
9342        Py_ssize_t start, stop, step, slicelength, cur, i;
9343        Py_UNICODE* source_buf;
9344        Py_UNICODE* result_buf;
9345        PyObject* result;
9346
9347        if (PySlice_GetIndicesEx(item, PyUnicode_GET_SIZE(self),
9348                                 &start, &stop, &step, &slicelength) < 0) {
9349            return NULL;
9350        }
9351
9352        if (slicelength <= 0) {
9353            return PyUnicode_FromUnicode(NULL, 0);
9354        } else if (start == 0 && step == 1 && slicelength == self->length &&
9355                   PyUnicode_CheckExact(self)) {
9356            Py_INCREF(self);
9357            return (PyObject *)self;
9358        } else if (step == 1) {
9359            return PyUnicode_FromUnicode(self->str + start, slicelength);
9360        } else {
9361            source_buf = PyUnicode_AS_UNICODE((PyObject*)self);
9362            result_buf = (Py_UNICODE *)PyObject_MALLOC(slicelength*
9363                                                       sizeof(Py_UNICODE));
9364
9365            if (result_buf == NULL)
9366                return PyErr_NoMemory();
9367
9368            for (cur = start, i = 0; i < slicelength; cur += step, i++) {
9369                result_buf[i] = source_buf[cur];
9370            }
9371
9372            result = PyUnicode_FromUnicode(result_buf, slicelength);
9373            PyObject_FREE(result_buf);
9374            return result;
9375        }
9376    } else {
9377        PyErr_SetString(PyExc_TypeError, "string indices must be integers");
9378        return NULL;
9379    }
9380}
9381
/* Mapping protocol slots for str.  Only length and read access are
   provided; the assignment slot is left empty. */
static PyMappingMethods unicode_as_mapping = {
    (lenfunc)unicode_length,        /* mp_length */
    (binaryfunc)unicode_subscript,  /* mp_subscript: index or slice lookup */
    (objobjargproc)0,           /* mp_ass_subscript: not provided */
};
9387
9388
9389/* Helpers for PyUnicode_Format() */
9390
9391static PyObject *
9392getnextarg(PyObject *args, Py_ssize_t arglen, Py_ssize_t *p_argidx)
9393{
9394    Py_ssize_t argidx = *p_argidx;
9395    if (argidx < arglen) {
9396        (*p_argidx)++;
9397        if (arglen < 0)
9398            return args;
9399        else
9400            return PyTuple_GetItem(args, argidx);
9401    }
9402    PyErr_SetString(PyExc_TypeError,
9403                    "not enough arguments for format string");
9404    return NULL;
9405}
9406
9407/* Returns a new reference to a PyUnicode object, or NULL on failure. */
9408
9409static PyObject *
9410formatfloat(PyObject *v, int flags, int prec, int type)
9411{
9412    char *p;
9413    PyObject *result;
9414    double x;
9415
9416    x = PyFloat_AsDouble(v);
9417    if (x == -1.0 && PyErr_Occurred())
9418        return NULL;
9419
9420    if (prec < 0)
9421        prec = 6;
9422
9423    p = PyOS_double_to_string(x, type, prec,
9424                              (flags & F_ALT) ? Py_DTSF_ALT : 0, NULL);
9425    if (p == NULL)
9426        return NULL;
9427    result = PyUnicode_FromStringAndSize(p, strlen(p));
9428    PyMem_Free(p);
9429    return result;
9430}
9431
9432static PyObject*
9433formatlong(PyObject *val, int flags, int prec, int type)
9434{
9435    char *buf;
9436    int len;
9437    PyObject *str; /* temporary string object. */
9438    PyObject *result;
9439
9440    str = _PyBytes_FormatLong(val, flags, prec, type, &buf, &len);
9441    if (!str)
9442        return NULL;
9443    result = PyUnicode_FromStringAndSize(buf, len);
9444    Py_DECREF(str);
9445    return result;
9446}
9447
9448static int
9449formatchar(Py_UNICODE *buf,
9450           size_t buflen,
9451           PyObject *v)
9452{
9453    /* presume that the buffer is at least 3 characters long */
9454    if (PyUnicode_Check(v)) {
9455        if (PyUnicode_GET_SIZE(v) == 1) {
9456            buf[0] = PyUnicode_AS_UNICODE(v)[0];
9457            buf[1] = '\0';
9458            return 1;
9459        }
9460#ifndef Py_UNICODE_WIDE
9461        if (PyUnicode_GET_SIZE(v) == 2) {
9462            /* Decode a valid surrogate pair */
9463            int c0 = PyUnicode_AS_UNICODE(v)[0];
9464            int c1 = PyUnicode_AS_UNICODE(v)[1];
9465            if (0xD800 <= c0 && c0 <= 0xDBFF &&
9466                0xDC00 <= c1 && c1 <= 0xDFFF) {
9467                buf[0] = c0;
9468                buf[1] = c1;
9469                buf[2] = '\0';
9470                return 2;
9471            }
9472        }
9473#endif
9474        goto onError;
9475    }
9476    else {
9477        /* Integer input truncated to a character */
9478        long x;
9479        x = PyLong_AsLong(v);
9480        if (x == -1 && PyErr_Occurred())
9481            goto onError;
9482
9483        if (x < 0 || x > 0x10ffff) {
9484            PyErr_SetString(PyExc_OverflowError,
9485                            "%c arg not in range(0x110000)");
9486            return -1;
9487        }
9488
9489#ifndef Py_UNICODE_WIDE
9490        if (x > 0xffff) {
9491            x -= 0x10000;
9492            buf[0] = (Py_UNICODE)(0xD800 | (x >> 10));
9493            buf[1] = (Py_UNICODE)(0xDC00 | (x & 0x3FF));
9494            return 2;
9495        }
9496#endif
9497        buf[0] = (Py_UNICODE) x;
9498        buf[1] = '\0';
9499        return 1;
9500    }
9501
9502  onError:
9503    PyErr_SetString(PyExc_TypeError,
9504                    "%c requires int or char");
9505    return -1;
9506}
9507
9508/* fmt%(v1,v2,...) is roughly equivalent to sprintf(fmt, v1, v2, ...)
9509   FORMATBUFLEN is the length of the buffer in which chars are formatted.
9510*/
9511#define FORMATBUFLEN (size_t)10
9512
9513PyObject *
9514PyUnicode_Format(PyObject *format, PyObject *args)
9515{
9516    Py_UNICODE *fmt, *res;
9517    Py_ssize_t fmtcnt, rescnt, reslen, arglen, argidx;
9518    int args_owned = 0;
9519    PyUnicodeObject *result = NULL;
9520    PyObject *dict = NULL;
9521    PyObject *uformat;
9522
9523    if (format == NULL || args == NULL) {
9524        PyErr_BadInternalCall();
9525        return NULL;
9526    }
9527    uformat = PyUnicode_FromObject(format);
9528    if (uformat == NULL)
9529        return NULL;
9530    fmt = PyUnicode_AS_UNICODE(uformat);
9531    fmtcnt = PyUnicode_GET_SIZE(uformat);
9532
9533    reslen = rescnt = fmtcnt + 100;
9534    result = _PyUnicode_New(reslen);
9535    if (result == NULL)
9536        goto onError;
9537    res = PyUnicode_AS_UNICODE(result);
9538
9539    if (PyTuple_Check(args)) {
9540        arglen = PyTuple_Size(args);
9541        argidx = 0;
9542    }
9543    else {
9544        arglen = -1;
9545        argidx = -2;
9546    }
9547    if (Py_TYPE(args)->tp_as_mapping && !PyTuple_Check(args) &&
9548        !PyUnicode_Check(args))
9549        dict = args;
9550
9551    while (--fmtcnt >= 0) {
9552        if (*fmt != '%') {
9553            if (--rescnt < 0) {
9554                rescnt = fmtcnt + 100;
9555                reslen += rescnt;
9556                if (_PyUnicode_Resize(&result, reslen) < 0)
9557                    goto onError;
9558                res = PyUnicode_AS_UNICODE(result) + reslen - rescnt;
9559                --rescnt;
9560            }
9561            *res++ = *fmt++;
9562        }
9563        else {
9564            /* Got a format specifier */
9565            int flags = 0;
9566            Py_ssize_t width = -1;
9567            int prec = -1;
9568            Py_UNICODE c = '\0';
9569            Py_UNICODE fill;
9570            int isnumok;
9571            PyObject *v = NULL;
9572            PyObject *temp = NULL;
9573            Py_UNICODE *pbuf;
9574            Py_UNICODE sign;
9575            Py_ssize_t len;
9576            Py_UNICODE formatbuf[FORMATBUFLEN]; /* For formatchar() */
9577
9578            fmt++;
9579            if (*fmt == '(') {
9580                Py_UNICODE *keystart;
9581                Py_ssize_t keylen;
9582                PyObject *key;
9583                int pcount = 1;
9584
9585                if (dict == NULL) {
9586                    PyErr_SetString(PyExc_TypeError,
9587                                    "format requires a mapping");
9588                    goto onError;
9589                }
9590                ++fmt;
9591                --fmtcnt;
9592                keystart = fmt;
9593                /* Skip over balanced parentheses */
9594                while (pcount > 0 && --fmtcnt >= 0) {
9595                    if (*fmt == ')')
9596                        --pcount;
9597                    else if (*fmt == '(')
9598                        ++pcount;
9599                    fmt++;
9600                }
9601                keylen = fmt - keystart - 1;
9602                if (fmtcnt < 0 || pcount > 0) {
9603                    PyErr_SetString(PyExc_ValueError,
9604                                    "incomplete format key");
9605                    goto onError;
9606                }
9607#if 0
9608                /* keys are converted to strings using UTF-8 and
9609                   then looked up since Python uses strings to hold
9610                   variables names etc. in its namespaces and we
9611                   wouldn't want to break common idioms. */
9612                key = PyUnicode_EncodeUTF8(keystart,
9613                                           keylen,
9614                                           NULL);
9615#else
9616                key = PyUnicode_FromUnicode(keystart, keylen);
9617#endif
9618                if (key == NULL)
9619                    goto onError;
9620                if (args_owned) {
9621                    Py_DECREF(args);
9622                    args_owned = 0;
9623                }
9624                args = PyObject_GetItem(dict, key);
9625                Py_DECREF(key);
9626                if (args == NULL) {
9627                    goto onError;
9628                }
9629                args_owned = 1;
9630                arglen = -1;
9631                argidx = -2;
9632            }
9633            while (--fmtcnt >= 0) {
9634                switch (c = *fmt++) {
9635                case '-': flags |= F_LJUST; continue;
9636                case '+': flags |= F_SIGN; continue;
9637                case ' ': flags |= F_BLANK; continue;
9638                case '#': flags |= F_ALT; continue;
9639                case '0': flags |= F_ZERO; continue;
9640                }
9641                break;
9642            }
9643            if (c == '*') {
9644                v = getnextarg(args, arglen, &argidx);
9645                if (v == NULL)
9646                    goto onError;
9647                if (!PyLong_Check(v)) {
9648                    PyErr_SetString(PyExc_TypeError,
9649                                    "* wants int");
9650                    goto onError;
9651                }
9652                width = PyLong_AsLong(v);
9653                if (width == -1 && PyErr_Occurred())
9654                    goto onError;
9655                if (width < 0) {
9656                    flags |= F_LJUST;
9657                    width = -width;
9658                }
9659                if (--fmtcnt >= 0)
9660                    c = *fmt++;
9661            }
9662            else if (c >= '0' && c <= '9') {
9663                width = c - '0';
9664                while (--fmtcnt >= 0) {
9665                    c = *fmt++;
9666                    if (c < '0' || c > '9')
9667                        break;
9668                    if ((width*10) / 10 != width) {
9669                        PyErr_SetString(PyExc_ValueError,
9670                                        "width too big");
9671                        goto onError;
9672                    }
9673                    width = width*10 + (c - '0');
9674                }
9675            }
9676            if (c == '.') {
9677                prec = 0;
9678                if (--fmtcnt >= 0)
9679                    c = *fmt++;
9680                if (c == '*') {
9681                    v = getnextarg(args, arglen, &argidx);
9682                    if (v == NULL)
9683                        goto onError;
9684                    if (!PyLong_Check(v)) {
9685                        PyErr_SetString(PyExc_TypeError,
9686                                        "* wants int");
9687                        goto onError;
9688                    }
9689                    prec = PyLong_AsLong(v);
9690                    if (prec == -1 && PyErr_Occurred())
9691                        goto onError;
9692                    if (prec < 0)
9693                        prec = 0;
9694                    if (--fmtcnt >= 0)
9695                        c = *fmt++;
9696                }
9697                else if (c >= '0' && c <= '9') {
9698                    prec = c - '0';
9699                    while (--fmtcnt >= 0) {
9700                        c = *fmt++;
9701                        if (c < '0' || c > '9')
9702                            break;
9703                        if ((prec*10) / 10 != prec) {
9704                            PyErr_SetString(PyExc_ValueError,
9705                                            "prec too big");
9706                            goto onError;
9707                        }
9708                        prec = prec*10 + (c - '0');
9709                    }
9710                }
9711            } /* prec */
9712            if (fmtcnt >= 0) {
9713                if (c == 'h' || c == 'l' || c == 'L') {
9714                    if (--fmtcnt >= 0)
9715                        c = *fmt++;
9716                }
9717            }
9718            if (fmtcnt < 0) {
9719                PyErr_SetString(PyExc_ValueError,
9720                                "incomplete format");
9721                goto onError;
9722            }
9723            if (c != '%') {
9724                v = getnextarg(args, arglen, &argidx);
9725                if (v == NULL)
9726                    goto onError;
9727            }
9728            sign = 0;
9729            fill = ' ';
9730            switch (c) {
9731
9732            case '%':
9733                pbuf = formatbuf;
9734                /* presume that buffer length is at least 1 */
9735                pbuf[0] = '%';
9736                len = 1;
9737                break;
9738
9739            case 's':
9740            case 'r':
9741            case 'a':
9742                if (PyUnicode_CheckExact(v) && c == 's') {
9743                    temp = v;
9744                    Py_INCREF(temp);
9745                }
9746                else {
9747                    if (c == 's')
9748                        temp = PyObject_Str(v);
9749                    else if (c == 'r')
9750                        temp = PyObject_Repr(v);
9751                    else
9752                        temp = PyObject_ASCII(v);
9753                    if (temp == NULL)
9754                        goto onError;
9755                    if (PyUnicode_Check(temp))
9756                        /* nothing to do */;
9757                    else {
9758                        Py_DECREF(temp);
9759                        PyErr_SetString(PyExc_TypeError,
9760                                        "%s argument has non-string str()");
9761                        goto onError;
9762                    }
9763                }
9764                pbuf = PyUnicode_AS_UNICODE(temp);
9765                len = PyUnicode_GET_SIZE(temp);
9766                if (prec >= 0 && len > prec)
9767                    len = prec;
9768                break;
9769
9770            case 'i':
9771            case 'd':
9772            case 'u':
9773            case 'o':
9774            case 'x':
9775            case 'X':
9776                isnumok = 0;
9777                if (PyNumber_Check(v)) {
9778                    PyObject *iobj=NULL;
9779
9780                    if (PyLong_Check(v)) {
9781                        iobj = v;
9782                        Py_INCREF(iobj);
9783                    }
9784                    else {
9785                        iobj = PyNumber_Long(v);
9786                    }
9787                    if (iobj!=NULL) {
9788                        if (PyLong_Check(iobj)) {
9789                            isnumok = 1;
9790                            temp = formatlong(iobj, flags, prec, (c == 'i'? 'd': c));
9791                            Py_DECREF(iobj);
9792                            if (!temp)
9793                                goto onError;
9794                            pbuf = PyUnicode_AS_UNICODE(temp);
9795                            len = PyUnicode_GET_SIZE(temp);
9796                            sign = 1;
9797                        }
9798                        else {
9799                            Py_DECREF(iobj);
9800                        }
9801                    }
9802                }
9803                if (!isnumok) {
9804                    PyErr_Format(PyExc_TypeError,
9805                                 "%%%c format: a number is required, "
9806                                 "not %.200s", (char)c, Py_TYPE(v)->tp_name);
9807                    goto onError;
9808                }
9809                if (flags & F_ZERO)
9810                    fill = '0';
9811                break;
9812
9813            case 'e':
9814            case 'E':
9815            case 'f':
9816            case 'F':
9817            case 'g':
9818            case 'G':
9819                temp = formatfloat(v, flags, prec, c);
9820                if (!temp)
9821                    goto onError;
9822                pbuf = PyUnicode_AS_UNICODE(temp);
9823                len = PyUnicode_GET_SIZE(temp);
9824                sign = 1;
9825                if (flags & F_ZERO)
9826                    fill = '0';
9827                break;
9828
9829            case 'c':
9830                pbuf = formatbuf;
9831                len = formatchar(pbuf, sizeof(formatbuf)/sizeof(Py_UNICODE), v);
9832                if (len < 0)
9833                    goto onError;
9834                break;
9835
9836            default:
9837                PyErr_Format(PyExc_ValueError,
9838                             "unsupported format character '%c' (0x%x) "
9839                             "at index %zd",
9840                             (31<=c && c<=126) ? (char)c : '?',
9841                             (int)c,
9842                             (Py_ssize_t)(fmt - 1 -
9843                                          PyUnicode_AS_UNICODE(uformat)));
9844                goto onError;
9845            }
9846            if (sign) {
9847                if (*pbuf == '-' || *pbuf == '+') {
9848                    sign = *pbuf++;
9849                    len--;
9850                }
9851                else if (flags & F_SIGN)
9852                    sign = '+';
9853                else if (flags & F_BLANK)
9854                    sign = ' ';
9855                else
9856                    sign = 0;
9857            }
9858            if (width < len)
9859                width = len;
9860            if (rescnt - (sign != 0) < width) {
9861                reslen -= rescnt;
9862                rescnt = width + fmtcnt + 100;
9863                reslen += rescnt;
9864                if (reslen < 0) {
9865                    Py_XDECREF(temp);
9866                    PyErr_NoMemory();
9867                    goto onError;
9868                }
9869                if (_PyUnicode_Resize(&result, reslen) < 0) {
9870                    Py_XDECREF(temp);
9871                    goto onError;
9872                }
9873                res = PyUnicode_AS_UNICODE(result)
9874                    + reslen - rescnt;
9875            }
9876            if (sign) {
9877                if (fill != ' ')
9878                    *res++ = sign;
9879                rescnt--;
9880                if (width > len)
9881                    width--;
9882            }
9883            if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9884                assert(pbuf[0] == '0');
9885                assert(pbuf[1] == c);
9886                if (fill != ' ') {
9887                    *res++ = *pbuf++;
9888                    *res++ = *pbuf++;
9889                }
9890                rescnt -= 2;
9891                width -= 2;
9892                if (width < 0)
9893                    width = 0;
9894                len -= 2;
9895            }
9896            if (width > len && !(flags & F_LJUST)) {
9897                do {
9898                    --rescnt;
9899                    *res++ = fill;
9900                } while (--width > len);
9901            }
9902            if (fill == ' ') {
9903                if (sign)
9904                    *res++ = sign;
9905                if ((flags & F_ALT) && (c == 'x' || c == 'X' || c == 'o')) {
9906                    assert(pbuf[0] == '0');
9907                    assert(pbuf[1] == c);
9908                    *res++ = *pbuf++;
9909                    *res++ = *pbuf++;
9910                }
9911            }
9912            Py_UNICODE_COPY(res, pbuf, len);
9913            res += len;
9914            rescnt -= len;
9915            while (--width >= len) {
9916                --rescnt;
9917                *res++ = ' ';
9918            }
9919            if (dict && (argidx < arglen) && c != '%') {
9920                PyErr_SetString(PyExc_TypeError,
9921                                "not all arguments converted during string formatting");
9922                Py_XDECREF(temp);
9923                goto onError;
9924            }
9925            Py_XDECREF(temp);
9926        } /* '%' */
9927    } /* until end */
9928    if (argidx < arglen && !dict) {
9929        PyErr_SetString(PyExc_TypeError,
9930                        "not all arguments converted during string formatting");
9931        goto onError;
9932    }
9933
9934    if (_PyUnicode_Resize(&result, reslen - rescnt) < 0)
9935        goto onError;
9936    if (args_owned) {
9937        Py_DECREF(args);
9938    }
9939    Py_DECREF(uformat);
9940    return (PyObject *)result;
9941
9942  onError:
9943    Py_XDECREF(result);
9944    Py_DECREF(uformat);
9945    if (args_owned) {
9946        Py_DECREF(args);
9947    }
9948    return NULL;
9949}
9950
9951static PyObject *
9952unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds);
9953
9954static PyObject *
9955unicode_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9956{
9957    PyObject *x = NULL;
9958    static char *kwlist[] = {"object", "encoding", "errors", 0};
9959    char *encoding = NULL;
9960    char *errors = NULL;
9961
9962    if (type != &PyUnicode_Type)
9963        return unicode_subtype_new(type, args, kwds);
9964    if (!PyArg_ParseTupleAndKeywords(args, kwds, "|Oss:str",
9965                                     kwlist, &x, &encoding, &errors))
9966        return NULL;
9967    if (x == NULL)
9968        return (PyObject *)_PyUnicode_New(0);
9969    if (encoding == NULL && errors == NULL)
9970        return PyObject_Str(x);
9971    else
9972        return PyUnicode_FromEncodedObject(x, encoding, errors);
9973}
9974
9975static PyObject *
9976unicode_subtype_new(PyTypeObject *type, PyObject *args, PyObject *kwds)
9977{
9978    PyUnicodeObject *tmp, *pnew;
9979    Py_ssize_t n;
9980
9981    assert(PyType_IsSubtype(type, &PyUnicode_Type));
9982    tmp = (PyUnicodeObject *)unicode_new(&PyUnicode_Type, args, kwds);
9983    if (tmp == NULL)
9984        return NULL;
9985    assert(PyUnicode_Check(tmp));
9986    pnew = (PyUnicodeObject *) type->tp_alloc(type, n = tmp->length);
9987    if (pnew == NULL) {
9988        Py_DECREF(tmp);
9989        return NULL;
9990    }
9991    pnew->str = (Py_UNICODE*) PyObject_MALLOC(sizeof(Py_UNICODE) * (n+1));
9992    if (pnew->str == NULL) {
9993        _Py_ForgetReference((PyObject *)pnew);
9994        PyObject_Del(pnew);
9995        Py_DECREF(tmp);
9996        return PyErr_NoMemory();
9997    }
9998    Py_UNICODE_COPY(pnew->str, tmp->str, n+1);
9999    pnew->length = n;
10000    pnew->hash = tmp->hash;
10001    Py_DECREF(tmp);
10002    return (PyObject *)pnew;
10003}
10004
/* Docstring of the str type; installed as tp_doc of PyUnicode_Type. */
PyDoc_STRVAR(unicode_doc,
             "str(string[, encoding[, errors]]) -> str\n\
\n\
Create a new string object from the given encoded string.\n\
encoding defaults to the current default string encoding.\n\
errors can be 'strict', 'replace' or 'ignore' and defaults to 'strict'.");
10011
/* Forward declaration: needed for the tp_iter slot below. */
static PyObject *unicode_iter(PyObject *seq);

/* Type object for str.  Slots are positional, so the order of the entries
   below must not be changed; a 0 entry leaves the slot empty. */
PyTypeObject PyUnicode_Type = {
    PyVarObject_HEAD_INIT(&PyType_Type, 0)
    "str",              /* tp_name */
    sizeof(PyUnicodeObject),        /* tp_basicsize */
    0,                  /* tp_itemsize */
    /* Slots */
    (destructor)unicode_dealloc,    /* tp_dealloc */
    0,                  /* tp_print */
    0,                  /* tp_getattr */
    0,                  /* tp_setattr */
    0,                  /* tp_reserved */
    unicode_repr,           /* tp_repr */
    &unicode_as_number,         /* tp_as_number */
    &unicode_as_sequence,       /* tp_as_sequence */
    &unicode_as_mapping,        /* tp_as_mapping */
    (hashfunc) unicode_hash,        /* tp_hash*/
    0,                  /* tp_call*/
    (reprfunc) unicode_str,     /* tp_str */
    PyObject_GenericGetAttr,        /* tp_getattro */
    0,                  /* tp_setattro */
    0,                  /* tp_as_buffer */
    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_BASETYPE |
    Py_TPFLAGS_UNICODE_SUBCLASS,    /* tp_flags */
    unicode_doc,            /* tp_doc */
    0,                  /* tp_traverse */
    0,                  /* tp_clear */
    PyUnicode_RichCompare,      /* tp_richcompare */
    0,                  /* tp_weaklistoffset */
    unicode_iter,           /* tp_iter */
    0,                  /* tp_iternext */
    unicode_methods,            /* tp_methods */
    0,                  /* tp_members */
    0,                  /* tp_getset */
    &PyBaseObject_Type,         /* tp_base */
    0,                  /* tp_dict */
    0,                  /* tp_descr_get */
    0,                  /* tp_descr_set */
    0,                  /* tp_dictoffset */
    0,                  /* tp_init */
    0,                  /* tp_alloc */
    unicode_new,            /* tp_new */
    PyObject_Del,           /* tp_free */
};
10057
10058/* Initialize the Unicode implementation */
10059
10060void _PyUnicode_Init(void)
10061{
10062    int i;
10063
10064    /* XXX - move this array to unicodectype.c ? */
10065    Py_UNICODE linebreak[] = {
10066        0x000A, /* LINE FEED */
10067        0x000D, /* CARRIAGE RETURN */
10068        0x001C, /* FILE SEPARATOR */
10069        0x001D, /* GROUP SEPARATOR */
10070        0x001E, /* RECORD SEPARATOR */
10071        0x0085, /* NEXT LINE */
10072        0x2028, /* LINE SEPARATOR */
10073        0x2029, /* PARAGRAPH SEPARATOR */
10074    };
10075
10076    /* Init the implementation */
10077    free_list = NULL;
10078    numfree = 0;
10079    unicode_empty = _PyUnicode_New(0);
10080    if (!unicode_empty)
10081        return;
10082
10083    for (i = 0; i < 256; i++)
10084        unicode_latin1[i] = NULL;
10085    if (PyType_Ready(&PyUnicode_Type) < 0)
10086        Py_FatalError("Can't initialize 'unicode'");
10087
10088    /* initialize the linebreak bloom filter */
10089    bloom_linebreak = make_bloom_mask(
10090        linebreak, sizeof(linebreak) / sizeof(linebreak[0])
10091        );
10092
10093    PyType_Ready(&EncodingMapType);
10094}
10095
10096/* Finalize the Unicode implementation */
10097
10098int
10099PyUnicode_ClearFreeList(void)
10100{
10101    int freelist_size = numfree;
10102    PyUnicodeObject *u;
10103
10104    for (u = free_list; u != NULL;) {
10105        PyUnicodeObject *v = u;
10106        u = *(PyUnicodeObject **)u;
10107        if (v->str)
10108            PyObject_DEL(v->str);
10109        Py_XDECREF(v->defenc);
10110        PyObject_Del(v);
10111        numfree--;
10112    }
10113    free_list = NULL;
10114    assert(numfree == 0);
10115    return freelist_size;
10116}
10117
10118void
10119_PyUnicode_Fini(void)
10120{
10121    int i;
10122
10123    Py_XDECREF(unicode_empty);
10124    unicode_empty = NULL;
10125
10126    for (i = 0; i < 256; i++) {
10127        if (unicode_latin1[i]) {
10128            Py_DECREF(unicode_latin1[i]);
10129            unicode_latin1[i] = NULL;
10130        }
10131    }
10132    (void)PyUnicode_ClearFreeList();
10133}
10134
/* Intern the string *p in place.

   p must point to a non-NULL str object, otherwise the interpreter is
   aborted.  Instances of str subclasses and already interned strings are
   left alone.  If an equal string is already in the `interned` dict, *p is
   replaced by it (the reference to the old object is released and a new
   one to the interned object is taken).  Otherwise *p itself is stored in
   the dict and marked SSTATE_INTERNED_MORTAL.

   Failures to create or update the dict are swallowed: the exception is
   cleared and *p is left un-interned. */
void
PyUnicode_InternInPlace(PyObject **p)
{
    register PyUnicodeObject *s = (PyUnicodeObject *)(*p);
    PyObject *t;
    if (s == NULL || !PyUnicode_Check(s))
        Py_FatalError(
            "PyUnicode_InternInPlace: unicode strings only please!");
    /* If it's a subclass, we don't really know what putting
       it in the interned dict might do. */
    if (!PyUnicode_CheckExact(s))
        return;
    if (PyUnicode_CHECK_INTERNED(s))
        return;
    /* The dict of interned strings is created on first use. */
    if (interned == NULL) {
        interned = PyDict_New();
        if (interned == NULL) {
            PyErr_Clear(); /* Don't leave an exception */
            return;
        }
    }
    /* It might be that the GetItem call fails even
       though the key is present in the dictionary,
       namely when this happens during a stack overflow. */
    Py_ALLOW_RECURSION
        t = PyDict_GetItem(interned, (PyObject *)s);
    Py_END_ALLOW_RECURSION

        /* An equal string is already interned: hand that one out. */
        if (t) {
            Py_INCREF(t);
            Py_DECREF(*p);
            *p = t;
            return;
        }

    /* Same concern as for the lookup above: flag the insertion as
       recursion-critical and clear the flag again on both exits. */
    PyThreadState_GET()->recursion_critical = 1;
    if (PyDict_SetItem(interned, (PyObject *)s, (PyObject *)s) < 0) {
        PyErr_Clear();
        PyThreadState_GET()->recursion_critical = 0;
        return;
    }
    PyThreadState_GET()->recursion_critical = 0;
    /* The two references in interned are not counted by refcnt.
       The deallocator will take care of this */
    Py_REFCNT(s) -= 2;
    PyUnicode_CHECK_INTERNED(s) = SSTATE_INTERNED_MORTAL;
}
10182
10183void
10184PyUnicode_InternImmortal(PyObject **p)
10185{
10186    PyUnicode_InternInPlace(p);
10187    if (PyUnicode_CHECK_INTERNED(*p) != SSTATE_INTERNED_IMMORTAL) {
10188        PyUnicode_CHECK_INTERNED(*p) = SSTATE_INTERNED_IMMORTAL;
10189        Py_INCREF(*p);
10190    }
10191}
10192
10193PyObject *
10194PyUnicode_InternFromString(const char *cp)
10195{
10196    PyObject *s = PyUnicode_FromString(cp);
10197    if (s == NULL)
10198        return NULL;
10199    PyUnicode_InternInPlace(&s);
10200    return s;
10201}
10202
10203void
10204_Py_ReleaseInternedUnicodeStrings(void)
10205{
10206    PyObject *keys;
10207    PyUnicodeObject *s;
10208    Py_ssize_t i, n;
10209    Py_ssize_t immortal_size = 0, mortal_size = 0;
10210
10211    if (interned == NULL || !PyDict_Check(interned))
10212        return;
10213    keys = PyDict_Keys(interned);
10214    if (keys == NULL || !PyList_Check(keys)) {
10215        PyErr_Clear();
10216        return;
10217    }
10218
10219    /* Since _Py_ReleaseInternedUnicodeStrings() is intended to help a leak
10220       detector, interned unicode strings are not forcibly deallocated;
10221       rather, we give them their stolen references back, and then clear
10222       and DECREF the interned dict. */
10223
10224    n = PyList_GET_SIZE(keys);
10225    fprintf(stderr, "releasing %" PY_FORMAT_SIZE_T "d interned strings\n",
10226            n);
10227    for (i = 0; i < n; i++) {
10228        s = (PyUnicodeObject *) PyList_GET_ITEM(keys, i);
10229        switch (s->state) {
10230        case SSTATE_NOT_INTERNED:
10231            /* XXX Shouldn't happen */
10232            break;
10233        case SSTATE_INTERNED_IMMORTAL:
10234            Py_REFCNT(s) += 1;
10235            immortal_size += s->length;
10236            break;
10237        case SSTATE_INTERNED_MORTAL:
10238            Py_REFCNT(s) += 2;
10239            mortal_size += s->length;
10240            break;
10241        default:
10242            Py_FatalError("Inconsistent interned string state.");
10243        }
10244        s->state = SSTATE_NOT_INTERNED;
10245    }
10246    fprintf(stderr, "total size of all interned strings: "
10247            "%" PY_FORMAT_SIZE_T "d/%" PY_FORMAT_SIZE_T "d "
10248            "mortal/immortal\n", mortal_size, immortal_size);
10249    Py_DECREF(keys);
10250    PyDict_Clear(interned);
10251    Py_DECREF(interned);
10252    interned = NULL;
10253}
10254
10255
10256/********************* Unicode Iterator **************************/
10257
/* State of an iterator over the characters of a str. */
typedef struct {
    PyObject_HEAD
    Py_ssize_t it_index;     /* position of the next character to return */
    PyUnicodeObject *it_seq; /* Set to NULL when iterator is exhausted */
} unicodeiterobject;
10263
10264static void
10265unicodeiter_dealloc(unicodeiterobject *it)
10266{
10267    _PyObject_GC_UNTRACK(it);
10268    Py_XDECREF(it->it_seq);
10269    PyObject_GC_Del(it);
10270}
10271
10272static int
10273unicodeiter_traverse(unicodeiterobject *it, visitproc visit, void *arg)
10274{
10275    Py_VISIT(it->it_seq);
10276    return 0;
10277}
10278
10279static PyObject *
10280unicodeiter_next(unicodeiterobject *it)
10281{
10282    PyUnicodeObject *seq;
10283    PyObject *item;
10284
10285    assert(it != NULL);
10286    seq = it->it_seq;
10287    if (seq == NULL)
10288        return NULL;
10289    assert(PyUnicode_Check(seq));
10290
10291    if (it->it_index < PyUnicode_GET_SIZE(seq)) {
10292        item = PyUnicode_FromUnicode(
10293            PyUnicode_AS_UNICODE(seq)+it->it_index, 1);
10294        if (item != NULL)
10295            ++it->it_index;
10296        return item;
10297    }
10298
10299    Py_DECREF(seq);
10300    it->it_seq = NULL;
10301    return NULL;
10302}
10303
10304static PyObject *
10305unicodeiter_len(unicodeiterobject *it)
10306{
10307    Py_ssize_t len = 0;
10308    if (it->it_seq)
10309        len = PyUnicode_GET_SIZE(it->it_seq) - it->it_index;
10310    return PyLong_FromSsize_t(len);
10311}
10312
10313PyDoc_STRVAR(length_hint_doc, "Private method returning an estimate of len(list(it)).");
10314
10315static PyMethodDef unicodeiter_methods[] = {
10316    {"__length_hint__", (PyCFunction)unicodeiter_len, METH_NOARGS,
10317     length_hint_doc},
10318    {NULL,      NULL}       /* sentinel */
10319};
10320
10321PyTypeObject PyUnicodeIter_Type = {
10322    PyVarObject_HEAD_INIT(&PyType_Type, 0)
10323    "str_iterator",         /* tp_name */
10324    sizeof(unicodeiterobject),      /* tp_basicsize */
10325    0,                  /* tp_itemsize */
10326    /* methods */
10327    (destructor)unicodeiter_dealloc,    /* tp_dealloc */
10328    0,                  /* tp_print */
10329    0,                  /* tp_getattr */
10330    0,                  /* tp_setattr */
10331    0,                  /* tp_reserved */
10332    0,                  /* tp_repr */
10333    0,                  /* tp_as_number */
10334    0,                  /* tp_as_sequence */
10335    0,                  /* tp_as_mapping */
10336    0,                  /* tp_hash */
10337    0,                  /* tp_call */
10338    0,                  /* tp_str */
10339    PyObject_GenericGetAttr,        /* tp_getattro */
10340    0,                  /* tp_setattro */
10341    0,                  /* tp_as_buffer */
10342    Py_TPFLAGS_DEFAULT | Py_TPFLAGS_HAVE_GC,/* tp_flags */
10343    0,                  /* tp_doc */
10344    (traverseproc)unicodeiter_traverse, /* tp_traverse */
10345    0,                  /* tp_clear */
10346    0,                  /* tp_richcompare */
10347    0,                  /* tp_weaklistoffset */
10348    PyObject_SelfIter,          /* tp_iter */
10349    (iternextfunc)unicodeiter_next,     /* tp_iternext */
10350    unicodeiter_methods,            /* tp_methods */
10351    0,
10352};
10353
10354static PyObject *
10355unicode_iter(PyObject *seq)
10356{
10357    unicodeiterobject *it;
10358
10359    if (!PyUnicode_Check(seq)) {
10360        PyErr_BadInternalCall();
10361        return NULL;
10362    }
10363    it = PyObject_GC_New(unicodeiterobject, &PyUnicodeIter_Type);
10364    if (it == NULL)
10365        return NULL;
10366    it->it_index = 0;
10367    Py_INCREF(seq);
10368    it->it_seq = (PyUnicodeObject *)seq;
10369    _PyObject_GC_TRACK(it);
10370    return (PyObject *)it;
10371}
10372
10373size_t
10374Py_UNICODE_strlen(const Py_UNICODE *u)
10375{
10376    int res = 0;
10377    while(*u++)
10378        res++;
10379    return res;
10380}
10381
10382Py_UNICODE*
10383Py_UNICODE_strcpy(Py_UNICODE *s1, const Py_UNICODE *s2)
10384{
10385    Py_UNICODE *u = s1;
10386    while ((*u++ = *s2++));
10387    return s1;
10388}
10389
10390Py_UNICODE*
10391Py_UNICODE_strncpy(Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10392{
10393    Py_UNICODE *u = s1;
10394    while ((*u++ = *s2++))
10395        if (n-- == 0)
10396            break;
10397    return s1;
10398}
10399
10400Py_UNICODE*
10401Py_UNICODE_strcat(Py_UNICODE *s1, const Py_UNICODE *s2)
10402{
10403    Py_UNICODE *u1 = s1;
10404    u1 += Py_UNICODE_strlen(u1);
10405    Py_UNICODE_strcpy(u1, s2);
10406    return s1;
10407}
10408
10409int
10410Py_UNICODE_strcmp(const Py_UNICODE *s1, const Py_UNICODE *s2)
10411{
10412    while (*s1 && *s2 && *s1 == *s2)
10413        s1++, s2++;
10414    if (*s1 && *s2)
10415        return (*s1 < *s2) ? -1 : +1;
10416    if (*s1)
10417        return 1;
10418    if (*s2)
10419        return -1;
10420    return 0;
10421}
10422
10423int
10424Py_UNICODE_strncmp(const Py_UNICODE *s1, const Py_UNICODE *s2, size_t n)
10425{
10426    register Py_UNICODE u1, u2;
10427    for (; n != 0; n--) {
10428        u1 = *s1;
10429        u2 = *s2;
10430        if (u1 != u2)
10431            return (u1 < u2) ? -1 : +1;
10432        if (u1 == '\0')
10433            return 0;
10434        s1++;
10435        s2++;
10436    }
10437    return 0;
10438}
10439
10440Py_UNICODE*
10441Py_UNICODE_strchr(const Py_UNICODE *s, Py_UNICODE c)
10442{
10443    const Py_UNICODE *p;
10444    for (p = s; *p; p++)
10445        if (*p == c)
10446            return (Py_UNICODE*)p;
10447    return NULL;
10448}
10449
10450Py_UNICODE*
10451Py_UNICODE_strrchr(const Py_UNICODE *s, Py_UNICODE c)
10452{
10453    const Py_UNICODE *p;
10454    p = s + Py_UNICODE_strlen(s);
10455    while (p != s) {
10456        p--;
10457        if (*p == c)
10458            return (Py_UNICODE*)p;
10459    }
10460    return NULL;
10461}
10462
10463Py_UNICODE*
10464PyUnicode_AsUnicodeCopy(PyObject *object)
10465{
10466    PyUnicodeObject *unicode = (PyUnicodeObject *)object;
10467    Py_UNICODE *copy;
10468    Py_ssize_t size;
10469
10470    /* Ensure we won't overflow the size. */
10471    if (PyUnicode_GET_SIZE(unicode) > ((PY_SSIZE_T_MAX / sizeof(Py_UNICODE)) - 1)) {
10472        PyErr_NoMemory();
10473        return NULL;
10474    }
10475    size = PyUnicode_GET_SIZE(unicode) + 1; /* copy the nul character */
10476    size *= sizeof(Py_UNICODE);
10477    copy = PyMem_Malloc(size);
10478    if (copy == NULL) {
10479        PyErr_NoMemory();
10480        return NULL;
10481    }
10482    memcpy(copy, PyUnicode_AS_UNICODE(unicode), size);
10483    return copy;
10484}
10485
10486/* A _string module, to export formatter_parser and formatter_field_name_split
10487   to the string.Formatter class implemented in Python. */
10488
10489static PyMethodDef _string_methods[] = {
10490    {"formatter_field_name_split", (PyCFunction) formatter_field_name_split,
10491     METH_O, PyDoc_STR("split the argument as a field name")},
10492    {"formatter_parser", (PyCFunction) formatter_parser,
10493     METH_O, PyDoc_STR("parse the argument as a format string")},
10494    {NULL, NULL}
10495};
10496
10497static struct PyModuleDef _string_module = {
10498    PyModuleDef_HEAD_INIT,
10499    "_string",
10500    PyDoc_STR("string helper module"),
10501    0,
10502    _string_methods,
10503    NULL,
10504    NULL,
10505    NULL,
10506    NULL
10507};
10508
10509PyMODINIT_FUNC
10510PyInit__string(void)
10511{
10512    return PyModule_Create(&_string_module);
10513}
10514
10515
10516#ifdef __cplusplus
10517}
10518#endif
10519